Get All URLs From a Website Using Java Code

Sharing is Caring

In this post, I am going to show you a Java class that collects all the URLs linked from a web page, much like a simple web crawler does.

Here goes the Java Class

package client;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;

import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;

import java.util.Enumeration;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;

/**
 * Prints every hyperlink found in the {@code <a href="...">} tags of a single web page.
 *
 * <p>The page is fetched once, parsed with the Swing HTML parser, and each {@code href}
 * is resolved against the page URL using the resolution rules implemented by
 * {@link URI#resolve(String)}. Resolved links are written to standard output, one per
 * line; links that cannot be parsed and I/O failures are reported on standard error.
 */
public class GetAllUrls {

    /** Page that is scanned when no URL is given on the command line. */
    private static final String DEFAULT_START_URL = "http://www.awasthiashish.com";

    public GetAllUrls() {
        super();
    }

    /**
     * Fetches a page and prints all links found in its anchor tags.
     *
     * @param args optional; {@code args[0]} is the URL of the page to scan. When absent,
     *             {@value #DEFAULT_START_URL} is used.
     */
    public static void main(String[] args) {
        final String name = (args != null && args.length > 0) ? args[0] : DEFAULT_START_URL;

        final URI base;
        try {
            base = toBaseUri(name);
        } catch (IllegalArgumentException e) {
            System.err.println("Invalid start URL '" + name + "': " + e.getMessage());
            return;
        }

        // try-with-resources closes the reader (and the underlying stream) on every path.
        // The charset is named explicitly so decoding does not depend on the platform default.
        try (InputStream in = base.toURL().openStream();
             Reader r = new InputStreamReader(in, java.nio.charset.StandardCharsets.UTF_8)) {

            new ParserDelegator().parse(r, new HTMLEditorKit.ParserCallback() {

                @Override
                public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
                    if (t != HTML.Tag.A) {
                        return;
                    }
                    Object href = a.getAttribute(HTML.Attribute.HREF);
                    if (href == null) {
                        return; // anchor without a target, e.g. <a name="...">
                    }
                    try {
                        System.out.println(resolveLink(base, href.toString()));
                    } catch (IllegalArgumentException e) {
                        // One malformed link must not abort the scan of the rest of the page.
                        System.err.println("Skipping unparsable link '" + href + "': " + e.getMessage());
                    }
                }
            }, true);
        } catch (MalformedURLException e) {
            System.err.println("Unsupported start URL '" + name + "': " + e.getMessage());
        } catch (IllegalArgumentException e) {
            // URI.toURL() rejects URIs without a scheme.
            System.err.println("Start URL must be absolute: '" + name + "'");
        } catch (IOException e) {
            System.err.println("Could not read '" + name + "': " + e);
        }
    }

    /**
     * Parses the page URL into a URI suitable as a base for resolving relative links.
     *
     * <p>A hierarchical URL with an empty path (such as {@code http://host}) is given the
     * root path {@code /} so that relative references are appended after a slash rather
     * than glued directly onto the host name.
     *
     * @param pageUrl URL of the page being scanned
     * @return the base URI
     * @throws IllegalArgumentException if {@code pageUrl} is not a syntactically valid URI
     */
    static URI toBaseUri(String pageUrl) {
        URI base = URI.create(pageUrl.trim());
        String path = base.getRawPath();
        if (!base.isOpaque() && (path == null || path.isEmpty())) {
            base = base.resolve("/");
        }
        return base;
    }

    /**
     * Resolves one {@code href} value against the page's base URI.
     *
     * <p>Absolute references (including non-HTTP ones such as {@code mailto:}) are returned
     * unchanged; relative, root-relative, protocol-relative and fragment-only references are
     * combined with {@code base}.
     *
     * @param base base URI, normally obtained from {@link #toBaseUri(String)}
     * @param href raw attribute value; surrounding whitespace is ignored
     * @return the resolved link
     * @throws IllegalArgumentException if {@code href} is not a syntactically valid URI reference
     */
    static URI resolveLink(URI base, String href) {
        return base.resolve(href.trim());
    }
}

 

And here is the output on the console:

Here you can see all the URLs linked to the home page of this website

Get All Urls from a Website

Cheers 🙂 Happy Learning

Related Posts

An Oracle ACE, Blogger, Reviewer, Technical Lead working on Oracle ADF

Leave a Reply

Your email address will not be published. Required fields are marked *