In this post, I am going to show you a Java class that collects all the URLs linked from a web page, much like a simple web crawler does.
Here is the Java class:
package client;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.Enumeration;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;

/**
 * Downloads a single web page and prints the target of every {@code <a href="...">}
 * tag it contains, one URL per line, on standard output.
 *
 * <p>Links are resolved against the page URL, so relative references such as
 * {@code page.html}, {@code /dir/page.html}, {@code ../page.html} and
 * {@code //host/page.html} are printed as absolute URLs. Links that already carry a
 * scheme (for example {@code https:} or {@code mailto:}) are printed unchanged.
 * Problems are reported on standard error instead of being silently ignored.
 */
public class GetAllUrls {

    /** Page that is scanned when no URL is passed on the command line. */
    private static final String DEFAULT_SITE = "http://www.awasthiashish.com";

    public GetAllUrls() {
        super();
    }

    /**
     * Fetches the page and prints all links found in its anchor tags.
     *
     * @param args optional; when present, {@code args[0]} is the absolute URL of the
     *             page to scan, otherwise {@link #DEFAULT_SITE} is used
     */
    public static void main(String[] args) {
        final String name = (args != null && args.length > 0) ? args[0] : DEFAULT_SITE;

        final URI site;
        try {
            site = new URI(name);
        } catch (URISyntaxException e) {
            System.err.println("Invalid page URL '" + name + "': " + e.getMessage());
            return;
        }
        // URI.toURL() and URI.resolve() both need an absolute, hierarchical URI.
        if (!site.isAbsolute() || site.isOpaque()) {
            System.err.println("Page URL must be absolute and hierarchical: " + name);
            return;
        }

        try {
            URL u = site.toURL();
            try (InputStream in = u.openStream();
                 Reader r = new InputStreamReader(in, StandardCharsets.UTF_8)) {
                // ignoreCharSet = true: keep parsing with the reader's charset even if
                // the document declares a different one in a <meta> tag.
                new ParserDelegator().parse(r, new LinkPrinter(toBaseUri(site)), true);
            }
        } catch (MalformedURLException e) {
            System.err.println("Unsupported page URL '" + name + "': " + e.getMessage());
        } catch (IOException e) {
            System.err.println("Could not read '" + name + "': " + e);
        }
    }

    /**
     * Returns the URI that relative links are resolved against.
     *
     * <p>A URI without a path (e.g. {@code http://host}) is given the root path
     * {@code /} first; resolving {@code page.html} against an empty base path would
     * otherwise glue the file name directly onto the host name.
     */
    private static URI toBaseUri(URI site) {
        String path = site.getRawPath();
        return (path == null || path.isEmpty()) ? site.resolve("/") : site;
    }

    /** Turns the raw value of an {@code href} attribute into a URI resolved against {@code base}. */
    private static URI resolveLink(URI base, String href) throws URISyntaxException {
        return base.resolve(new URI(href.trim()));
    }

    /** Parser callback that prints the resolved {@code href} of every anchor start tag. */
    private static final class LinkPrinter extends HTMLEditorKit.ParserCallback {

        private final URI base;

        LinkPrinter(URI base) {
            this.base = base;
        }

        @Override
        public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
            if (t != HTML.Tag.A) {
                return;
            }
            Object href = a.getAttribute(HTML.Attribute.HREF);
            if (href == null) {
                // Anchor without a target, e.g. <a name="...">.
                return;
            }
            try {
                System.out.println(resolveLink(base, href.toString()));
            } catch (URISyntaxException e) {
                System.err.println("Skipping malformed link '" + href + "': " + e.getMessage());
            }
        }
    }
}
And here is the output on the console:
Here you can see all the URLs linked from the home page of this website.
Cheers 🙂 Happy Learning