Please disable your adblock and script blockers to view this page

Search this blog

Wednesday 5 December 2018

Get All URLs From a Website Using Java Code


In this post, I am going to show you a Java class that collects all the URLs linked from a web page, much like the first step of a web crawler.

Here is the Java class:





  1. package client;
  2. import java.io.IOException;
  3. import java.io.InputStream;
  4. import java.io.InputStreamReader;
  5. import java.io.Reader;
  6. import java.net.MalformedURLException;
  7. import java.net.URI;
  8. import java.net.URISyntaxException;
  9. import java.net.URL;
  10. import java.util.Enumeration;
  11. import javax.swing.text.MutableAttributeSet;
  12. import javax.swing.text.html.HTML;
  13. import javax.swing.text.html.HTMLEditorKit;
  14. import javax.swing.text.html.parser.ParserDelegator;
  /**
   * Prints every hyperlink found on a single HTML page, one absolute URI per line.
   *
   * <p>The page is fetched once and parsed with the Swing HTML parser; each
   * {@code <a href="...">} value is resolved against the page URL and written to
   * standard output. Problems (unreachable page, malformed link) are reported on
   * standard error instead of being silently ignored.
   */
  public class GetAllUrls {

      /** Page scanned when no URL is supplied on the command line. */
      private static final String DEFAULT_URL = "http://www.awasthiashish.com";

      public GetAllUrls() {
          super();
      }

      /**
       * Fetches a page and prints the target of every anchor it contains.
       *
       * @param args optional; {@code args[0]} is the URL of the page to scan. When
       *             absent, {@link #DEFAULT_URL} is used.
       */
      public static void main(String[] args) {
          final String name = (args != null && args.length > 0) ? args[0] : DEFAULT_URL;

          final URI base;
          try {
              base = toBaseUri(name);
          } catch (URISyntaxException e) {
              System.err.println("Invalid start URL '" + name + "': " + e.getMessage());
              return;
          }

          // try-with-resources closes both the stream and the reader on every path.
          try (InputStream in = new URL(name).openStream();
               Reader r = new InputStreamReader(in, java.nio.charset.StandardCharsets.UTF_8)) {
              // "true" tells the parser to ignore any charset directive in the page,
              // since the reader above has already fixed the decoding.
              new ParserDelegator().parse(r, new HTMLEditorKit.ParserCallback() {
                  @Override
                  public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
                      if (t != HTML.Tag.A) {
                          return;
                      }
                      Object href = a.getAttribute(HTML.Attribute.HREF);
                      if (href != null) {
                          printResolved(base, href.toString());
                      }
                  }
              }, true);
          } catch (MalformedURLException e) {
              System.err.println("Invalid start URL '" + name + "': " + e.getMessage());
          } catch (IOException e) {
              System.err.println("Could not read '" + name + "': " + e);
          }
      }

      /**
       * Parses the start URL into a URI suitable as a base for resolving links.
       *
       * <p>A URL such as {@code http://host} has an empty path; {@link URI#resolve(URI)}
       * would then glue a relative link directly onto the host name, so an empty
       * path is normalised to {@code /} first.
       */
      private static URI toBaseUri(String url) throws URISyntaxException {
          URI parsed = new URI(url);
          String path = parsed.getPath();
          boolean emptyPath = path == null || path.isEmpty();
          if (!parsed.isOpaque() && parsed.getHost() != null && emptyPath) {
              return parsed.resolve("/");
          }
          return parsed;
      }

      /** Resolves one href against the page URI and prints it; malformed hrefs are reported and skipped. */
      private static void printResolved(URI base, String href) {
          try {
              System.out.println(base.resolve(new URI(href.trim())));
          } catch (URISyntaxException e) {
              System.err.println("Skipping malformed link '" + href + "': " + e.getMessage());
          }
      }
  }

And here is the output on the console:

Here you can see all the URLs linked to the home page of this website


Cheers 🙂 Happy Learning

No comments :

Post a Comment