1
+ package ch_21 ;
2
+
3
+ import java .net .URL ;
4
+ import java .util .*;
5
+
6
/**
 * 21.14 (Web crawler) Rewrite Listing 12.18, WebCrawler.java,
 * to improve the performance by using appropriate
 * new data structures for listOfPendingURLs and listOfTraversedURLs.
 */
11
+ public class Exercise21_14 {
12
+ public static void main (String [] args ) {
13
+ java .util .Scanner input = new java .util .Scanner (System .in );
14
+ System .out .print ("Enter a URL: " );
15
+ String url = input .nextLine ();
16
+ crawler (url ); // Traverse the Web from the a starting url
17
+ }
18
+
19
+ public static void crawler (String startingURL ) {
20
+ /* Exercise 21.14
21
+ ...improve the performance by using appropriate new data structures for listOfPendingURLs and
22
+ listofTraversedURLs.
23
+ */
24
+ TreeSet <CrawlUrl > listOfPendingURLs = new TreeSet <>();
25
+ HashSet <CrawlUrl > listOfTraversedURLs = new HashSet <>();
26
+ listOfPendingURLs .add (new CrawlUrl (startingURL ));
27
+ while (!listOfPendingURLs .isEmpty () &&
28
+ listOfTraversedURLs .size () <= 100 ) {
29
+ CrawlUrl crawUrl = listOfPendingURLs .first ();
30
+ listOfPendingURLs .remove (crawUrl );
31
+ if (!listOfTraversedURLs .contains (crawUrl )) {
32
+ listOfTraversedURLs .add (crawUrl );
33
+ System .out .println ("Crawl: " + crawUrl );
34
+ for (CrawlUrl crawlUrl : getSubURLs (crawUrl .getUrlString ())) {
35
+ if (!listOfTraversedURLs .contains (crawlUrl ))
36
+ listOfPendingURLs .add (crawlUrl );
37
+ }
38
+ }
39
+ }
40
+ }
41
+
42
+ public static Set <CrawlUrl > getSubURLs (String urlString ) {
43
+ Set <CrawlUrl > set = new HashSet <>(); // Exercise 21.14
44
+ try {
45
+ URL url = new URL (urlString );
46
+ Scanner input = new Scanner (url .openStream ());
47
+ int current = 0 ;
48
+ while (input .hasNext ()) {
49
+ String line = input .nextLine ();
50
+ current = line .indexOf ("http:" , current );
51
+ while (current > 0 ) {
52
+ int endIndex = line .indexOf ("\" " , current );
53
+ if (endIndex > 0 ) { // Ensure that a correct URL is found
54
+ set .add (new CrawlUrl (line .substring (current , endIndex )));
55
+ current = line .indexOf ("http:" , endIndex );
56
+ } else
57
+ current = -1 ;
58
+ }
59
+ }
60
+ } catch (Exception ex ) {
61
+ System .out .println ("Error: " + ex .getMessage ());
62
+ }
63
+ return set ;
64
+ }
65
+ }
66
+
67
/**
 * A wrapper around a URL string.
 *
 * Implements Comparable for default sorting when used in a TreeSet: URLs are
 * ordered by length, with equal lengths broken lexicographically so the
 * ordering is consistent with equals (a TreeSet treats compareTo() == 0 as
 * "duplicate"; length alone would silently drop distinct equal-length URLs).
 *
 * Overrides equals/hashCode on the URL string so hash-based collections
 * (e.g. the crawler's traversed set) deduplicate by URL rather than by
 * object identity, which the previous version failed to do.
 */
class CrawlUrl implements Comparable<CrawlUrl> {
    private String urlString;

    public CrawlUrl(String urlString) {
        this.urlString = urlString;
    }

    @Override
    public int compareTo(CrawlUrl that) {
        // Primary order: URL length; tie-break: lexicographic, so that
        // distinct URLs of equal length are never conflated by sorted sets.
        int byLength = Integer.compare(this.urlString.length(), that.urlString.length());
        return byLength != 0 ? byLength : this.urlString.compareTo(that.urlString);
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (!(o instanceof CrawlUrl)) {
            return false;
        }
        return urlString.equals(((CrawlUrl) o).urlString);
    }

    @Override
    public int hashCode() {
        return urlString.hashCode();
    }

    public String getUrlString() {
        return urlString;
    }

    /**
     * Replaces the wrapped URL and returns this instance for chaining.
     * NOTE(review): mutating an instance while it sits in a HashSet/TreeSet
     * corrupts the set's invariants — only call this on free-standing objects.
     */
    public CrawlUrl setUrlString(String urlString) {
        this.urlString = urlString;
        return this;
    }

    @Override
    public String toString() {
        return "CrawlUrl{" +
                "urlString='" + urlString + '\'' +
                '}';
    }
}
0 commit comments