|
18 | 18 | # url = sys.argv[1] |
19 | 19 | url = "http://127.0.0.1/" |
20 | 20 | url = "http://gxxnr.cn/" |
21 | | -url = "http://www.cnblogs.com/qq78292959/archive/2013/04/07/3005763.html" |
| 21 | +url = "http://www.cnblogs.com/" |
22 | 22 | # parsedUrl = urlparse(url_str) |
23 | 23 |
|
24 | 24 | def urlParser(url): |
@@ -112,14 +112,6 @@ def hrefsFilter(links, domain): |
112 | 112 | # links = getAllSameFatherDomainLinks(links) # 获取所有子域名下的所有链接 |
113 | 113 | links = getAllSameSourceLinks(links) # 获取同源策略下的所有链接 |
114 | 114 | links = getAllQueryLinks(links) # 获取具有查询功能的URL |
115 | | - links.add("http://127.0.0.1/index.php?id=1") |
116 | | - links.add("http://127.0.0.1/index.php?id=2&id=1&fuck=dstemplink") |
117 | | - links.add("http://127.0.0.1/index.php?id=3") |
118 | | - links.add("http://127.0.0.1/index.php?id=4") |
119 | | - links.add("http://127.0.0.1/index.php?id=5") |
120 | | - links.add("http://127.0.0.1/index.php?ids=5") |
121 | | - links.add("http://127.0.0.1/indexs.php?is=1") |
122 | | - links.add("http://127.0.0.1/indexs.php?id=1") |
123 | 115 | links = getAllTrueQueryLinks(links) # 这个函数是为了防止 xxx.css?v=xxx 这种情况出现的 , 使用黑名单进行过滤 |
124 | 116 | links = analyseAllLinks(links) |
125 | 117 | links = mergeSameQuery(links) |
@@ -190,8 +182,7 @@ def mergeSameQuery(links): |
190 | 182 | return results |
191 | 183 |
|
192 | 184 | def main(): |
193 | | - # content = getContent(url) |
194 | | - content = "<html></html>" |
| 185 | + content = getContent(url) |
195 | 186 | soup = BeautifulSoup(content, "html.parser") |
196 | 187 | links = getAllLinks(soup) |
197 | 188 | hrefs = getAllHerfs(links) |
|
0 commit comments