"""Scrape a web page for links and e-mail addresses.

Prompts the user for a URL, downloads the page, reports how many
absolute links it contains, and prints every e-mail address found.
"""

import re

# Matches absolute http(s)/ftp(s) URLs quoted as HTML attribute values.
# The inner group is non-capturing so findall() yields plain strings,
# not (url, scheme) tuples.
LINK_RE = re.compile(r'"((?:http|ftp)s?://.*?)"')

# Matches e-mail addresses.  The original class also allowed ',' which
# let commas leak into matches; word chars, dots, and dashes suffice.
EMAIL_RE = re.compile(r'[\w.-]+@[\w.-]+\.\w+')


def find_links(html):
    """Return all absolute http/ftp links quoted in *html*."""
    return LINK_RE.findall(html)


def find_emails(html):
    """Return all e-mail addresses found in *html*."""
    return EMAIL_RE.findall(html)


def main():
    """Prompt for a URL, fetch it, and report its links and e-mails."""
    # Imported lazily so the pure helpers above stay usable (and
    # testable) without the third-party `requests` package installed.
    import requests

    url = input('Enter a URL (include `http://`): ')

    # Surface HTTP errors instead of silently scanning an error page.
    response = requests.get(url)
    response.raise_for_status()
    html = response.text

    print("\nFound {} links".format(len(find_links(html))))

    for email in find_emails(html):
        print(email)


if __name__ == '__main__':
    main()