From 0f96b746fcfefe73c7c0aa526d18bb9ddc4ea1db Mon Sep 17 00:00:00 2001 From: Muhammad Umer Farooq Date: Sun, 16 Feb 2020 11:34:54 +0500 Subject: [PATCH 1/8] Create emails_from_url.py --- web_programming/emails_from_url.py | 97 ++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 web_programming/emails_from_url.py diff --git a/web_programming/emails_from_url.py b/web_programming/emails_from_url.py new file mode 100644 index 000000000000..5665e41df560 --- /dev/null +++ b/web_programming/emails_from_url.py @@ -0,0 +1,97 @@ +"""Get the site emails from URL.""" +__author__ = "Muhammad Umer Farooq" +__license__ = "MIT" +__version__ = "1.0.0" +__maintainer__ = "Muhammad Umer Farooq" +__email__ = "contact@muhammadumerfarooq.me" +__status__ = "Production" + +from html.parser import HTMLParser +import requests +import re +from urllib.parse import urlparse + +from urllib import parse + + +class Parser(HTMLParser): + + def __init__(self, domain): + HTMLParser.__init__(self) + self.data = [] + self.domain = domain + + def handle_starttag(self, tag, attrs): + # Only parse the 'anchor' tag. + if tag == "a": + # Check the list of defined attributes. + for name, value in attrs: + # If href is defined, and not empty nor # print it. + if name == "href" and value != "#" and value != '': + # If not already in data. + if value not in self.data: + url = parse.urljoin(self.domain, value) + self.data.append(url) + + +# Get main domain name (example.com) +def get_domain_name(url): + try: + u = get_sub_domain_name(url).split('.') + return u[-2] + '.' + u[-1] + except: + return "" + + +# Get sub domain name (sub.example.com) +def get_sub_domain_name(url): + try: + return urlparse(url).netloc + except: + return '' + +# Get the url +url = "https://github.com" +# Get the base domain from the url +domain = get_domain_name(url) + +# Initialize the parser +parser = Parser(domain) + +# Validate Email regx. +emailRegx = '[a-zA-Z0-9]+@' + domain +try: + # Open URL + r = requests.get(url) +except: + print("Please provide the valid url") + +# pass the raw HTML to the parser to get links +parser.feed(r.text) + +# Store Email Data structure. +Emails = [] +# Get links and loop through +for link in parser.data: + # open URL. + # read = requests.get(link) + try: + read = requests.get(link) + # Get the valid email. + email = re.findall(emailRegx, read.text) + # If not in list then append it. + if email not in Emails: + Emails.append(email) + except: + pass + +ValidEmails = [] + +# Remove duplicates email address. +for Email in Emails: + for e in Email: + if e not in ValidEmails: + ValidEmails.append(e) + +# Finally print list of email. +print(ValidEmails) From 090cd66ef2006c2f2ef6514aa05eb420e6fc63f4 Mon Sep 17 00:00:00 2001 From: vinayak Date: Tue, 25 Feb 2020 11:08:49 +0530 Subject: [PATCH 2/8] Update emails_from_url.py --- web_programming/emails_from_url.py | 88 +++++++++++++----------------- 1 file changed, 39 insertions(+), 49 deletions(-) diff --git a/web_programming/emails_from_url.py b/web_programming/emails_from_url.py index 5665e41df560..815c1dc7da50 100644 --- a/web_programming/emails_from_url.py +++ b/web_programming/emails_from_url.py @@ -6,16 +6,17 @@ __email__ = "contact@muhammadumerfarooq.me" __status__ = "Production" -from html.parser import HTMLParser -import requests import re -from urllib.parse import urlparse - +import requests from urllib import parse +from html.parser import HTMLParser class Parser(HTMLParser): - + ''' class use to parse HTML + handle_starttag function takes url from + anchor tag + ''' def __init__(self, domain): HTMLParser.__init__(self) self.data = [] @@ -36,62 +37,51 @@ def handle_starttag(self, tag, attrs): # Get main domain name (example.com) def get_domain_name(url): - try: - u = get_sub_domain_name(url).split('.') - return u[-2] + '.' + u[-1] - except: - return "" + return '.'.join(get_sub_domain_name(url).split('.')[-2:]) # Get sub domain name (sub.example.com) def get_sub_domain_name(url): - try: - return urlparse(url).netloc - except: - return '' - -# Get the url -url = "https://github.com" -# Get the base domain from the url -domain = get_domain_name(url) - -# Initialize the parser -parser = Parser(domain) - -# Validate Email regx. -emailRegx = '[a-zA-Z0-9]+@' + domain -try: + return parse.urlparse(url).netloc + +if __name__ == '__main__': + # Get the url + url = "https://github.com" + # Get the base domain from the url + domain = get_domain_name(url) + + # Initialize the parser + parser = Parser(domain) + + # Validate Email regx. + emailRegx = '[a-zA-Z0-9]+@' + domain + # Open URL r = requests.get(url) -except: - print("Please provide the valid url") - -# pass the raw HTML to the parser to get links -parser.feed(r.text) - -# Store Email Data structure. -Emails = [] -# Get links and loop through -for link in parser.data: - # open URL. - # read = requests.get(link) - try: + + # pass the raw HTML to the parser to get links + parser.feed(r.text) + + # Store Email Data structure. + Emails = [] + # Get links and loop through + for link in parser.data: + # open URL. + # read = requests.get(link) read = requests.get(link) # Get the valid email. email = re.findall(emailRegx, read.text) # If not in list then append it. if email not in Emails: Emails.append(email) - except: - pass -ValidEmails = [] + ValidEmails = [] -# Remove duplicates email address. -for Email in Emails: - for e in Email: - if e not in ValidEmails: - ValidEmails.append(e) + # Remove duplicates email address. + for Email in Emails: + for e in Email: + if e not in ValidEmails: + ValidEmails.append(e) -# Finally print list of email. -print(ValidEmails) + # Finally print list of email. + print(ValidEmails) From f86fd16ad65b64078e04bfdac46d72db11cdda84 Mon Sep 17 00:00:00 2001 From: vinayak Date: Tue, 25 Feb 2020 11:48:15 +0530 Subject: [PATCH 3/8] Update emails_from_url.py --- web_programming/emails_from_url.py | 34 +++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/web_programming/emails_from_url.py b/web_programming/emails_from_url.py index 815c1dc7da50..3a8aecfba510 100644 --- a/web_programming/emails_from_url.py +++ b/web_programming/emails_from_url.py @@ -13,22 +13,22 @@ class Parser(HTMLParser): - ''' class use to parse HTML - handle_starttag function takes url from - anchor tag - ''' def __init__(self, domain): HTMLParser.__init__(self) self.data = [] self.domain = domain def handle_starttag(self, tag, attrs): + """ + This function parse html to take takes url from + tags + """ # Only parse the 'anchor' tag. if tag == "a": # Check the list of defined attributes. for name, value in attrs: # If href is defined, and not empty nor # print it. - if name == "href" and value != "#" and value != '': + if name == "href" and value != "#" and value != "": # If not already in data. if value not in self.data: url = parse.urljoin(self.domain, value) @@ -37,14 +37,24 @@ def handle_starttag(self, tag, attrs): # Get main domain name (example.com) def get_domain_name(url): - return '.'.join(get_sub_domain_name(url).split('.')[-2:]) + """ + This function get the main domain name + """ + return ".".join(get_sub_domain_name(url).split(".")[-2:]) # Get sub domain name (sub.example.com) def get_sub_domain_name(url): - return parse.urlparse(url).netloc + """ + This function get sub domin name + """ + return parse.urlparse(url).netloc -if __name__ == '__main__': + +def emails_from_url(url: str = "https://github.com") -> list: + """ + This function takes url and return all valid urls + """ # Get the url url = "https://github.com" # Get the base domain from the url @@ -54,7 +64,7 @@ def get_sub_domain_name(url): parser = Parser(domain) # Validate Email regx. - emailRegx = '[a-zA-Z0-9]+@' + domain + emailRegx = "[a-zA-Z0-9]+@" + domain # Open URL r = requests.get(url) @@ -84,4 +94,8 @@ def get_sub_domain_name(url): ValidEmails.append(e) # Finally print list of email. - print(ValidEmails) + return ValidEmails + + +if __name__ == "__main__": + emails_from_url("https://github.com") From df2938dc3a399469c354f261e4d1122e81ce7170 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Wed, 26 Feb 2020 08:41:46 +0100 Subject: [PATCH 4/8] 0 emails found: --- web_programming/emails_from_url.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/web_programming/emails_from_url.py b/web_programming/emails_from_url.py index 3a8aecfba510..f31647c46741 100644 --- a/web_programming/emails_from_url.py +++ b/web_programming/emails_from_url.py @@ -4,12 +4,12 @@ __version__ = "1.0.0" __maintainer__ = "Muhammad Umer Farooq" __email__ = "contact@muhammadumerfarooq.me" -__status__ = "Production" +__status__ = "Alpha" import re import requests -from urllib import parse from html.parser import HTMLParser +from urllib import parse class Parser(HTMLParser): @@ -20,8 +20,7 @@ def __init__(self, domain): def handle_starttag(self, tag, attrs): """ - This function parse html to take takes url from - tags + This function parse html to take takes url from tags """ # Only parse the 'anchor' tag. if tag == "a": @@ -38,7 +37,7 @@ def handle_starttag(self, tag, attrs): # Get main domain name (example.com) def get_domain_name(url): """ - This function get the main domain name + This function get the main domain name """ return ".".join(get_sub_domain_name(url).split(".")[-2:]) @@ -46,7 +45,7 @@ def get_domain_name(url): # Get sub domain name (sub.example.com) def get_sub_domain_name(url): """ - This function get sub domin name + This function get sub domin name """ return parse.urlparse(url).netloc @@ -55,8 +54,6 @@ def emails_from_url(url: str = "https://github.com") -> list: """ This function takes url and return all valid urls """ - # Get the url - url = "https://github.com" # Get the base domain from the url domain = get_domain_name(url) @@ -98,4 +95,6 @@ def emails_from_url(url: str = "https://github.com") -> list: if __name__ == "__main__": - emails_from_url("https://github.com") + emails = emails_from_url("https://github.com") + print(f"{len(emails)} emails found:") + print("\n".join(sorted(emails))) From 13ec4e5b634c8ab24bb6fc6523e99998c68c8001 Mon Sep 17 00:00:00 2001 From: Muhammad Umer Farooq Date: Wed, 26 Feb 2020 15:08:34 +0500 Subject: [PATCH 5/8] Update emails_from_url.py --- web_programming/emails_from_url.py | 63 ++++++++++++++++-------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/web_programming/emails_from_url.py b/web_programming/emails_from_url.py index f31647c46741..cf1852010185 100644 --- a/web_programming/emails_from_url.py +++ b/web_programming/emails_from_url.py @@ -54,44 +54,47 @@ def emails_from_url(url: str = "https://github.com") -> list: """ This function takes url and return all valid urls """ + # Store Email Data structure. + emails = [] + valid_emails = [] + # Get the base domain from the url domain = get_domain_name(url) # Initialize the parser parser = Parser(domain) - # Validate Email regx. - emailRegx = "[a-zA-Z0-9]+@" + domain - - # Open URL - r = requests.get(url) - - # pass the raw HTML to the parser to get links - parser.feed(r.text) - - # Store Email Data structure. - Emails = [] - # Get links and loop through - for link in parser.data: - # open URL. - # read = requests.get(link) - read = requests.get(link) - # Get the valid email. - email = re.findall(emailRegx, read.text) - # If not in list then append it. - if email not in Emails: - Emails.append(email) - - ValidEmails = [] - - # Remove duplicates email address. - for Email in Emails: - for e in Email: - if e not in ValidEmails: - ValidEmails.append(e) + try: + # Open URL + r = requests.get(url) + + # pass the raw HTML to the parser to get links + parser.feed(r.text) + + # Get links and loop through + for link in parser.data: + # open URL. + # read = requests.get(link) + try: + read = requests.get(link) + # Get the valid email. + email = re.findall("[a-zA-Z0-9]+@" + domain, read.text) + # If not in list then append it. + if email not in emails: + emails.append(email) + except ValueError: + pass + + # Remove duplicates email address. + for Email in emails: + for e in Email: + if e not in valid_emails: + valid_emails.append(e) + except ValueError: + exit(-1) # Finally print list of email. - return ValidEmails + return valid_emails if __name__ == "__main__": From a0af0fd8c7eacd4bc5ce8b4221157c40c67c596a Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Wed, 26 Feb 2020 11:31:11 +0100 Subject: [PATCH 6/8] Use Python set() to remove duplicates --- web_programming/emails_from_url.py | 32 ++++++++++++++++-------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/web_programming/emails_from_url.py b/web_programming/emails_from_url.py index cf1852010185..8d0384095a2e 100644 --- a/web_programming/emails_from_url.py +++ b/web_programming/emails_from_url.py @@ -7,10 +7,11 @@ __status__ = "Alpha" import re -import requests from html.parser import HTMLParser from urllib import parse +import requests + class Parser(HTMLParser): def __init__(self, domain): @@ -38,6 +39,11 @@ def handle_starttag(self, tag, attrs): def get_domain_name(url): """ This function get the main domain name + + >>> get_domain_name("https://a.b.c.d/e/f?g=h,i=j#k") + 'c.d' + >>> get_domain_name("Not a URL!") + '' """ return ".".join(get_sub_domain_name(url).split(".")[-2:]) @@ -46,6 +52,11 @@ def get_domain_name(url): def get_sub_domain_name(url): """ This function get sub domin name + + >>> get_sub_domain_name("https://a.b.c.d/e/f?g=h,i=j#k") + 'a.b.c.d' + >>> get_sub_domain_name("Not a URL!") + '' """ return parse.urlparse(url).netloc @@ -54,10 +65,6 @@ def emails_from_url(url: str = "https://github.com") -> list: """ This function takes url and return all valid urls """ - # Store Email Data structure. - emails = [] - valid_emails = [] - # Get the base domain from the url domain = get_domain_name(url) @@ -72,29 +79,24 @@ def emails_from_url(url: str = "https://github.com") -> list: parser.feed(r.text) # Get links and loop through + valid_emails = set() for link in parser.data: # open URL. # read = requests.get(link) try: read = requests.get(link) # Get the valid email. - email = re.findall("[a-zA-Z0-9]+@" + domain, read.text) + emails = re.findall("[a-zA-Z0-9]+@" + domain, read.text) # If not in list then append it. - if email not in emails: - emails.append(email) + for email in emails: + valid_emails.add(email) except ValueError: pass - - # Remove duplicates email address. - for Email in emails: - for e in Email: - if e not in valid_emails: - valid_emails.append(e) except ValueError: exit(-1) # Finally print list of email. - return valid_emails + return sorted(valid_emails) if __name__ == "__main__": From 47d15bbfd89c6cb72abc2665e2c8e9560361e5d9 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Wed, 26 Feb 2020 11:31:57 +0100 Subject: [PATCH 7/8] Update emails_from_url.py --- web_programming/emails_from_url.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web_programming/emails_from_url.py b/web_programming/emails_from_url.py index 8d0384095a2e..ee138a646395 100644 --- a/web_programming/emails_from_url.py +++ b/web_programming/emails_from_url.py @@ -95,7 +95,7 @@ def emails_from_url(url: str = "https://github.com") -> list: except ValueError: exit(-1) - # Finally print list of email. + # Finally return a sorted list of email addresses with no duplicates. return sorted(valid_emails) From f0f0118a9805da1921e46e3ff6238fbfe2ed3a45 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Wed, 26 Feb 2020 11:34:35 +0100 Subject: [PATCH 8/8] Add type hints and doctests --- web_programming/emails_from_url.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/web_programming/emails_from_url.py b/web_programming/emails_from_url.py index ee138a646395..fba9f769bace 100644 --- a/web_programming/emails_from_url.py +++ b/web_programming/emails_from_url.py @@ -14,12 +14,12 @@ class Parser(HTMLParser): - def __init__(self, domain): + def __init__(self, domain: str): HTMLParser.__init__(self) self.data = [] self.domain = domain - def handle_starttag(self, tag, attrs): + def handle_starttag(self, tag: str, attrs: str) -> None: """ This function parse html to take takes url from tags """ @@ -36,7 +36,7 @@ def handle_starttag(self, tag, attrs): # Get main domain name (example.com) -def get_domain_name(url): +def get_domain_name(url: str) -> str: """ This function get the main domain name @@ -49,7 +49,7 @@ def get_domain_name(url): # Get sub domain name (sub.example.com) -def get_sub_domain_name(url): +def get_sub_domain_name(url: str) -> str: """ This function get sub domin name