From 0f96b746fcfefe73c7c0aa526d18bb9ddc4ea1db Mon Sep 17 00:00:00 2001
From: Muhammad Umer Farooq <mumerfarooqlablnet01@gmail.com>
Date: Sun, 16 Feb 2020 11:34:54 +0500
Subject: [PATCH 1/8] Create emails_from_url.py

---
 web_programming/emails_from_url.py | 97 ++++++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)
 create mode 100644 web_programming/emails_from_url.py

diff --git a/web_programming/emails_from_url.py b/web_programming/emails_from_url.py
new file mode 100644
index 000000000000..5665e41df560
--- /dev/null
+++ b/web_programming/emails_from_url.py
@@ -0,0 +1,97 @@
+"""Get the site emails from URL."""
+__author__ = "Muhammad Umer Farooq"
+__license__ = "MIT"
+__version__ = "1.0.0"
+__maintainer__ = "Muhammad Umer Farooq"
+__email__ = "contact@muhammadumerfarooq.me"
+__status__ = "Production"
+
+from html.parser import HTMLParser
+import requests
+import re
+from urllib.parse import urlparse
+
+from urllib import parse
+
+
+class Parser(HTMLParser):
+
+    def __init__(self, domain):
+        HTMLParser.__init__(self)
+        self.data = []
+        self.domain = domain
+
+    def handle_starttag(self, tag, attrs):
+        # Only parse the 'anchor' tag.
+        if tag == "a":
+            # Check the list of defined attributes.
+            for name, value in attrs:
+                # If href is defined, and not empty nor # print it.
+                if name == "href" and value != "#" and value != '':
+                    # If not already in data.
+                    if value not in self.data:
+                        url = parse.urljoin(self.domain, value)
+                        self.data.append(url)
+
+
+# Get main domain name (example.com)
+def get_domain_name(url):
+    try:
+        u = get_sub_domain_name(url).split('.')
+        return u[-2] + '.' + u[-1]
+    except:
+        return ""
+
+
+# Get sub domain name (sub.example.com)
+def get_sub_domain_name(url):
+    try:
+        return urlparse(url).netloc
+    except:
+        return ''
+
+# Get the url
+url = "https://github.com"
+# Get the base domain from the url
+domain = get_domain_name(url)
+
+# Initialize the parser
+parser = Parser(domain)
+
+# Validate Email regx.
+emailRegx = '[a-zA-Z0-9]+@' + domain
+try:
+    # Open URL
+    r = requests.get(url)
+except:
+    print("Please provide the valid url")
+
+# pass the raw HTML to the parser to get links
+parser.feed(r.text)
+
+# Store Email Data structure.
+Emails = []
+# Get links and loop through
+for link in parser.data:
+    # open URL.
+    # read = requests.get(link)
+    try:
+        read = requests.get(link)
+        # Get the valid email.
+        email = re.findall(emailRegx, read.text)
+        # If not in list then append it.
+        if email not in Emails:
+            Emails.append(email)
+    except:
+        pass
+
+ValidEmails = []
+
+# Remove duplicates email address.
+for Email in Emails:
+    for e in Email:
+        if e not in ValidEmails:
+            ValidEmails.append(e)
+
+# Finally print list of email.
+print(ValidEmails)

From 090cd66ef2006c2f2ef6514aa05eb420e6fc63f4 Mon Sep 17 00:00:00 2001
From: vinayak <itssvinayak@gmail.com>
Date: Tue, 25 Feb 2020 11:08:49 +0530
Subject: [PATCH 2/8] Update emails_from_url.py

---
 web_programming/emails_from_url.py | 88 +++++++++++++-----------------
 1 file changed, 39 insertions(+), 49 deletions(-)

diff --git a/web_programming/emails_from_url.py b/web_programming/emails_from_url.py
index 5665e41df560..815c1dc7da50 100644
--- a/web_programming/emails_from_url.py
+++ b/web_programming/emails_from_url.py
@@ -6,16 +6,17 @@
 __email__ = "contact@muhammadumerfarooq.me"
 __status__ = "Production"
 
-from html.parser import HTMLParser
-import requests
 import re
-from urllib.parse import urlparse
-
+import requests
 from urllib import parse
+from html.parser import HTMLParser
 
 
 class Parser(HTMLParser):
-
+    ''' class use to parse HTML
+         handle_starttag  function takes url from
+         anchor tag
+    '''
     def __init__(self, domain):
         HTMLParser.__init__(self)
         self.data = []
@@ -36,62 +37,51 @@ def handle_starttag(self, tag, attrs):
 
 # Get main domain name (example.com)
 def get_domain_name(url):
-    try:
-        u = get_sub_domain_name(url).split('.')
-        return u[-2] + '.' + u[-1]
-    except:
-        return ""
+    return '.'.join(get_sub_domain_name(url).split('.')[-2:])
 
 
 # Get sub domain name (sub.example.com)
 def get_sub_domain_name(url):
-    try:
-        return urlparse(url).netloc
-    except:
-        return ''
-
-# Get the url
-url = "https://github.com"
-# Get the base domain from the url
-domain = get_domain_name(url)
-
-# Initialize the parser
-parser = Parser(domain)
-
-# Validate Email regx.
-emailRegx = '[a-zA-Z0-9]+@' + domain
-try:
+        return parse.urlparse(url).netloc
+
+if __name__ == '__main__':
+    # Get the url
+    url = "https://github.com"
+    # Get the base domain from the url
+    domain = get_domain_name(url)
+
+    # Initialize the parser
+    parser = Parser(domain)
+
+    # Validate Email regx.
+    emailRegx = '[a-zA-Z0-9]+@' + domain
+
     # Open URL
     r = requests.get(url)
-except:
-    print("Please provide the valid url")
-
-# pass the raw HTML to the parser to get links
-parser.feed(r.text)
-
-# Store Email Data structure.
-Emails = []
-# Get links and loop through
-for link in parser.data:
-    # open URL.
-    # read = requests.get(link)
-    try:
+
+    # pass the raw HTML to the parser to get links
+    parser.feed(r.text)
+
+    # Store Email Data structure.
+    Emails = []
+    # Get links and loop through
+    for link in parser.data:
+        # open URL.
+        # read = requests.get(link)
         read = requests.get(link)
         # Get the valid email.
         email = re.findall(emailRegx, read.text)
         # If not in list then append it.
         if email not in Emails:
             Emails.append(email)
-    except:
-        pass
 
-ValidEmails = []
+    ValidEmails = []
 
-# Remove duplicates email address.
-for Email in Emails:
-    for e in Email:
-        if e not in ValidEmails:
-            ValidEmails.append(e)
+    # Remove duplicates email address.
+    for Email in Emails:
+        for e in Email:
+            if e not in ValidEmails:
+                ValidEmails.append(e)
 
-# Finally print list of email.
-print(ValidEmails)
+    # Finally print list of email.
+    print(ValidEmails)

From f86fd16ad65b64078e04bfdac46d72db11cdda84 Mon Sep 17 00:00:00 2001
From: vinayak <itssvinayak@gmail.com>
Date: Tue, 25 Feb 2020 11:48:15 +0530
Subject: [PATCH 3/8] Update emails_from_url.py

---
 web_programming/emails_from_url.py | 34 +++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/web_programming/emails_from_url.py b/web_programming/emails_from_url.py
index 815c1dc7da50..3a8aecfba510 100644
--- a/web_programming/emails_from_url.py
+++ b/web_programming/emails_from_url.py
@@ -13,22 +13,22 @@
 
 
 class Parser(HTMLParser):
-    ''' class use to parse HTML
-         handle_starttag  function takes url from
-         anchor tag
-    '''
     def __init__(self, domain):
         HTMLParser.__init__(self)
         self.data = []
         self.domain = domain
 
     def handle_starttag(self, tag, attrs):
+        """
+             This function parse html to take takes url from
+              tags
+        """
         # Only parse the 'anchor' tag.
         if tag == "a":
             # Check the list of defined attributes.
             for name, value in attrs:
                 # If href is defined, and not empty nor # print it.
-                if name == "href" and value != "#" and value != '':
+                if name == "href" and value != "#" and value != "":
                     # If not already in data.
                     if value not in self.data:
                         url = parse.urljoin(self.domain, value)
@@ -37,14 +37,24 @@ def handle_starttag(self, tag, attrs):
 
 # Get main domain name (example.com)
 def get_domain_name(url):
-    return '.'.join(get_sub_domain_name(url).split('.')[-2:])
+    """
+      This function get the main domain name
+    """
+    return ".".join(get_sub_domain_name(url).split(".")[-2:])
 
 
 # Get sub domain name (sub.example.com)
 def get_sub_domain_name(url):
-        return parse.urlparse(url).netloc
+    """
+      This function get sub domin name
+    """
+    return parse.urlparse(url).netloc
 
-if __name__ == '__main__':
+
+def emails_from_url(url: str = "https://github.com") -> list:
+    """
+    This function takes url and return all valid urls
+    """
     # Get the url
     url = "https://github.com"
     # Get the base domain from the url
@@ -54,7 +64,7 @@ def get_sub_domain_name(url):
     parser = Parser(domain)
 
     # Validate Email regx.
-    emailRegx = '[a-zA-Z0-9]+@' + domain
+    emailRegx = "[a-zA-Z0-9]+@" + domain
 
     # Open URL
     r = requests.get(url)
@@ -84,4 +94,8 @@ def get_sub_domain_name(url):
                 ValidEmails.append(e)
 
     # Finally print list of email.
-    print(ValidEmails)
+    return ValidEmails
+
+
+if __name__ == "__main__":
+    emails_from_url("https://github.com")

From df2938dc3a399469c354f261e4d1122e81ce7170 Mon Sep 17 00:00:00 2001
From: Christian Clauss <cclauss@me.com>
Date: Wed, 26 Feb 2020 08:41:46 +0100
Subject: [PATCH 4/8] Use the url argument; print number of emails found

---
 web_programming/emails_from_url.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/web_programming/emails_from_url.py b/web_programming/emails_from_url.py
index 3a8aecfba510..f31647c46741 100644
--- a/web_programming/emails_from_url.py
+++ b/web_programming/emails_from_url.py
@@ -4,12 +4,12 @@
 __version__ = "1.0.0"
 __maintainer__ = "Muhammad Umer Farooq"
 __email__ = "contact@muhammadumerfarooq.me"
-__status__ = "Production"
+__status__ = "Alpha"
 
 import re
 import requests
-from urllib import parse
 from html.parser import HTMLParser
+from urllib import parse
 
 
 class Parser(HTMLParser):
@@ -20,8 +20,7 @@ def __init__(self, domain):
 
     def handle_starttag(self, tag, attrs):
         """
-             This function parse html to take takes url from
-              tags
+        This function parse html to take takes url from tags
         """
         # Only parse the 'anchor' tag.
         if tag == "a":
@@ -38,7 +37,7 @@ def handle_starttag(self, tag, attrs):
 # Get main domain name (example.com)
 def get_domain_name(url):
     """
-      This function get the main domain name
+    This function get the main domain name
     """
     return ".".join(get_sub_domain_name(url).split(".")[-2:])
 
@@ -46,7 +45,7 @@ def get_domain_name(url):
 # Get sub domain name (sub.example.com)
 def get_sub_domain_name(url):
     """
-      This function get sub domin name
+    This function get sub domin name
     """
     return parse.urlparse(url).netloc
 
@@ -55,8 +54,6 @@ def emails_from_url(url: str = "https://github.com") -> list:
     """
     This function takes url and return all valid urls
     """
-    # Get the url
-    url = "https://github.com"
     # Get the base domain from the url
     domain = get_domain_name(url)
 
@@ -98,4 +95,6 @@ def emails_from_url(url: str = "https://github.com") -> list:
 
 
 if __name__ == "__main__":
-    emails_from_url("https://github.com")
+    emails = emails_from_url("https://github.com")
+    print(f"{len(emails)} emails found:")
+    print("\n".join(sorted(emails)))

From 13ec4e5b634c8ab24bb6fc6523e99998c68c8001 Mon Sep 17 00:00:00 2001
From: Muhammad Umer Farooq <mumerfarooqlablnet01@gmail.com>
Date: Wed, 26 Feb 2020 15:08:34 +0500
Subject: [PATCH 5/8] Update emails_from_url.py

---
 web_programming/emails_from_url.py | 63 ++++++++++++++++--------------
 1 file changed, 33 insertions(+), 30 deletions(-)

diff --git a/web_programming/emails_from_url.py b/web_programming/emails_from_url.py
index f31647c46741..cf1852010185 100644
--- a/web_programming/emails_from_url.py
+++ b/web_programming/emails_from_url.py
@@ -54,44 +54,47 @@ def emails_from_url(url: str = "https://github.com") -> list:
     """
     This function takes url and return all valid urls
     """
+    # Store Email Data structure.
+    emails = []
+    valid_emails = []
+
     # Get the base domain from the url
     domain = get_domain_name(url)
 
     # Initialize the parser
     parser = Parser(domain)
 
-    # Validate Email regx.
-    emailRegx = "[a-zA-Z0-9]+@" + domain
-
-    # Open URL
-    r = requests.get(url)
-
-    # pass the raw HTML to the parser to get links
-    parser.feed(r.text)
-
-    # Store Email Data structure.
-    Emails = []
-    # Get links and loop through
-    for link in parser.data:
-        # open URL.
-        # read = requests.get(link)
-        read = requests.get(link)
-        # Get the valid email.
-        email = re.findall(emailRegx, read.text)
-        # If not in list then append it.
-        if email not in Emails:
-            Emails.append(email)
-
-    ValidEmails = []
-
-    # Remove duplicates email address.
-    for Email in Emails:
-        for e in Email:
-            if e not in ValidEmails:
-                ValidEmails.append(e)
+    try:
+        # Open URL
+        r = requests.get(url)
+
+        # pass the raw HTML to the parser to get links
+        parser.feed(r.text)
+
+        # Get links and loop through
+        for link in parser.data:
+            # open URL.
+            # read = requests.get(link)
+            try:
+                read = requests.get(link)
+                # Get the valid email.
+                email = re.findall("[a-zA-Z0-9]+@" + domain, read.text)
+                # If not in list then append it.
+                if email not in emails:
+                    emails.append(email)
+            except ValueError:
+                pass
+
+        # Remove duplicates email address.
+        for Email in emails:
+            for e in Email:
+                if e not in valid_emails:
+                    valid_emails.append(e)
+    except ValueError:
+        exit(-1)
 
     # Finally print list of email.
-    return ValidEmails
+    return valid_emails
 
 
 if __name__ == "__main__":

From a0af0fd8c7eacd4bc5ce8b4221157c40c67c596a Mon Sep 17 00:00:00 2001
From: Christian Clauss <cclauss@me.com>
Date: Wed, 26 Feb 2020 11:31:11 +0100
Subject: [PATCH 6/8] Use Python set() to remove duplicates

---
 web_programming/emails_from_url.py | 32 ++++++++++++++++--------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/web_programming/emails_from_url.py b/web_programming/emails_from_url.py
index cf1852010185..8d0384095a2e 100644
--- a/web_programming/emails_from_url.py
+++ b/web_programming/emails_from_url.py
@@ -7,10 +7,11 @@
 __status__ = "Alpha"
 
 import re
-import requests
 from html.parser import HTMLParser
 from urllib import parse
 
+import requests
+
 
 class Parser(HTMLParser):
     def __init__(self, domain):
@@ -38,6 +39,11 @@ def handle_starttag(self, tag, attrs):
 def get_domain_name(url):
     """
     This function get the main domain name
+
+    >>> get_domain_name("https://a.b.c.d/e/f?g=h,i=j#k")
+    'c.d'
+    >>> get_domain_name("Not a URL!")
+    ''
     """
     return ".".join(get_sub_domain_name(url).split(".")[-2:])
 
@@ -46,6 +52,11 @@ def get_domain_name(url):
 def get_sub_domain_name(url):
     """
     This function get sub domin name
+
+    >>> get_sub_domain_name("https://a.b.c.d/e/f?g=h,i=j#k")
+    'a.b.c.d'
+    >>> get_sub_domain_name("Not a URL!")
+    ''
     """
     return parse.urlparse(url).netloc
 
@@ -54,10 +65,6 @@ def emails_from_url(url: str = "https://github.com") -> list:
     """
     This function takes url and return all valid urls
     """
-    # Store Email Data structure.
-    emails = []
-    valid_emails = []
-
     # Get the base domain from the url
     domain = get_domain_name(url)
 
@@ -72,29 +79,24 @@ def emails_from_url(url: str = "https://github.com") -> list:
         parser.feed(r.text)
 
         # Get links and loop through
+        valid_emails = set()
         for link in parser.data:
             # open URL.
             # read = requests.get(link)
             try:
                 read = requests.get(link)
                 # Get the valid email.
-                email = re.findall("[a-zA-Z0-9]+@" + domain, read.text)
+                emails = re.findall("[a-zA-Z0-9]+@" + re.escape(domain), read.text)
                 # If not in list then append it.
-                if email not in emails:
-                    emails.append(email)
+                for email in emails:
+                    valid_emails.add(email)
             except ValueError:
                 pass
-
-        # Remove duplicates email address.
-        for Email in emails:
-            for e in Email:
-                if e not in valid_emails:
-                    valid_emails.append(e)
     except ValueError:
         exit(-1)
 
     # Finally print list of email.
-    return valid_emails
+    return sorted(valid_emails)
 
 
 if __name__ == "__main__":

From 47d15bbfd89c6cb72abc2665e2c8e9560361e5d9 Mon Sep 17 00:00:00 2001
From: Christian Clauss <cclauss@me.com>
Date: Wed, 26 Feb 2020 11:31:57 +0100
Subject: [PATCH 7/8] Update emails_from_url.py

---
 web_programming/emails_from_url.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/web_programming/emails_from_url.py b/web_programming/emails_from_url.py
index 8d0384095a2e..ee138a646395 100644
--- a/web_programming/emails_from_url.py
+++ b/web_programming/emails_from_url.py
@@ -95,7 +95,7 @@ def emails_from_url(url: str = "https://github.com") -> list:
     except ValueError:
         exit(-1)
 
-    # Finally print list of email.
+    # Finally return a sorted list of email addresses with no duplicates.
     return sorted(valid_emails)
 
 

From f0f0118a9805da1921e46e3ff6238fbfe2ed3a45 Mon Sep 17 00:00:00 2001
From: Christian Clauss <cclauss@me.com>
Date: Wed, 26 Feb 2020 11:34:35 +0100
Subject: [PATCH 8/8] Add type hints

---
 web_programming/emails_from_url.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/web_programming/emails_from_url.py b/web_programming/emails_from_url.py
index ee138a646395..fba9f769bace 100644
--- a/web_programming/emails_from_url.py
+++ b/web_programming/emails_from_url.py
@@ -14,12 +14,12 @@
 
 
 class Parser(HTMLParser):
-    def __init__(self, domain):
+    def __init__(self, domain: str) -> None:
         HTMLParser.__init__(self)
         self.data = []
         self.domain = domain
 
-    def handle_starttag(self, tag, attrs):
+    def handle_starttag(self, tag: str, attrs: list) -> None:
         """
         This function parse html to take takes url from tags
         """
@@ -36,7 +36,7 @@ def handle_starttag(self, tag, attrs):
 
 
 # Get main domain name (example.com)
-def get_domain_name(url):
+def get_domain_name(url: str) -> str:
     """
     This function get the main domain name
 
@@ -49,7 +49,7 @@ def get_domain_name(url):
 
 
 # Get sub domain name (sub.example.com)
-def get_sub_domain_name(url):
+def get_sub_domain_name(url: str) -> str:
     """
     This function get sub domin name