From 24632d4320c7c83849446272a518511c4201bee2 Mon Sep 17 00:00:00 2001 From: = Date: Sun, 21 Jun 2020 12:11:04 -0400 Subject: [PATCH 01/21] NLP Word Frequency Algorithms --- machine_learning/word_frequency_functions.py | 136 +++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 machine_learning/word_frequency_functions.py diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py new file mode 100644 index 000000000000..a709f029ce13 --- /dev/null +++ b/machine_learning/word_frequency_functions.py @@ -0,0 +1,136 @@ +import string +from math import log10 + +""" Here I've implemented several word frequency functions + that are commonly used in information retrieval: Term Frequency, + Document Frequency, and TF-IDF (Term-Frequency*Inverse-Document-Frequency) + are included. + + Term Frequency is a statistical function that + returns a number representing how frequently + an expression occurs in a document.This + indicates how significant a particular term is in + a given document. + + Document Frequency is a statistical function that returns + an integer representing + the number of documents in a corpus that a term occurs in + (where the max integer returned would be the number of + documents in the corpus). + + Inverse Document Frequency is mathematically written as + log10(N/df), where N is the number of documents in your + corpus and df is the Document Frequency. If df is 0, a + ZeroDivisionError will be thrown. + + Term-Frequency*Inverse-Document-Frequency is a measure + of the originality of a term. It is mathematically written + as tf*log10(N/df). It compares the number of times + a term appears in a document with the number of documents + the term appears in. If df is 0, a ZeroDivisionError will be thrown. +""" + + +def term_frequency(term, document): + """ + A function that returns the number of times a term occurs within + a given document. 
+ @params: term, the term to search a document for, and document, + the document to search within + @returns: an integer representing the number of times a term is + found within the document + + @examples: + >>> document = "To be, or not to be" + >>> term = "to" + 2 + + >>> document = "Natural Language Processing is a subfield of Artificial Intelligence + concerned with interactions between computers and human languages" + >>> term = "NLP" + 0 + """ + # strip all punctuation and newlines and replace it with '' + document_without_punctuation = document.translate( + str.maketrans("", "", string.punctuation) + ).replace("\n", "") + tokenize_document = document_without_punctuation.split(" ") # word tokenization + term_frequency = len( + [word for word in tokenize_document if word.lower() == term.lower()] + ) + return term_frequency + + +def document_frequency(term, corpus): + """ + A function that calculates the number of documents in a corpus that contain a + given term + @params : term, the term to search each document for, and corpus, a collection of + documents. Each document should be separated by a newline. + @returns : the number of documents in the corpus that contain the term you are + searching for and the number of documents in the corpus + @examples : + >>> corpus = + "This is the first document in the corpus.\n + ThIs is the second document in the corpus.\n + THIS is the third document in the corpus." 
+ >>> term = "first" + 1 + >>> term = "document" + 3 + >>> term = "this" + 3 + """ + corpus_without_punctuation = corpus.translate( + str.maketrans("", "", string.punctuation) + ) # strip all punctuation and replace it with '' + documents = corpus_without_punctuation.split("\n") + lowercase_documents = [document.lower() for document in documents] + document_frequency = len( + [document for document in lowercase_documents if term.lower() in document] + ) # number of documents that contain the term + return document_frequency, len(documents) + + +def inverse_document_frequency(df, N): + """ + A function that returns an integer denoting the importance + of a word. This measure of importance is + calculated by log10(N/df), where N is the + number of documents and df is + the Document Frequency. + @params : df, the Document Frequency, and corpus, + a collection of documents separated + by a newline. + @returns : log10(N/df) + @examples : + >>> df = 1 + >>> corpus = + "This is the first document in the corpus.\n + ThIs is the second document in the corpus.\n + THIS is the third document in the corpus." + log10(3/1) = .477 + >>> df = 3 + log10(3/3) = log10(1) = 0 + >>> df = 0 + log10(3/0) -> throws ZeroDivisionError + """ + try: + idf = round(log10(N / df), 3) + return idf + except ZeroDivisionError: + print("The term you searched for is not in the corpus.") + + +def tf_idf(tf, idf): + """ + A function that combines the term frequency + and inverse document frequency functions to + calculate the originality of a term. 
This + 'originality' is calculated by multiplying + the term frequency and the inverse document + frequency : tf-idf = TF * IDF + @params : tf, the term frequency, and idf, the inverse document + frequency + """ + return round(tf * idf, 3) From cbb5f41d122a39ae170864b5a1bb82400f9ea127 Mon Sep 17 00:00:00 2001 From: = Date: Mon, 22 Jun 2020 17:23:51 -0400 Subject: [PATCH 02/21] Added type hints and Wikipedia link to tf-idf --- machine_learning/word_frequency_functions.py | 39 +++++++++++--------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index a709f029ce13..a9e8211543cd 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -1,22 +1,31 @@ import string from math import log10 -""" Here I've implemented several word frequency functions +""" + tf-idf Wikipedia: https://en.wikipedia.org/wiki/Tf%E2%80%93idf + tf-idf and other word frequency algorithms are often used + as a weighting factor in information retrieval and text + mining. 83% of text-based recommender systems use + tf-idf for term weighting. In Layman's terms, tf-idf + is a statistic intended to reflect how important a word + is to a document in a corpus (a collection of documents) + + + Here I've implemented several word frequency algorithms that are commonly used in information retrieval: Term Frequency, Document Frequency, and TF-IDF (Term-Frequency*Inverse-Document-Frequency) are included. Term Frequency is a statistical function that returns a number representing how frequently - an expression occurs in a document.This + an expression occurs in a document. This indicates how significant a particular term is in a given document. 
Document Frequency is a statistical function that returns - an integer representing - the number of documents in a corpus that a term occurs in - (where the max integer returned would be the number of - documents in the corpus). + an integer representing the number of documents in a + corpus that a term occurs in (where the max number returned + would be the number of documents in the corpus). Inverse Document Frequency is mathematically written as log10(N/df), where N is the number of documents in your @@ -31,7 +40,7 @@ """ -def term_frequency(term, document): +def term_frequency(term : str, document : str) -> int: """ A function that returns the number of times a term occurs within a given document. @@ -61,7 +70,7 @@ def term_frequency(term, document): return term_frequency -def document_frequency(term, corpus): +def document_frequency(term: str, corpus: str) -> int: """ A function that calculates the number of documents in a corpus that contain a given term @@ -92,23 +101,19 @@ def document_frequency(term, corpus): return document_frequency, len(documents) -def inverse_document_frequency(df, N): +def inverse_document_frequency(df : int, N: int) -> int: """ A function that returns an integer denoting the importance of a word. This measure of importance is calculated by log10(N/df), where N is the number of documents and df is the Document Frequency. - @params : df, the Document Frequency, and corpus, - a collection of documents separated - by a newline. + @params : df, the Document Frequency, and N, + the number of documents in the corpus. @returns : log10(N/df) @examples : >>> df = 1 - >>> corpus = - "This is the first document in the corpus.\n - ThIs is the second document in the corpus.\n - THIS is the third document in the corpus." 
+ >>> N = 3 log10(3/1) = .477 >>> df = 3 log10(3/3) = log10(1) = 0 @@ -122,7 +127,7 @@ def inverse_document_frequency(df, N): print("The term you searched for is not in the corpus.") -def tf_idf(tf, idf): +def tf_idf(tf : int, idf: int) -> int: """ A function that combines the term frequency and inverse document frequency functions to From eb260b0b3664a17a8e3e8c6ddfb827849f8ab8d4 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Mon, 22 Jun 2020 17:24:53 -0400 Subject: [PATCH 03/21] Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss --- machine_learning/word_frequency_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index a9e8211543cd..15c6d8153b4c 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -51,7 +51,7 @@ def term_frequency(term : str, document : str) -> int: @examples: >>> document = "To be, or not to be" - >>> term = "to" + >>> term_frequency("to", "To be, or not to be") 2 >>> document = "Natural Language Processing is a subfield of Artificial Intelligence From e961f523b864dd8a87d7b7394e39e76ad56fb26a Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Mon, 22 Jun 2020 17:25:02 -0400 Subject: [PATCH 04/21] Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss --- machine_learning/word_frequency_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index 15c6d8153b4c..8123b6479b74 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -54,7 +54,7 @@ def term_frequency(term : str, document : str) -> int: >>> term_frequency("to", "To be, or not to be") 2 - >>> document = "Natural Language Processing is a subfield of Artificial Intelligence + >>> 
document = ("Natural Language Processing is a subfield of Artificial Intelligence " concerned with interactions between computers and human languages" >>> term = "NLP" 0 From e6b2357cedbcdd362d568bfacbde8cbfa3798bbd Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Mon, 22 Jun 2020 17:25:13 -0400 Subject: [PATCH 05/21] Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss --- machine_learning/word_frequency_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index 8123b6479b74..0b96b2e0de92 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -55,7 +55,7 @@ def term_frequency(term : str, document : str) -> int: 2 >>> document = ("Natural Language Processing is a subfield of Artificial Intelligence " - concerned with interactions between computers and human languages" + ... "concerned with interactions between computers and human languages") >>> term = "NLP" 0 """ From aa61ec8247eff5145823e5d2de9e3915d6cdbd45 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Mon, 22 Jun 2020 17:25:22 -0400 Subject: [PATCH 06/21] Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss --- machine_learning/word_frequency_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index 0b96b2e0de92..be78b78b01b3 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -56,7 +56,7 @@ def term_frequency(term : str, document : str) -> int: >>> document = ("Natural Language Processing is a subfield of Artificial Intelligence " ... 
"concerned with interactions between computers and human languages") - >>> term = "NLP" + >>> term_frequency("NLP", document) 0 """ # strip all punctuation and newlines and replace it with '' From bed579d82575627e5ffb04da76aafc2414bfae11 Mon Sep 17 00:00:00 2001 From: = Date: Mon, 22 Jun 2020 17:44:12 -0400 Subject: [PATCH 07/21] Fix line length for flake8 --- machine_learning/word_frequency_functions.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index a9e8211543cd..684bcad7c69c 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -54,8 +54,9 @@ def term_frequency(term : str, document : str) -> int: >>> term = "to" 2 - >>> document = "Natural Language Processing is a subfield of Artificial Intelligence - concerned with interactions between computers and human languages" + >>> document = "Natural Language Processing is a subfield of + Artificial Intelligence concerned with interactions + between computers and human languages" >>> term = "NLP" 0 """ From 9ef8e626947c4471efd62f73c596603800112a87 Mon Sep 17 00:00:00 2001 From: = Date: Mon, 22 Jun 2020 17:52:15 -0400 Subject: [PATCH 08/21] Fix line length for flake8 V2 --- machine_learning/word_frequency_functions.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index 073d522431d5..eb0f311b7ed3 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -54,16 +54,10 @@ def term_frequency(term : str, document : str) -> int: >>> term_frequency("to", "To be, or not to be") 2 -<<<<<<< HEAD >>> document = "Natural Language Processing is a subfield of Artificial Intelligence concerned with interactions between computers and human languages" >>> term = "NLP" -======= - >>> document = ("Natural 
Language Processing is a subfield of Artificial Intelligence " - ... "concerned with interactions between computers and human languages") - >>> term_frequency("NLP", document) ->>>>>>> aa61ec8247eff5145823e5d2de9e3915d6cdbd45 0 """ # strip all punctuation and newlines and replace it with '' From 1152eddfc13313aa7cfd2d7d6c4b037e567f46be Mon Sep 17 00:00:00 2001 From: = Date: Mon, 22 Jun 2020 18:44:05 -0400 Subject: [PATCH 09/21] Add line escapes and change int to float --- machine_learning/word_frequency_functions.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index eb0f311b7ed3..f877dd839425 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -54,8 +54,8 @@ def term_frequency(term : str, document : str) -> int: >>> term_frequency("to", "To be, or not to be") 2 - >>> document = "Natural Language Processing is a subfield of - Artificial Intelligence concerned with interactions + >>> document = "Natural Language Processing is a subfield of \ + Artificial Intelligence concerned with interactions \ between computers and human languages" >>> term = "NLP" 0 @@ -80,7 +80,7 @@ def document_frequency(term: str, corpus: str) -> int: @returns : the number of documents in the corpus that contain the term you are searching for and the number of documents in the corpus @examples : - >>> corpus = + >>> corpus = \ "This is the first document in the corpus.\n ThIs is the second document in the corpus.\n THIS is the third document in the corpus." @@ -102,7 +102,7 @@ def document_frequency(term: str, corpus: str) -> int: return document_frequency, len(documents) -def inverse_document_frequency(df : int, N: int) -> int: +def inverse_document_frequency(df : int, N: int) -> float: """ A function that returns an integer denoting the importance of a word. 
This measure of importance is @@ -128,7 +128,7 @@ def inverse_document_frequency(df : int, N: int) -> int: print("The term you searched for is not in the corpus.") -def tf_idf(tf : int, idf: int) -> int: +def tf_idf(tf : int, idf: int) -> float: """ A function that combines the term frequency and inverse document frequency functions to From e8890d62f388129ac85bddb63efaf22be44adc56 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Mon, 22 Jun 2020 20:27:11 -0400 Subject: [PATCH 10/21] Corrected doctests --- machine_learning/word_frequency_functions.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index f877dd839425..033a1ad47365 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -53,12 +53,6 @@ def term_frequency(term : str, document : str) -> int: >>> document = "To be, or not to be" >>> term_frequency("to", "To be, or not to be") 2 - - >>> document = "Natural Language Processing is a subfield of \ - Artificial Intelligence concerned with interactions \ - between computers and human languages" - >>> term = "NLP" - 0 """ # strip all punctuation and newlines and replace it with '' document_without_punctuation = document.translate( @@ -81,8 +75,8 @@ def document_frequency(term: str, corpus: str) -> int: searching for and the number of documents in the corpus @examples : >>> corpus = \ - "This is the first document in the corpus.\n - ThIs is the second document in the corpus.\n + "This is the first document in the corpus.\n \ + ThIs is the second document in the corpus.\n \ THIS is the third document in the corpus." 
>>> term = "first" 1 @@ -115,9 +109,9 @@ def inverse_document_frequency(df : int, N: int) -> float: @examples : >>> df = 1 >>> N = 3 - log10(3/1) = .477 + .477 >>> df = 3 - log10(3/3) = log10(1) = 0 + 0 >>> df = 0 log10(3/0) -> throws ZeroDivisionError """ From bcbb8f680ec415fc9deb9380e9fd0736afe843fc Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Mon, 22 Jun 2020 20:57:43 -0400 Subject: [PATCH 11/21] Fix for TravisCI --- machine_learning/word_frequency_functions.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index 033a1ad47365..d33adb0e8dec 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -75,9 +75,7 @@ def document_frequency(term: str, corpus: str) -> int: searching for and the number of documents in the corpus @examples : >>> corpus = \ - "This is the first document in the corpus.\n \ - ThIs is the second document in the corpus.\n \ - THIS is the third document in the corpus." + "This is the first document in the corpus.\n ThIs is the second document in the corpus. \n THIS is the third document in the corpus." 
>>> term = "first" 1 >>> term = "document" @@ -110,10 +108,6 @@ def inverse_document_frequency(df : int, N: int) -> float: >>> df = 1 >>> N = 3 .477 - >>> df = 3 - 0 - >>> df = 0 - log10(3/0) -> throws ZeroDivisionError """ try: idf = round(log10(N / df), 3) From a2628d47bcc6ade369b8bd9211c2a0ba56848834 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Mon, 22 Jun 2020 21:10:37 -0400 Subject: [PATCH 12/21] Fix for TravisCI V2 --- machine_learning/word_frequency_functions.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index d33adb0e8dec..22627e20e018 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -75,7 +75,9 @@ def document_frequency(term: str, corpus: str) -> int: searching for and the number of documents in the corpus @examples : >>> corpus = \ - "This is the first document in the corpus.\n ThIs is the second document in the corpus. \n THIS is the third document in the corpus." + "This is the first document in the corpus.\n ThIs is \ + the second document in the corpus. \n THIS is \ + the third document in the corpus." 
>>> term = "first" 1 >>> term = "document" From a0bef59b3aa392a40e7afbb93e10198ac76aebf4 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Tue, 23 Jun 2020 11:42:37 -0400 Subject: [PATCH 13/21] Tests passing locally --- machine_learning/word_frequency_functions.py | 23 ++++++++------------ 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index 22627e20e018..7abcbf2b4f08 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -50,7 +50,6 @@ def term_frequency(term : str, document : str) -> int: found within the document @examples: - >>> document = "To be, or not to be" >>> term_frequency("to", "To be, or not to be") 2 """ @@ -74,16 +73,10 @@ def document_frequency(term: str, corpus: str) -> int: @returns : the number of documents in the corpus that contain the term you are searching for and the number of documents in the corpus @examples : - >>> corpus = \ - "This is the first document in the corpus.\n ThIs is \ - the second document in the corpus. \n THIS is \ - the third document in the corpus." - >>> term = "first" - 1 - >>> term = "document" - 3 - >>> term = "this" - 3 + >>> document_frequency("first", "This is the first document in the corpus.\\nThIs is\ +the second document in the corpus.\\nTHIS is \ +the third document in the corpus.") + (1, 3) """ corpus_without_punctuation = corpus.translate( str.maketrans("", "", string.punctuation) @@ -107,9 +100,8 @@ def inverse_document_frequency(df : int, N: int) -> float: the number of documents in the corpus. 
@returns : log10(N/df) @examples : - >>> df = 1 - >>> N = 3 - .477 + >>> inverse_document_frequency(1, 3) + 0.477 """ try: idf = round(log10(N / df), 3) @@ -128,5 +120,8 @@ def tf_idf(tf : int, idf: int) -> float: frequency : tf-idf = TF * IDF @params : tf, the term frequency, and idf, the inverse document frequency + @examples : + >>> tf_idf(2, 0.477) + 0.954 """ return round(tf * idf, 3) From 4cd803ae586e22fab762cf822d3dfdb40922a101 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Tue, 23 Jun 2020 11:49:36 -0400 Subject: [PATCH 14/21] Tests passing locally --- machine_learning/word_frequency_functions.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index 7abcbf2b4f08..acf72f80f4b8 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -73,8 +73,8 @@ def document_frequency(term: str, corpus: str) -> int: @returns : the number of documents in the corpus that contain the term you are searching for and the number of documents in the corpus @examples : - >>> document_frequency("first", "This is the first document in the corpus.\\nThIs is\ -the second document in the corpus.\\nTHIS is \ + >>> document_frequency("first", "This is the first document in the corpus.\\nThIs\ +is the second document in the corpus.\\nTHIS is \ the third document in the corpus.") (1, 3) """ @@ -120,7 +120,7 @@ def tf_idf(tf : int, idf: int) -> float: frequency : tf-idf = TF * IDF @params : tf, the term frequency, and idf, the inverse document frequency - @examples : + @examples : >>> tf_idf(2, 0.477) 0.954 """ From fcc07c930ecb616c27d028f98169406c8def7f52 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Wed, 24 Jun 2020 15:37:30 -0400 Subject: [PATCH 15/21] Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss --- machine_learning/word_frequency_functions.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index acf72f80f4b8..6b3fc43f812d 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -66,7 +66,7 @@ def term_frequency(term : str, document : str) -> int: def document_frequency(term: str, corpus: str) -> int: """ - A function that calculates the number of documents in a corpus that contain a + Calculate the number of documents in a corpus that contain a given term @params : term, the term to search each document for, and corpus, a collection of documents. Each document should be separated by a newline. From d35b5a69048eb01393558bcc355a15c11a69d41a Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Wed, 24 Jun 2020 15:37:36 -0400 Subject: [PATCH 16/21] Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss --- machine_learning/word_frequency_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index 6b3fc43f812d..fc854fd5f67b 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -91,7 +91,7 @@ def document_frequency(term: str, corpus: str) -> int: def inverse_document_frequency(df : int, N: int) -> float: """ - A function that returns an integer denoting the importance + Return an integer denoting the importance of a word. 
This measure of importance is calculated by log10(N/df), where N is the number of documents and df is From e901e096204eb00176c79fa2b3e7dd8f49118cf4 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Wed, 24 Jun 2020 15:37:45 -0400 Subject: [PATCH 17/21] Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss --- machine_learning/word_frequency_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index fc854fd5f67b..faae5a81c6eb 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -42,7 +42,7 @@ def term_frequency(term : str, document : str) -> int: """ - A function that returns the number of times a term occurs within + Return the number of times a term occurs within a given document. @params: term, the term to search a document for, and document, the document to search within From 0a85c0fbe1375f17cbde80f14764c49c08db4cd8 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Wed, 24 Jun 2020 15:37:51 -0400 Subject: [PATCH 18/21] Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss --- machine_learning/word_frequency_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index faae5a81c6eb..4434a6de808b 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -112,7 +112,7 @@ def inverse_document_frequency(df : int, N: int) -> float: def tf_idf(tf : int, idf: int) -> float: """ - A function that combines the term frequency + Combine the term frequency and inverse document frequency functions to calculate the originality of a term. 
This 'originality' is calculated by multiplying From fcef21e5889f601279f9d269ddfa322a60e01ad0 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Wed, 24 Jun 2020 16:15:30 -0400 Subject: [PATCH 19/21] Add doctest examples and clean up docstrings --- machine_learning/word_frequency_functions.py | 34 ++++++++++++-------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index acf72f80f4b8..a105e30f5d3b 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -42,7 +42,7 @@ def term_frequency(term : str, document : str) -> int: """ - A function that returns the number of times a term occurs within + Return the number of times a term occurs within a given document. @params: term, the term to search a document for, and document, the document to search within @@ -58,15 +58,14 @@ def term_frequency(term : str, document : str) -> int: str.maketrans("", "", string.punctuation) ).replace("\n", "") tokenize_document = document_without_punctuation.split(" ") # word tokenization - term_frequency = len( + return len( [word for word in tokenize_document if word.lower() == term.lower()] ) - return term_frequency def document_frequency(term: str, corpus: str) -> int: """ - A function that calculates the number of documents in a corpus that contain a + Calculate the number of documents in a corpus that contain a given term @params : term, the term to search each document for, and corpus, a collection of documents. Each document should be separated by a newline. 
@@ -83,15 +82,14 @@ def document_frequency(term: str, corpus: str) -> int: ) # strip all punctuation and replace it with '' documents = corpus_without_punctuation.split("\n") lowercase_documents = [document.lower() for document in documents] - document_frequency = len( + return len( [document for document in lowercase_documents if term.lower() in document] - ) # number of documents that contain the term - return document_frequency, len(documents) + ), len(documents) def inverse_document_frequency(df : int, N: int) -> float: """ - A function that returns an integer denoting the importance + Return an integer denoting the importance of a word. This measure of importance is calculated by log10(N/df), where N is the number of documents and df is @@ -100,19 +98,27 @@ def inverse_document_frequency(df : int, N: int) -> float: the number of documents in the corpus. @returns : log10(N/df) @examples : + >>> inverse_document_frequency(3, 0) + Traceback (most recent call last): + ... + ValueError: log10(0) is undefined. >>> inverse_document_frequency(1, 3) 0.477 + >>> inverse_document_frequency(0, 3) + Traceback (most recent call last): + ... + ZeroDivisionError: df must be > 0 """ - try: - idf = round(log10(N / df), 3) - return idf - except ZeroDivisionError: - print("The term you searched for is not in the corpus.") + if df == 0: + raise ZeroDivisionError("df must be > 0") + elif N == 0: + raise ValueError("log10(0) is undefined.") + return round(log10(N / df), 3) def tf_idf(tf : int, idf: int) -> float: """ - A function that combines the term frequency + Combine the term frequency and inverse document frequency functions to calculate the originality of a term. 
This 'originality' is calculated by multiplying From f669051e95d3e63006ae9338dfe2022811a89e15 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Thu, 9 Jul 2020 14:57:38 -0400 Subject: [PATCH 20/21] Added Standardization and Normalization algorithms --- machine_learning/data_transformations.py | 101 +++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 machine_learning/data_transformations.py diff --git a/machine_learning/data_transformations.py b/machine_learning/data_transformations.py new file mode 100644 index 000000000000..ff3f8d53ab81 --- /dev/null +++ b/machine_learning/data_transformations.py @@ -0,0 +1,101 @@ +""" + Normalization Wikipedia: https://en.wikipedia.org/wiki/Normalization + Standardization Wikipedia: https://en.wikipedia.org/wiki/Standardization + + Normalization is the process of converting numerical data to a standard + range of values. This range is typically between [0, 1] or [-1, 1]. + The equation for normalization is x_norm = (x - x_min)/(x_max - x_min) + where x_norm is the normalized value, x is the value, x_min is the + minimum value within the column or list of data, and x_max is the + maximum value within the column or list of data. Normalization is + used to speed up the training of data and put all of the data + on a similar scale. This is useful because variance in the range of + values of a dataset can heavily impact optimization + (particularly Gradient Descent). + + Standardization is the process of converting numerical data to a normally + distributed range of values. This range will have a mean of 0 and standard + deviation of 1. This is also known as z-score normalization. The equation for + standardization is x_std = (x - mu)/(sigma) where mu is the mean of the + column or list of values and sigma is the standard deviation of the column + or list of values. 
+ + Choosing between Normalization & Standardization is more of an art than a science, + but it is often recommended to run experiments with both to see which performs + better. Additionally, a few rules of thumb are: + 1. gaussian (normal) distributions work better with standardization + 2. non-gaussian (non-normal) distributions work better with normalization + 3. If a column or list of values has extreme values / outliers, use + standardization +""" + + +def normalization(data : list) -> list: + """ + Returns a normalized list of values + @params: data, a list of values to normalize + @returns: a list of normalized values (rounded to 3 decimals) + @examples: + >>> normalization([2, 7, 10, 20, 30, 50]) + [0.0, 0.104, 0.167, 0.375, 0.583, 1.0] + + >>> normalization([5, 10, 15, 20, 25]) + [0.0, 0.25, 0.5, 0.75, 1.0] + """ + # variables for calculation + x_min = min(data) + x_max = max(data) + # normalize data + return [round((x - x_min) / (x_max - x_min), 3) for x in data] + + +def standardization(data : list) -> list: + """ + Returns a standardized list of values + @params: data, a list of values to standardize + @returns: a list of standardized values (rounded to 3 decimals) + @examples: + >>> standardization([2, 7, 10, 20, 30, 50]) + [-1.095, -0.788, -0.604, 0.01, 0.624, 1.852] + + >>> standardization([5, 10, 15, 20, 25]) + [-1.414, -0.707, 0.0, 0.707, 1.414] + """ + # variables for calculation + mu = mean(data) + sigma = stdDeviation(data) + + # standardize data + return [round((x - mu) / (sigma), 3) for x in data] + + +def mean(data : list) -> float: + """ + Helper function that returns the mean of a list of values + @params: data, a list of values + @returns: a float representing the mean (rounded to 3 decimals) + @examples: + >>> mean([2, 7, 10, 20, 30, 50]) + 19.833 + + >>> mean([5, 10, 15, 20, 25]) + 15.0 + """ + return round(sum(data) / len(data), 3) + + +def stdDeviation(data : list) -> float: + """ + Helper function that returns the standard deviation of
a list of values + @params: data, a list of values + @returns: a float representing the standard deviation (rounded to 3 decimals) + @examples: + >>> stdDeviation([2, 7, 10, 20, 30, 50]) + 16.293 + + >>> stdDeviation([5, 10, 15, 20, 25]) + 7.071 + """ + x_mean = mean(data) + sum_squared_diff = sum([(x - x_mean)**2 for x in data]) + return round(((sum_squared_diff) / len(data))**.5, 3) From f273d6a6b9986110b9060db0c4122b0029bc086b Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Thu, 9 Jul 2020 15:18:38 -0400 Subject: [PATCH 21/21] Delete word_frequency_functions.py --- machine_learning/word_frequency_functions.py | 133 ------------------- 1 file changed, 133 deletions(-) delete mode 100644 machine_learning/word_frequency_functions.py diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py deleted file mode 100644 index 250741146623..000000000000 --- a/machine_learning/word_frequency_functions.py +++ /dev/null @@ -1,133 +0,0 @@ -import string -from math import log10 - -""" - tf-idf Wikipedia: https://en.wikipedia.org/wiki/Tf%E2%80%93idf - tf-idf and other word frequency algorithms are often used - as a weighting factor in information retrieval and text - mining. 83% of text-based recommender systems use - tf-idf for term weighting. In Layman's terms, tf-idf - is a statistic intended to reflect how important a word - is to a document in a corpus (a collection of documents) - - - Here I've implemented several word frequency algorithms - that are commonly used in information retrieval: Term Frequency, - Document Frequency, and TF-IDF (Term-Frequency*Inverse-Document-Frequency) - are included. - - Term Frequency is a statistical function that - returns a number representing how frequently - an expression occurs in a document. This - indicates how significant a particular term is in - a given document.
- - Document Frequency is a statistical function that returns - an integer representing the number of documents in a - corpus that a term occurs in (where the max number returned - would be the number of documents in the corpus). - - Inverse Document Frequency is mathematically written as - log10(N/df), where N is the number of documents in your - corpus and df is the Document Frequency. If df is 0, a - ZeroDivisionError will be thrown. - - Term-Frequency*Inverse-Document-Frequency is a measure - of the originality of a term. It is mathematically written - as tf*log10(N/df). It compares the number of times - a term appears in a document with the number of documents - the term appears in. If df is 0, a ZeroDivisionError will be thrown. -""" - - -def term_frequency(term: str, document: str) -> int: - """ - Return the number of times a term occurs within - a given document. - @params: term, the term to search a document for, and document, - the document to search within - @returns: an integer representing the number of times a term is - found within the document - - @examples: - >>> term_frequency("to", "To be, or not to be") - 2 - """ - # strip all punctuation and newlines and replace it with '' - document_without_punctuation = document.translate( - str.maketrans("", "", string.punctuation) - ).replace("\n", "") - tokenize_document = document_without_punctuation.split(" ") # word tokenization - return len( - [word for word in tokenize_document if word.lower() == term.lower()] - ) - - -def document_frequency(term: str, corpus: str) -> int: - """ - Calculate the number of documents in a corpus that contain a - given term - @params : term, the term to search each document for, and corpus, a collection of - documents. Each document should be separated by a newline. 
- @returns : the number of documents in the corpus that contain the term you are - searching for and the number of documents in the corpus - @examples : - >>> document_frequency("first", "This is the first document in the corpus.\\nThIs\ -is the second document in the corpus.\\nTHIS is \ -the third document in the corpus.") - (1, 3) - """ - corpus_without_punctuation = corpus.translate( - str.maketrans("", "", string.punctuation) - ) # strip all punctuation and replace it with '' - documents = corpus_without_punctuation.split("\n") - lowercase_documents = [document.lower() for document in documents] - return len( - [document for document in lowercase_documents if term.lower() in document] - ), len(documents) - - -def inverse_document_frequency(df: int, N: int) -> float: - """ - Return an integer denoting the importance - of a word. This measure of importance is - calculated by log10(N/df), where N is the - number of documents and df is - the Document Frequency. - @params : df, the Document Frequency, and N, - the number of documents in the corpus. - @returns : log10(N/df) - @examples : - >>> inverse_document_frequency(3, 0) - Traceback (most recent call last): - ... - ValueError: log10(0) is undefined. - >>> inverse_document_frequency(1, 3) - 0.477 - >>> inverse_document_frequency(0, 3) - Traceback (most recent call last): - ... - ZeroDivisionError: df must be > 0 - """ - if df == 0: - raise ZeroDivisionError("df must be > 0") - elif N == 0: - raise ValueError("log10(0) is undefined.") - return round(log10(N / df), 3) - - -def tf_idf(tf: int, idf: int) -> float: - """ - Combine the term frequency - and inverse document frequency functions to - calculate the originality of a term. 
This - 'originality' is calculated by multiplying - the term frequency and the inverse document - frequency : tf-idf = TF * IDF - @params : tf, the term frequency, and idf, the inverse document - frequency - @examples : - >>> tf_idf(2, 0.477) - 0.954 - """ - return round(tf * idf, 3) \ No newline at end of file