From 24632d4320c7c83849446272a518511c4201bee2 Mon Sep 17 00:00:00 2001 From: = Date: Sun, 21 Jun 2020 12:11:04 -0400 Subject: [PATCH 01/21] NLP Word Frequency Algorithms --- machine_learning/word_frequency_functions.py | 136 +++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 machine_learning/word_frequency_functions.py diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py new file mode 100644 index 000000000000..a709f029ce13 --- /dev/null +++ b/machine_learning/word_frequency_functions.py @@ -0,0 +1,136 @@ +import string +from math import log10 + +""" Here I've implemented several word frequency functions + that are commonly used in information retrieval: Term Frequency, + Document Frequency, and TF-IDF (Term-Frequency*Inverse-Document-Frequency) + are included. + + Term Frequency is a statistical function that + returns a number representing how frequently + an expression occurs in a document.This + indicates how significant a particular term is in + a given document. + + Document Frequency is a statistical function that returns + an integer representing + the number of documents in a corpus that a term occurs in + (where the max integer returned would be the number of + documents in the corpus). + + Inverse Document Frequency is mathematically written as + log10(N/df), where N is the number of documents in your + corpus and df is the Document Frequency. If df is 0, a + ZeroDivisionError will be thrown. + + Term-Frequency*Inverse-Document-Frequency is a measure + of the originality of a term. It is mathematically written + as tf*log10(N/df). It compares the number of times + a term appears in a document with the number of documents + the term appears in. If df is 0, a ZeroDivisionError will be thrown. +""" + + +def term_frequency(term, document): + """ + A function that returns the number of times a term occurs within + a given document. 
+ @params: term, the term to search a document for, and document, + the document to search within + @returns: an integer representing the number of times a term is + found within the document + + @examples: + >>> document = "To be, or not to be" + >>> term = "to" + 2 + + >>> document = "Natural Language Processing is a subfield of Artificial Intelligence + concerned with interactions between computers and human languages" + >>> term = "NLP" + 0 + """ + # strip all punctuation and newlines and replace it with '' + document_without_punctuation = document.translate( + str.maketrans("", "", string.punctuation) + ).replace("\n", "") + tokenize_document = document_without_punctuation.split(" ") # word tokenization + term_frequency = len( + [word for word in tokenize_document if word.lower() == term.lower()] + ) + return term_frequency + + +def document_frequency(term, corpus): + """ + A function that calculates the number of documents in a corpus that contain a + given term + @params : term, the term to search each document for, and corpus, a collection of + documents. Each document should be separated by a newline. + @returns : the number of documents in the corpus that contain the term you are + searching for and the number of documents in the corpus + @examples : + >>> corpus = + "This is the first document in the corpus.\n + ThIs is the second document in the corpus.\n + THIS is the third document in the corpus." 
+ >>> term = "first" + 1 + >>> term = "document" + 3 + >>> term = "this" + 3 + """ + corpus_without_punctuation = corpus.translate( + str.maketrans("", "", string.punctuation) + ) # strip all punctuation and replace it with '' + documents = corpus_without_punctuation.split("\n") + lowercase_documents = [document.lower() for document in documents] + document_frequency = len( + [document for document in lowercase_documents if term.lower() in document] + ) # number of documents that contain the term + return document_frequency, len(documents) + + +def inverse_document_frequency(df, N): + """ + A function that returns an integer denoting the importance + of a word. This measure of importance is + calculated by log10(N/df), where N is the + number of documents and df is + the Document Frequency. + @params : df, the Document Frequency, and corpus, + a collection of documents separated + by a newline. + @returns : log10(N/df) + @examples : + >>> df = 1 + >>> corpus = + "This is the first document in the corpus.\n + ThIs is the second document in the corpus.\n + THIS is the third document in the corpus." + log10(3/1) = .477 + >>> df = 3 + log10(3/3) = log10(1) = 0 + >>> df = 0 + log10(3/0) -> throws ZeroDivisionError + """ + try: + idf = round(log10(N / df), 3) + return idf + except ZeroDivisionError: + print("The term you searched for is not in the corpus.") + + +def tf_idf(tf, idf): + """ + A function that combines the term frequency + and inverse document frequency functions to + calculate the originality of a term. 
This + 'originality' is calculated by multiplying + the term frequency and the inverse document + frequency : tf-idf = TF * IDF + @params : tf, the term frequency, and idf, the inverse document + frequency + """ + return round(tf * idf, 3) From cbb5f41d122a39ae170864b5a1bb82400f9ea127 Mon Sep 17 00:00:00 2001 From: = Date: Mon, 22 Jun 2020 17:23:51 -0400 Subject: [PATCH 02/21] Added type hints and Wikipedia link to tf-idf --- machine_learning/word_frequency_functions.py | 39 +++++++++++--------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index a709f029ce13..a9e8211543cd 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -1,22 +1,31 @@ import string from math import log10 -""" Here I've implemented several word frequency functions +""" + tf-idf Wikipedia: https://en.wikipedia.org/wiki/Tf%E2%80%93idf + tf-idf and other word frequency algorithms are often used + as a weighting factor in information retrieval and text + mining. 83% of text-based recommender systems use + tf-idf for term weighting. In Layman's terms, tf-idf + is a statistic intended to reflect how important a word + is to a document in a corpus (a collection of documents) + + + Here I've implemented several word frequency algorithms that are commonly used in information retrieval: Term Frequency, Document Frequency, and TF-IDF (Term-Frequency*Inverse-Document-Frequency) are included. Term Frequency is a statistical function that returns a number representing how frequently - an expression occurs in a document.This + an expression occurs in a document. This indicates how significant a particular term is in a given document. 
Document Frequency is a statistical function that returns - an integer representing - the number of documents in a corpus that a term occurs in - (where the max integer returned would be the number of - documents in the corpus). + an integer representing the number of documents in a + corpus that a term occurs in (where the max number returned + would be the number of documents in the corpus). Inverse Document Frequency is mathematically written as log10(N/df), where N is the number of documents in your @@ -31,7 +40,7 @@ """ -def term_frequency(term, document): +def term_frequency(term : str, document : str) -> int: """ A function that returns the number of times a term occurs within a given document. @@ -61,7 +70,7 @@ def term_frequency(term, document): return term_frequency -def document_frequency(term, corpus): +def document_frequency(term: str, corpus: str) -> int: """ A function that calculates the number of documents in a corpus that contain a given term @@ -92,23 +101,19 @@ def document_frequency(term, corpus): return document_frequency, len(documents) -def inverse_document_frequency(df, N): +def inverse_document_frequency(df : int, N: int) -> int: """ A function that returns an integer denoting the importance of a word. This measure of importance is calculated by log10(N/df), where N is the number of documents and df is the Document Frequency. - @params : df, the Document Frequency, and corpus, - a collection of documents separated - by a newline. + @params : df, the Document Frequency, and N, + the number of documents in the corpus. @returns : log10(N/df) @examples : >>> df = 1 - >>> corpus = - "This is the first document in the corpus.\n - ThIs is the second document in the corpus.\n - THIS is the third document in the corpus." 
+ >>> N = 3 log10(3/1) = .477 >>> df = 3 log10(3/3) = log10(1) = 0 @@ -122,7 +127,7 @@ def inverse_document_frequency(df, N): print("The term you searched for is not in the corpus.") -def tf_idf(tf, idf): +def tf_idf(tf : int, idf: int) -> int: """ A function that combines the term frequency and inverse document frequency functions to From eb260b0b3664a17a8e3e8c6ddfb827849f8ab8d4 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Mon, 22 Jun 2020 17:24:53 -0400 Subject: [PATCH 03/21] Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss --- machine_learning/word_frequency_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index a9e8211543cd..15c6d8153b4c 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -51,7 +51,7 @@ def term_frequency(term : str, document : str) -> int: @examples: >>> document = "To be, or not to be" - >>> term = "to" + >>> term_frequency("to", "To be, or not to be") 2 >>> document = "Natural Language Processing is a subfield of Artificial Intelligence From e961f523b864dd8a87d7b7394e39e76ad56fb26a Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Mon, 22 Jun 2020 17:25:02 -0400 Subject: [PATCH 04/21] Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss --- machine_learning/word_frequency_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index 15c6d8153b4c..8123b6479b74 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -54,7 +54,7 @@ def term_frequency(term : str, document : str) -> int: >>> term_frequency("to", "To be, or not to be") 2 - >>> document = "Natural Language Processing is a subfield of Artificial Intelligence + >>> 
document = ("Natural Language Processing is a subfield of Artificial Intelligence " concerned with interactions between computers and human languages" >>> term = "NLP" 0 From e6b2357cedbcdd362d568bfacbde8cbfa3798bbd Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Mon, 22 Jun 2020 17:25:13 -0400 Subject: [PATCH 05/21] Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss --- machine_learning/word_frequency_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index 8123b6479b74..0b96b2e0de92 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -55,7 +55,7 @@ def term_frequency(term : str, document : str) -> int: 2 >>> document = ("Natural Language Processing is a subfield of Artificial Intelligence " - concerned with interactions between computers and human languages" + ... "concerned with interactions between computers and human languages") >>> term = "NLP" 0 """ From aa61ec8247eff5145823e5d2de9e3915d6cdbd45 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Mon, 22 Jun 2020 17:25:22 -0400 Subject: [PATCH 06/21] Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss --- machine_learning/word_frequency_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index 0b96b2e0de92..be78b78b01b3 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -56,7 +56,7 @@ def term_frequency(term : str, document : str) -> int: >>> document = ("Natural Language Processing is a subfield of Artificial Intelligence " ... 
"concerned with interactions between computers and human languages") - >>> term = "NLP" + >>> term_frequency("NLP", document) 0 """ # strip all punctuation and newlines and replace it with '' From bed579d82575627e5ffb04da76aafc2414bfae11 Mon Sep 17 00:00:00 2001 From: = Date: Mon, 22 Jun 2020 17:44:12 -0400 Subject: [PATCH 07/21] Fix line length for flake8 --- machine_learning/word_frequency_functions.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index a9e8211543cd..684bcad7c69c 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -54,8 +54,9 @@ def term_frequency(term : str, document : str) -> int: >>> term = "to" 2 - >>> document = "Natural Language Processing is a subfield of Artificial Intelligence - concerned with interactions between computers and human languages" + >>> document = "Natural Language Processing is a subfield of + Artificial Intelligence concerned with interactions + between computers and human languages" >>> term = "NLP" 0 """ From 9ef8e626947c4471efd62f73c596603800112a87 Mon Sep 17 00:00:00 2001 From: = Date: Mon, 22 Jun 2020 17:52:15 -0400 Subject: [PATCH 08/21] Fix line length for flake8 V2 --- machine_learning/word_frequency_functions.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index 073d522431d5..eb0f311b7ed3 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -54,16 +54,10 @@ def term_frequency(term : str, document : str) -> int: >>> term_frequency("to", "To be, or not to be") 2 -<<<<<<< HEAD >>> document = "Natural Language Processing is a subfield of Artificial Intelligence concerned with interactions between computers and human languages" >>> term = "NLP" -======= - >>> document = ("Natural 
Language Processing is a subfield of Artificial Intelligence " - ... "concerned with interactions between computers and human languages") - >>> term_frequency("NLP", document) ->>>>>>> aa61ec8247eff5145823e5d2de9e3915d6cdbd45 0 """ # strip all punctuation and newlines and replace it with '' From 1152eddfc13313aa7cfd2d7d6c4b037e567f46be Mon Sep 17 00:00:00 2001 From: = Date: Mon, 22 Jun 2020 18:44:05 -0400 Subject: [PATCH 09/21] Add line escapes and change int to float --- machine_learning/word_frequency_functions.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index eb0f311b7ed3..f877dd839425 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -54,8 +54,8 @@ def term_frequency(term : str, document : str) -> int: >>> term_frequency("to", "To be, or not to be") 2 - >>> document = "Natural Language Processing is a subfield of - Artificial Intelligence concerned with interactions + >>> document = "Natural Language Processing is a subfield of \ + Artificial Intelligence concerned with interactions \ between computers and human languages" >>> term = "NLP" 0 @@ -80,7 +80,7 @@ def document_frequency(term: str, corpus: str) -> int: @returns : the number of documents in the corpus that contain the term you are searching for and the number of documents in the corpus @examples : - >>> corpus = + >>> corpus = \ "This is the first document in the corpus.\n ThIs is the second document in the corpus.\n THIS is the third document in the corpus." @@ -102,7 +102,7 @@ def document_frequency(term: str, corpus: str) -> int: return document_frequency, len(documents) -def inverse_document_frequency(df : int, N: int) -> int: +def inverse_document_frequency(df : int, N: int) -> float: """ A function that returns an integer denoting the importance of a word. 
This measure of importance is @@ -128,7 +128,7 @@ def inverse_document_frequency(df : int, N: int) -> int: print("The term you searched for is not in the corpus.") -def tf_idf(tf : int, idf: int) -> int: +def tf_idf(tf : int, idf: int) -> float: """ A function that combines the term frequency and inverse document frequency functions to From e8890d62f388129ac85bddb63efaf22be44adc56 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Mon, 22 Jun 2020 20:27:11 -0400 Subject: [PATCH 10/21] Corrected doctests --- machine_learning/word_frequency_functions.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index f877dd839425..033a1ad47365 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -53,12 +53,6 @@ def term_frequency(term : str, document : str) -> int: >>> document = "To be, or not to be" >>> term_frequency("to", "To be, or not to be") 2 - - >>> document = "Natural Language Processing is a subfield of \ - Artificial Intelligence concerned with interactions \ - between computers and human languages" - >>> term = "NLP" - 0 """ # strip all punctuation and newlines and replace it with '' document_without_punctuation = document.translate( @@ -81,8 +75,8 @@ def document_frequency(term: str, corpus: str) -> int: searching for and the number of documents in the corpus @examples : >>> corpus = \ - "This is the first document in the corpus.\n - ThIs is the second document in the corpus.\n + "This is the first document in the corpus.\n \ + ThIs is the second document in the corpus.\n \ THIS is the third document in the corpus." 
>>> term = "first" 1 @@ -115,9 +109,9 @@ def inverse_document_frequency(df : int, N: int) -> float: @examples : >>> df = 1 >>> N = 3 - log10(3/1) = .477 + .477 >>> df = 3 - log10(3/3) = log10(1) = 0 + 0 >>> df = 0 log10(3/0) -> throws ZeroDivisionError """ From bcbb8f680ec415fc9deb9380e9fd0736afe843fc Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Mon, 22 Jun 2020 20:57:43 -0400 Subject: [PATCH 11/21] Fix for TravisCI --- machine_learning/word_frequency_functions.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index 033a1ad47365..d33adb0e8dec 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -75,9 +75,7 @@ def document_frequency(term: str, corpus: str) -> int: searching for and the number of documents in the corpus @examples : >>> corpus = \ - "This is the first document in the corpus.\n \ - ThIs is the second document in the corpus.\n \ - THIS is the third document in the corpus." + "This is the first document in the corpus.\n ThIs is the second document in the corpus. \n THIS is the third document in the corpus." 
>>> term = "first" 1 >>> term = "document" @@ -110,10 +108,6 @@ def inverse_document_frequency(df : int, N: int) -> float: >>> df = 1 >>> N = 3 .477 - >>> df = 3 - 0 - >>> df = 0 - log10(3/0) -> throws ZeroDivisionError """ try: idf = round(log10(N / df), 3) From a2628d47bcc6ade369b8bd9211c2a0ba56848834 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Mon, 22 Jun 2020 21:10:37 -0400 Subject: [PATCH 12/21] Fix for TravisCI V2 --- machine_learning/word_frequency_functions.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index d33adb0e8dec..22627e20e018 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -75,7 +75,9 @@ def document_frequency(term: str, corpus: str) -> int: searching for and the number of documents in the corpus @examples : >>> corpus = \ - "This is the first document in the corpus.\n ThIs is the second document in the corpus. \n THIS is the third document in the corpus." + "This is the first document in the corpus.\n ThIs is \ + the second document in the corpus. \n THIS is \ + the third document in the corpus." 
>>> term = "first" 1 >>> term = "document" From a0bef59b3aa392a40e7afbb93e10198ac76aebf4 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Tue, 23 Jun 2020 11:42:37 -0400 Subject: [PATCH 13/21] Tests passing locally --- machine_learning/word_frequency_functions.py | 23 ++++++++------------ 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index 22627e20e018..7abcbf2b4f08 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -50,7 +50,6 @@ def term_frequency(term : str, document : str) -> int: found within the document @examples: - >>> document = "To be, or not to be" >>> term_frequency("to", "To be, or not to be") 2 """ @@ -74,16 +73,10 @@ def document_frequency(term: str, corpus: str) -> int: @returns : the number of documents in the corpus that contain the term you are searching for and the number of documents in the corpus @examples : - >>> corpus = \ - "This is the first document in the corpus.\n ThIs is \ - the second document in the corpus. \n THIS is \ - the third document in the corpus." - >>> term = "first" - 1 - >>> term = "document" - 3 - >>> term = "this" - 3 + >>> document_frequency("first", "This is the first document in the corpus.\\nThIs is\ +the second document in the corpus.\\nTHIS is \ +the third document in the corpus.") + (1, 3) """ corpus_without_punctuation = corpus.translate( str.maketrans("", "", string.punctuation) @@ -107,9 +100,8 @@ def inverse_document_frequency(df : int, N: int) -> float: the number of documents in the corpus. 
@returns : log10(N/df) @examples : - >>> df = 1 - >>> N = 3 - .477 + >>> inverse_document_frequency(1, 3) + 0.477 """ try: idf = round(log10(N / df), 3) @@ -128,5 +120,8 @@ def tf_idf(tf : int, idf: int) -> float: frequency : tf-idf = TF * IDF @params : tf, the term frequency, and idf, the inverse document frequency + @examples : + >>> tf_idf(2, 0.477) + 0.954 """ return round(tf * idf, 3) From 4cd803ae586e22fab762cf822d3dfdb40922a101 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Tue, 23 Jun 2020 11:49:36 -0400 Subject: [PATCH 14/21] Tests passing locally --- machine_learning/word_frequency_functions.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index 7abcbf2b4f08..acf72f80f4b8 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -73,8 +73,8 @@ def document_frequency(term: str, corpus: str) -> int: @returns : the number of documents in the corpus that contain the term you are searching for and the number of documents in the corpus @examples : - >>> document_frequency("first", "This is the first document in the corpus.\\nThIs is\ -the second document in the corpus.\\nTHIS is \ + >>> document_frequency("first", "This is the first document in the corpus.\\nThIs\ +is the second document in the corpus.\\nTHIS is \ the third document in the corpus.") (1, 3) """ @@ -120,7 +120,7 @@ def tf_idf(tf : int, idf: int) -> float: frequency : tf-idf = TF * IDF @params : tf, the term frequency, and idf, the inverse document frequency - @examples : + @examples : >>> tf_idf(2, 0.477) 0.954 """ From fcc07c930ecb616c27d028f98169406c8def7f52 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Wed, 24 Jun 2020 15:37:30 -0400 Subject: [PATCH 15/21] Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss --- machine_learning/word_frequency_functions.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index acf72f80f4b8..6b3fc43f812d 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -66,7 +66,7 @@ def term_frequency(term : str, document : str) -> int: def document_frequency(term: str, corpus: str) -> int: """ - A function that calculates the number of documents in a corpus that contain a + Calculate the number of documents in a corpus that contain a given term @params : term, the term to search each document for, and corpus, a collection of documents. Each document should be separated by a newline. From d35b5a69048eb01393558bcc355a15c11a69d41a Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Wed, 24 Jun 2020 15:37:36 -0400 Subject: [PATCH 16/21] Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss --- machine_learning/word_frequency_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index 6b3fc43f812d..fc854fd5f67b 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -91,7 +91,7 @@ def document_frequency(term: str, corpus: str) -> int: def inverse_document_frequency(df : int, N: int) -> float: """ - A function that returns an integer denoting the importance + Return an integer denoting the importance of a word. 
This measure of importance is calculated by log10(N/df), where N is the number of documents and df is From e901e096204eb00176c79fa2b3e7dd8f49118cf4 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Wed, 24 Jun 2020 15:37:45 -0400 Subject: [PATCH 17/21] Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss --- machine_learning/word_frequency_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index fc854fd5f67b..faae5a81c6eb 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -42,7 +42,7 @@ def term_frequency(term : str, document : str) -> int: """ - A function that returns the number of times a term occurs within + Return the number of times a term occurs within a given document. @params: term, the term to search a document for, and document, the document to search within From 0a85c0fbe1375f17cbde80f14764c49c08db4cd8 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Wed, 24 Jun 2020 15:37:51 -0400 Subject: [PATCH 18/21] Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss --- machine_learning/word_frequency_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index faae5a81c6eb..4434a6de808b 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -112,7 +112,7 @@ def inverse_document_frequency(df : int, N: int) -> float: def tf_idf(tf : int, idf: int) -> float: """ - A function that combines the term frequency + Combine the term frequency and inverse document frequency functions to calculate the originality of a term. 
This 'originality' is calculated by multiplying From fcef21e5889f601279f9d269ddfa322a60e01ad0 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Wed, 24 Jun 2020 16:15:30 -0400 Subject: [PATCH 19/21] Add doctest examples and clean up docstrings --- machine_learning/word_frequency_functions.py | 34 ++++++++++++-------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index acf72f80f4b8..a105e30f5d3b 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -42,7 +42,7 @@ def term_frequency(term : str, document : str) -> int: """ - A function that returns the number of times a term occurs within + Return the number of times a term occurs within a given document. @params: term, the term to search a document for, and document, the document to search within @@ -58,15 +58,14 @@ def term_frequency(term : str, document : str) -> int: str.maketrans("", "", string.punctuation) ).replace("\n", "") tokenize_document = document_without_punctuation.split(" ") # word tokenization - term_frequency = len( + return len( [word for word in tokenize_document if word.lower() == term.lower()] ) - return term_frequency def document_frequency(term: str, corpus: str) -> int: """ - A function that calculates the number of documents in a corpus that contain a + Calculate the number of documents in a corpus that contain a given term @params : term, the term to search each document for, and corpus, a collection of documents. Each document should be separated by a newline. 
@@ -83,15 +82,14 @@ def document_frequency(term: str, corpus: str) -> int: ) # strip all punctuation and replace it with '' documents = corpus_without_punctuation.split("\n") lowercase_documents = [document.lower() for document in documents] - document_frequency = len( + return len( [document for document in lowercase_documents if term.lower() in document] - ) # number of documents that contain the term - return document_frequency, len(documents) + ), len(documents) def inverse_document_frequency(df : int, N: int) -> float: """ - A function that returns an integer denoting the importance + Return an integer denoting the importance of a word. This measure of importance is calculated by log10(N/df), where N is the number of documents and df is @@ -100,19 +98,27 @@ def inverse_document_frequency(df : int, N: int) -> float: the number of documents in the corpus. @returns : log10(N/df) @examples : + >>> inverse_document_frequency(3, 0) + Traceback (most recent call last): + ... + ValueError: log10(0) is undefined. >>> inverse_document_frequency(1, 3) 0.477 + >>> inverse_document_frequency(0, 3) + Traceback (most recent call last): + ... + ZeroDivisionError: df must be > 0 """ - try: - idf = round(log10(N / df), 3) - return idf - except ZeroDivisionError: - print("The term you searched for is not in the corpus.") + if df == 0: + raise ZeroDivisionError("df must be > 0") + elif N == 0: + raise ValueError("log10(0) is undefined.") + return round(log10(N / df), 3) def tf_idf(tf : int, idf: int) -> float: """ - A function that combines the term frequency + Combine the term frequency and inverse document frequency functions to calculate the originality of a term. 
This 'originality' is calculated by multiplying From f669051e95d3e63006ae9338dfe2022811a89e15 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Thu, 9 Jul 2020 14:57:38 -0400 Subject: [PATCH 20/21] Added Standardization and Normalization algorithms --- machine_learning/data_transformations.py | 101 +++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 machine_learning/data_transformations.py diff --git a/machine_learning/data_transformations.py b/machine_learning/data_transformations.py new file mode 100644 index 000000000000..ff3f8d53ab81 --- /dev/null +++ b/machine_learning/data_transformations.py @@ -0,0 +1,101 @@ +""" + Normalization Wikipedia: https://en.wikipedia.org/wiki/Normalization + Standardization Wikipedia: https://en.wikipedia.org/wiki/Standardization + + Normalization is the process of converting numerical data to a standard + range of values. This range is typically between [0, 1] or [-1, 1]. + The equation for normalization is x_norm = (x - x_min)/(x_max - x_min) + where x_norm is the normalized value, x is the value, x_min is the + minimum value within the column or list of data, and x_max is the + maximum value within the column or list of data. Normalization is + used to speed up the training of data and put all of the data + on a similar scale. This is useful because variance in the range of + values of a dataset can heavily impact optimization + (particularly Gradient Descent). + + Standardization is the process of converting numerical data to a normally + distributed range of values. This range will have a mean of 0 and standard + deviation of 1. This is also known as z-score normalization. The equation for + standardization is x_std = (x - mu)/(sigma) where mu is the mean of the + column or list of values and sigma is the standard deviation of the column + or list of values. 
+ + Choosing between Normalization & Standardization is more of an art than a science, + but it is often recommended to run experiments with both to see which performs + better. Additionally, a few rules of thumb are: + 1. gaussian (normal) distributions work better with standardization + 2. non-gaussian (non-normal) distributions work better with normalization + 3. If a column or list of values has extreme values / outliers, use + standardization +""" + + +def normalization(data : list) -> list: + """ + Returns a normalized list of values + @params: data, a list of values to normalize + @returns: a list of normalized values (rounded to 3 decimals) + @examples: + >>> normalization([2, 7, 10, 20, 30, 50]) + [0.0, 0.104, 0.167, 0.375, 0.583, 1.0] + + >>> normalization([5, 10, 15, 20, 25]) + [0.0, 0.25, 0.5, 0.75, 1.0] + """ + # variables for calculation + x_min = min(data) + x_max = max(data) + # normalize data + return [round((x - x_min) / (x_max - x_min), 3) for x in data] + + +def standardization(data : list) -> list: + """ + Returns a standardized list of values + @params: data, a list of values to standardize + @returns: a list of standardized values (rounded to 3 decimals) + @examples: + >>> standardization([2, 7, 10, 20, 30, 50]) + [-1.095, -0.788, -0.604, 0.01, 0.624, 1.852] + + >>> standardization([5, 10, 15, 20, 25]) + [-1.414, -0.707, 0.0, 0.707, 1.414] + """ + # variables for calculation + mu = mean(data) + sigma = stdDeviation(data) + + # standardize data + return [round((x - mu) / (sigma), 3) for x in data] + + +def mean(data : list) -> float: + """ + Helper function that returns the mean of a list of values + @params: data, a list of values + @returns: a float representing the mean (rounded to 3 decimals) + @examples: + >>> mean([2, 7, 10, 20, 30, 50]) + 19.833 + + >>> mean([5, 10, 15, 20, 25]) + 15.0 + """ + return round(sum(data) / len(data), 3) + + +def stdDeviation(data : list) -> float: + """ + Helper function that returns the standard deviation of
a list of values + @params: data, a list of values + @returns: a float representing the standard deviation (rounded to 3 decimals) + @examples: + >>> stdDeviation([2, 7, 10, 20, 30, 50]) + 16.293 + + >>> stdDeviation([5, 10, 15, 20, 25]) + 7.071 + """ + x_mean = mean(data) + sum_squared_diff = sum([(x - x_mean)**2 for x in data]) + return round(((sum_squared_diff) / len(data))**.5, 3) From f273d6a6b9986110b9060db0c4122b0029bc086b Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Thu, 9 Jul 2020 15:18:38 -0400 Subject: [PATCH 21/21] Delete word_frequency_functions.py --- machine_learning/word_frequency_functions.py | 133 ------------------- 1 file changed, 133 deletions(-) delete mode 100644 machine_learning/word_frequency_functions.py diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py deleted file mode 100644 index 250741146623..000000000000 --- a/machine_learning/word_frequency_functions.py +++ /dev/null @@ -1,133 +0,0 @@ -import string -from math import log10 - -""" - tf-idf Wikipedia: https://en.wikipedia.org/wiki/Tf%E2%80%93idf - tf-idf and other word frequency algorithms are often used - as a weighting factor in information retrieval and text - mining. 83% of text-based recommender systems use - tf-idf for term weighting. In Layman's terms, tf-idf - is a statistic intended to reflect how important a word - is to a document in a corpus (a collection of documents) - - - Here I've implemented several word frequency algorithms - that are commonly used in information retrieval: Term Frequency, - Document Frequency, and TF-IDF (Term-Frequency*Inverse-Document-Frequency) - are included. - - Term Frequency is a statistical function that - returns a number representing how frequently - an expression occurs in a document. This - indicates how significant a particular term is in - a given document.
- - Document Frequency is a statistical function that returns - an integer representing the number of documents in a - corpus that a term occurs in (where the max number returned - would be the number of documents in the corpus). - - Inverse Document Frequency is mathematically written as - log10(N/df), where N is the number of documents in your - corpus and df is the Document Frequency. If df is 0, a - ZeroDivisionError will be thrown. - - Term-Frequency*Inverse-Document-Frequency is a measure - of the originality of a term. It is mathematically written - as tf*log10(N/df). It compares the number of times - a term appears in a document with the number of documents - the term appears in. If df is 0, a ZeroDivisionError will be thrown. -""" - - -def term_frequency(term: str, document: str) -> int: - """ - Return the number of times a term occurs within - a given document. - @params: term, the term to search a document for, and document, - the document to search within - @returns: an integer representing the number of times a term is - found within the document - - @examples: - >>> term_frequency("to", "To be, or not to be") - 2 - """ - # strip all punctuation and newlines and replace it with '' - document_without_punctuation = document.translate( - str.maketrans("", "", string.punctuation) - ).replace("\n", "") - tokenize_document = document_without_punctuation.split(" ") # word tokenization - return len( - [word for word in tokenize_document if word.lower() == term.lower()] - ) - - -def document_frequency(term: str, corpus: str) -> int: - """ - Calculate the number of documents in a corpus that contain a - given term - @params : term, the term to search each document for, and corpus, a collection of - documents. Each document should be separated by a newline. 
- @returns : the number of documents in the corpus that contain the term you are - searching for and the number of documents in the corpus - @examples : - >>> document_frequency("first", "This is the first document in the corpus.\\nThIs\ -is the second document in the corpus.\\nTHIS is \ -the third document in the corpus.") - (1, 3) - """ - corpus_without_punctuation = corpus.translate( - str.maketrans("", "", string.punctuation) - ) # strip all punctuation and replace it with '' - documents = corpus_without_punctuation.split("\n") - lowercase_documents = [document.lower() for document in documents] - return len( - [document for document in lowercase_documents if term.lower() in document] - ), len(documents) - - -def inverse_document_frequency(df: int, N: int) -> float: - """ - Return an integer denoting the importance - of a word. This measure of importance is - calculated by log10(N/df), where N is the - number of documents and df is - the Document Frequency. - @params : df, the Document Frequency, and N, - the number of documents in the corpus. - @returns : log10(N/df) - @examples : - >>> inverse_document_frequency(3, 0) - Traceback (most recent call last): - ... - ValueError: log10(0) is undefined. - >>> inverse_document_frequency(1, 3) - 0.477 - >>> inverse_document_frequency(0, 3) - Traceback (most recent call last): - ... - ZeroDivisionError: df must be > 0 - """ - if df == 0: - raise ZeroDivisionError("df must be > 0") - elif N == 0: - raise ValueError("log10(0) is undefined.") - return round(log10(N / df), 3) - - -def tf_idf(tf: int, idf: int) -> float: - """ - Combine the term frequency - and inverse document frequency functions to - calculate the originality of a term. 
This - 'originality' is calculated by multiplying - the term frequency and the inverse document - frequency : tf-idf = TF * IDF - @params : tf, the term frequency, and idf, the inverse document - frequency - @examples : - >>> tf_idf(2, 0.477) - 0.954 - """ - return round(tf * idf, 3) \ No newline at end of file