diff --git a/machine_learning/data_transformations.py b/machine_learning/data_transformations.py
new file mode 100644
index 000000000000..ff3f8d53ab81
--- /dev/null
+++ b/machine_learning/data_transformations.py
@@ -0,0 +1,101 @@
+"""
+Normalization Wikipedia: https://en.wikipedia.org/wiki/Normalization
+Standardization Wikipedia: https://en.wikipedia.org/wiki/Standardization
+
+Normalization is the process of converting numerical data to a standard
+range of values, typically [0, 1] or [-1, 1]. The equation for
+normalization is x_norm = (x - x_min) / (x_max - x_min), where x_norm is
+the normalized value, x is the value, x_min is the minimum value within
+the column or list of data, and x_max is the maximum value within the
+column or list of data. Normalization is used to speed up training and to
+put all of the data on a similar scale. This is useful because variance
+in the range of values of a dataset can heavily impact optimization
+(particularly gradient descent).
+
+Standardization is the process of converting numerical data to a normally
+distributed range of values with a mean of 0 and a standard deviation of 1.
+This is also known as z-score normalization. The equation for
+standardization is x_std = (x - mu) / sigma, where mu is the mean of the
+column or list of values and sigma is the standard deviation of the column
+or list of values.
+
+Choosing between normalization and standardization is more of an art than
+a science, but it is often recommended to run experiments with both to see
+which performs better. Additionally, a few rules of thumb are:
+    1. gaussian (normal) distributions work better with standardization
+    2. non-gaussian (non-normal) distributions work better with
+       normalization
+    3. if a column or list of values has extreme values / outliers, use
+       standardization
+"""
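+
+# A quick worked illustration of the two formulas above, using the doctest
+# data below (the numbers match the rounded outputs of these functions):
+#   normalization: for [2, 7, 10, 20, 30, 50], x_min = 2 and x_max = 50,
+#   so x = 7 maps to (7 - 2) / (50 - 2) = 5 / 48 ~ 0.104.
+#   standardization: for the same list, mu = 19.833 and sigma = 16.293,
+#   so x = 2 maps to (2 - 19.833) / 16.293 ~ -1.095.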
+
+
+def normalization(data: list) -> list:
+    """
+    Returns a normalized list of values
+    @params: data, a list of values to normalize
+    @returns: a list of normalized values (rounded to 3 decimals)
+    @examples:
+    >>> normalization([2, 7, 10, 20, 30, 50])
+    [0.0, 0.104, 0.167, 0.375, 0.583, 1.0]
+
+    >>> normalization([5, 10, 15, 20, 25])
+    [0.0, 0.25, 0.5, 0.75, 1.0]
+    """
+    # variables for calculation
+    x_min = min(data)
+    x_max = max(data)
+    # normalize data
+    return [round((x - x_min) / (x_max - x_min), 3) for x in data]
+
+
+def standardization(data: list) -> list:
+    """
+    Returns a standardized list of values
+    @params: data, a list of values to standardize
+    @returns: a list of standardized values (rounded to 3 decimals)
+    @examples:
+    >>> standardization([2, 7, 10, 20, 30, 50])
+    [-1.095, -0.788, -0.604, 0.01, 0.624, 1.852]
+
+    >>> standardization([5, 10, 15, 20, 25])
+    [-1.414, -0.707, 0.0, 0.707, 1.414]
+    """
+    # variables for calculation
+    mu = mean(data)
+    sigma = standard_deviation(data)
+
+    # standardize data
+    return [round((x - mu) / sigma, 3) for x in data]
+
+
+def mean(data: list) -> float:
+    """
+    Helper function that returns the mean of a list of values
+    @params: data, a list of values
+    @returns: a float representing the mean (rounded to 3 decimals)
+    @examples:
+    >>> mean([2, 7, 10, 20, 30, 50])
+    19.833
+
+    >>> mean([5, 10, 15, 20, 25])
+    15.0
+    """
+    return round(sum(data) / len(data), 3)
+
+
+def standard_deviation(data: list) -> float:
+    """
+    Helper function that returns the population standard deviation of a
+    list of values
+    @params: data, a list of values
+    @returns: a float representing the standard deviation (rounded to 3 decimals)
+    @examples:
+    >>> standard_deviation([2, 7, 10, 20, 30, 50])
+    16.293
+
+    >>> standard_deviation([5, 10, 15, 20, 25])
+    7.071
+    """
+    x_mean = mean(data)
+    sum_squared_diff = sum((x - x_mean) ** 2 for x in data)
+    return round((sum_squared_diff / len(data)) ** 0.5, 3)
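+
+
+if __name__ == "__main__":
+    # A minimal usage sketch: run the embedded doctests when this module is
+    # executed directly, then print one normalized and one standardized list.
+    import doctest
+
+    doctest.testmod()
+    print(normalization([5, 10, 15, 20, 25]))
+    print(standardization([5, 10, 15, 20, 25]))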
diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py
deleted file mode 100644
index e9e9e644b7d8..000000000000
--- a/machine_learning/word_frequency_functions.py
+++ /dev/null
@@ -1,129 +0,0 @@
-import string
-from math import log10
-
-"""
-    tf-idf Wikipedia: https://en.wikipedia.org/wiki/Tf%E2%80%93idf
-    tf-idf and other word frequency algorithms are often used
-    as a weighting factor in information retrieval and text
-    mining. 83% of text-based recommender systems use
-    tf-idf for term weighting. In Layman's terms, tf-idf
-    is a statistic intended to reflect how important a word
-    is to a document in a corpus (a collection of documents)
-
-
-    Here I've implemented several word frequency algorithms
-    that are commonly used in information retrieval: Term Frequency,
-    Document Frequency, and TF-IDF (Term-Frequency*Inverse-Document-Frequency)
-    are included.
-
-    Term Frequency is a statistical function that
-    returns a number representing how frequently
-    an expression occurs in a document. This
-    indicates how significant a particular term is in
-    a given document.
-
-    Document Frequency is a statistical function that returns
-    an integer representing the number of documents in a
-    corpus that a term occurs in (where the max number returned
-    would be the number of documents in the corpus).
-
-    Inverse Document Frequency is mathematically written as
-    log10(N/df), where N is the number of documents in your
-    corpus and df is the Document Frequency. If df is 0, a
-    ZeroDivisionError will be thrown.
-
-    Term-Frequency*Inverse-Document-Frequency is a measure
-    of the originality of a term. It is mathematically written
-    as tf*log10(N/df). It compares the number of times
-    a term appears in a document with the number of documents
-    the term appears in. If df is 0, a ZeroDivisionError will be thrown.
-"""
-
-
-def term_frequency(term: str, document: str) -> int:
-    """
-    Return the number of times a term occurs within
-    a given document.
-    @params: term, the term to search a document for, and document,
-            the document to search within
-    @returns: an integer representing the number of times a term is
-            found within the document
-
-    @examples:
-    >>> term_frequency("to", "To be, or not to be")
-    2
-    """
-    # strip all punctuation and newlines and replace it with ''
-    document_without_punctuation = document.translate(
-        str.maketrans("", "", string.punctuation)
-    ).replace("\n", "")
-    tokenize_document = document_without_punctuation.split(" ")  # word tokenization
-    return len([word for word in tokenize_document if word.lower() == term.lower()])
-
-
-def document_frequency(term: str, corpus: str) -> int:
-    """
-    Calculate the number of documents in a corpus that contain a
-    given term
-    @params : term, the term to search each document for, and corpus, a collection of
-              documents. Each document should be separated by a newline.
-    @returns : the number of documents in the corpus that contain the term you are
-               searching for and the number of documents in the corpus
-    @examples :
-    >>> document_frequency("first", "This is the first document in the corpus.\\nThIs\
-is the second document in the corpus.\\nTHIS is \
-the third document in the corpus.")
-    (1, 3)
-    """
-    corpus_without_punctuation = corpus.lower().translate(
-        str.maketrans("", "", string.punctuation)
-    )  # strip all punctuation and replace it with ''
-    docs = corpus_without_punctuation.split("\n")
-    term = term.lower()
-    return (len([doc for doc in docs if term in doc]), len(docs))
-
-
-def inverse_document_frequency(df: int, N: int) -> float:
-    """
-    Return an integer denoting the importance
-    of a word. This measure of importance is
-    calculated by log10(N/df), where N is the
-    number of documents and df is
-    the Document Frequency.
-    @params : df, the Document Frequency, and N,
-              the number of documents in the corpus.
-    @returns : log10(N/df)
-    @examples :
-    >>> inverse_document_frequency(3, 0)
-    Traceback (most recent call last):
-        ...
-    ValueError: log10(0) is undefined.
-    >>> inverse_document_frequency(1, 3)
-    0.477
-    >>> inverse_document_frequency(0, 3)
-    Traceback (most recent call last):
-        ...
-    ZeroDivisionError: df must be > 0
-    """
-    if df == 0:
-        raise ZeroDivisionError("df must be > 0")
-    elif N == 0:
-        raise ValueError("log10(0) is undefined.")
-    return round(log10(N / df), 3)
-
-
-def tf_idf(tf: int, idf: int) -> float:
-    """
-    Combine the term frequency
-    and inverse document frequency functions to
-    calculate the originality of a term. This
-    'originality' is calculated by multiplying
-    the term frequency and the inverse document
-    frequency : tf-idf = TF * IDF
-    @params : tf, the term frequency, and idf, the inverse document
-             frequency
-    @examples :
-    >>> tf_idf(2, 0.477)
-    0.954
-    """
-    return round(tf * idf, 3)