diff --git a/machine_learning/data_transformations.py b/machine_learning/data_transformations.py
new file mode 100644
index 000000000000..ff3f8d53ab81
--- /dev/null
+++ b/machine_learning/data_transformations.py
@@ -0,0 +1,101 @@
+"""
+Normalization Wikipedia: https://en.wikipedia.org/wiki/Normalization
+Standardization Wikipedia: https://en.wikipedia.org/wiki/Standardization
+
+Normalization is the process of converting numerical data to a standard
+range of values, typically [0, 1] or [-1, 1]. The equation for
+normalization is x_norm = (x - x_min) / (x_max - x_min), where x_norm is
+the normalized value, x is the value, x_min is the minimum value within
+the column or list of data, and x_max is the maximum value within the
+column or list of data. Normalization is used to speed up training and to
+put all of the data on a similar scale. This is useful because variance
+in the range of values of a dataset can heavily impact optimization
+(particularly gradient descent).
+
+Standardization is the process of converting numerical data to a normally
+distributed range of values with a mean of 0 and a standard deviation of 1.
+This is also known as z-score normalization. The equation for
+standardization is x_std = (x - mu) / sigma, where mu is the mean of the
+column or list of values and sigma is the standard deviation of the column
+or list of values.
+
+Choosing between normalization and standardization is more of an art than
+a science, but it is often recommended to run experiments with both to see
+which performs better. Additionally, a few rules of thumb are:
+    1. gaussian (normal) distributions work better with standardization
+    2. non-gaussian (non-normal) distributions work better with
+       normalization
+    3. if a column or list of values has extreme values / outliers, use
+       standardization
+"""
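+
+# A quick worked illustration of the two formulas above, using the doctest
+# data below (the numbers match the rounded outputs of these functions):
+#   normalization: for [2, 7, 10, 20, 30, 50], x_min = 2 and x_max = 50,
+#   so x = 7 maps to (7 - 2) / (50 - 2) = 5 / 48 ~ 0.104.
+#   standardization: for the same list, mu = 19.833 and sigma = 16.293,
+#   so x = 2 maps to (2 - 19.833) / 16.293 ~ -1.095.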
+
+
+def normalization(data: list) -> list:
+    """
+    Returns a normalized list of values
+    @params: data, a list of values to normalize
+    @returns: a list of normalized values (rounded to 3 decimals)
+    @examples:
+    >>> normalization([2, 7, 10, 20, 30, 50])
+    [0.0, 0.104, 0.167, 0.375, 0.583, 1.0]
+
+    >>> normalization([5, 10, 15, 20, 25])
+    [0.0, 0.25, 0.5, 0.75, 1.0]
+    """
+    # variables for calculation
+    x_min = min(data)
+    x_max = max(data)
+    # normalize data
+    return [round((x - x_min) / (x_max - x_min), 3) for x in data]
+
+
+def standardization(data: list) -> list:
+    """
+    Returns a standardized list of values
+    @params: data, a list of values to standardize
+    @returns: a list of standardized values (rounded to 3 decimals)
+    @examples:
+    >>> standardization([2, 7, 10, 20, 30, 50])
+    [-1.095, -0.788, -0.604, 0.01, 0.624, 1.852]
+
+    >>> standardization([5, 10, 15, 20, 25])
+    [-1.414, -0.707, 0.0, 0.707, 1.414]
+    """
+    # variables for calculation
+    mu = mean(data)
+    sigma = standard_deviation(data)
+
+    # standardize data
+    return [round((x - mu) / sigma, 3) for x in data]
+
+
+def mean(data: list) -> float:
+    """
+    Helper function that returns the mean of a list of values
+    @params: data, a list of values
+    @returns: a float representing the mean (rounded to 3 decimals)
+    @examples:
+    >>> mean([2, 7, 10, 20, 30, 50])
+    19.833
+
+    >>> mean([5, 10, 15, 20, 25])
+    15.0
+    """
+    return round(sum(data) / len(data), 3)
+
+
+def standard_deviation(data: list) -> float:
+    """
+    Helper function that returns the population standard deviation of a
+    list of values
+    @params: data, a list of values
+    @returns: a float representing the standard deviation (rounded to 3 decimals)
+    @examples:
+    >>> standard_deviation([2, 7, 10, 20, 30, 50])
+    16.293
+
+    >>> standard_deviation([5, 10, 15, 20, 25])
+    7.071
+    """
+    x_mean = mean(data)
+    sum_squared_diff = sum((x - x_mean) ** 2 for x in data)
+    return round((sum_squared_diff / len(data)) ** 0.5, 3)
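+
+
+if __name__ == "__main__":
+    # A minimal usage sketch: run the embedded doctests when this module is
+    # executed directly, then print one normalized and one standardized list.
+    import doctest
+
+    doctest.testmod()
+    print(normalization([5, 10, 15, 20, 25]))
+    print(standardization([5, 10, 15, 20, 25]))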
diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py
deleted file mode 100644
index e9e9e644b7d8..000000000000
--- a/machine_learning/word_frequency_functions.py
+++ /dev/null
@@ -1,129 +0,0 @@
-import string
-from math import log10
-
-"""
-    tf-idf Wikipedia: https://en.wikipedia.org/wiki/Tf%E2%80%93idf
-    tf-idf and other word frequency algorithms are often used
-    as a weighting factor in information retrieval and text
-    mining. 83% of text-based recommender systems use
-    tf-idf for term weighting. In Layman's terms, tf-idf
-    is a statistic intended to reflect how important a word
-    is to a document in a corpus (a collection of documents)
-
-
-    Here I've implemented several word frequency algorithms
-    that are commonly used in information retrieval: Term Frequency,
-    Document Frequency, and TF-IDF (Term-Frequency*Inverse-Document-Frequency)
-    are included.
-
-    Term Frequency is a statistical function that
-    returns a number representing how frequently
-    an expression occurs in a document. This
-    indicates how significant a particular term is in
-    a given document.
-
-    Document Frequency is a statistical function that returns
-    an integer representing the number of documents in a
-    corpus that a term occurs in (where the max number returned
-    would be the number of documents in the corpus).
-
-    Inverse Document Frequency is mathematically written as
-    log10(N/df), where N is the number of documents in your
-    corpus and df is the Document Frequency. If df is 0, a
-    ZeroDivisionError will be thrown.
-
-    Term-Frequency*Inverse-Document-Frequency is a measure
-    of the originality of a term. It is mathematically written
-    as tf*log10(N/df). It compares the number of times
-    a term appears in a document with the number of documents
-    the term appears in. If df is 0, a ZeroDivisionError will be thrown.
-"""
-
-
-def term_frequency(term: str, document: str) -> int:
-    """
-    Return the number of times a term occurs within
-    a given document.
-    @params: term, the term to search a document for, and document,
-            the document to search within
-    @returns: an integer representing the number of times a term is
-            found within the document
-
-    @examples:
-    >>> term_frequency("to", "To be, or not to be")
-    2
-    """
-    # strip all punctuation and newlines and replace it with ''
-    document_without_punctuation = document.translate(
-        str.maketrans("", "", string.punctuation)
-    ).replace("\n", "")
-    tokenize_document = document_without_punctuation.split(" ")  # word tokenization
-    return len([word for word in tokenize_document if word.lower() == term.lower()])
-
-
-def document_frequency(term: str, corpus: str) -> int:
-    """
-    Calculate the number of documents in a corpus that contain a
-    given term
-    @params : term, the term to search each document for, and corpus, a collection of
-              documents. Each document should be separated by a newline.
-    @returns : the number of documents in the corpus that contain the term you are
-               searching for and the number of documents in the corpus
-    @examples :
-    >>> document_frequency("first", "This is the first document in the corpus.\\nThIs\
-is the second document in the corpus.\\nTHIS is \
-the third document in the corpus.")
-    (1, 3)
-    """
-    corpus_without_punctuation = corpus.lower().translate(
-        str.maketrans("", "", string.punctuation)
-    )  # strip all punctuation and replace it with ''
-    docs = corpus_without_punctuation.split("\n")
-    term = term.lower()
-    return (len([doc for doc in docs if term in doc]), len(docs))
-
-
-def inverse_document_frequency(df: int, N: int) -> float:
-    """
-    Return an integer denoting the importance
-    of a word. This measure of importance is
-    calculated by log10(N/df), where N is the
-    number of documents and df is
-    the Document Frequency.
-    @params : df, the Document Frequency, and N,
-              the number of documents in the corpus.
-    @returns : log10(N/df)
-    @examples :
-    >>> inverse_document_frequency(3, 0)
-    Traceback (most recent call last):
-        ...
-    ValueError: log10(0) is undefined.
-    >>> inverse_document_frequency(1, 3)
-    0.477
-    >>> inverse_document_frequency(0, 3)
-    Traceback (most recent call last):
-        ...
-    ZeroDivisionError: df must be > 0
-    """
-    if df == 0:
-        raise ZeroDivisionError("df must be > 0")
-    elif N == 0:
-        raise ValueError("log10(0) is undefined.")
-    return round(log10(N / df), 3)
-
-
-def tf_idf(tf: int, idf: int) -> float:
-    """
-    Combine the term frequency
-    and inverse document frequency functions to
-    calculate the originality of a term. This
-    'originality' is calculated by multiplying
-    the term frequency and the inverse document
-    frequency : tf-idf = TF * IDF
-    @params : tf, the term frequency, and idf, the inverse document
-             frequency
-    @examples :
-    >>> tf_idf(2, 0.477)
-    0.954
-    """
-    return round(tf * idf, 3)