From 5f122045c0ced182dec3d6b2370f8bf28825a2a2 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Thu, 9 Jul 2020 16:37:59 -0400 Subject: [PATCH 1/2] Added Standardization and Normalization algorithms with built-in stats --- machine_learning/data_transformations.py | 70 ++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 machine_learning/data_transformations.py diff --git a/machine_learning/data_transformations.py b/machine_learning/data_transformations.py new file mode 100644 index 000000000000..7a4c94657902 --- /dev/null +++ b/machine_learning/data_transformations.py @@ -0,0 +1,70 @@ +from statistics import mean, stdev +""" + Normalization Wikipedia: https://en.wikipedia.org/wiki/Normalization + Standardization Wikipedia: https://en.wikipedia.org/wiki/Standardization + + Normalization is the process of converting numerical data to a standard + range of values. This range is typically between [0, 1] or [-1, 1]. + The equation for normalization is x_norm = (x - x_min)/(x_max - x_min) + where x_norm is the normalized value, x is the value, x_min is the + minimum value within the column or list of data, and x_max is the + maximum value within the column or list of data. Normalization is + used to speed up the training of data and put all of the data + on a similar scale. This is useful because variance in the range of + values of a dataset can heavily impact optimization + (particularly Gradient Descent). + + Standardization is the process of converting numerical data to a normally + distributed range of values. This range will have a mean of 0 and standard + deviation of 1. This is also known as z-score normalization. The equation for + standardization is x_std = (x - mu)/(sigma) where mu is the mean of the + column or list of values and sigma is the standard deviation of the column + or list of values. 
+ + Choosing between Normalization & Standardization is more of an art of a science, + but it is often recommended to run experiments with both to see which performs + better. Additionally, a few rules of thumb are: + 1. gaussian (normal) distributions work better with standardization + 2. non-gaussian (non-normal) distributions work better with normalization + 3. If a column or list of values has extreme values / outliers, use + standardization +""" + + +def normalization(data : list) -> list: + """ + Returns a normalized list of values + @params: data, a list of values to normalize + @returns: a list of normalized values (rounded to 3 decimals) + @examples: + >>> normalization([2, 7, 10, 20, 30, 50]) + [0.0, 0.104, 0.167, 0.375, 0.583, 1.0] + + >>> normalization([5, 10, 15, 20, 25]) + [0.0, 0.25, 0.5, 0.75, 1.0] + """ + # variables for calculation + x_min = min(data) + x_max = max(data) + # normalize data + return [round((x - x_min) / (x_max - x_min), 3) for x in data] + + +def standardization(data : list) -> list: + """ + Returns a standardized list of values + @params: data, a list of values to standardize + @returns: a list of standardized values (rounded to 3 decimals) + @examples: + >>> standardization([2, 7, 10, 20, 30, 50]) + [-0.999, -0.719, -0.551, 0.009, 0.57, 1.69] + + >>> standardization([5, 10, 15, 20, 25]) + [-1.265, -0.632, 0.0, 0.632, 1.265] + """ + # variables for calculation + mu = mean(data) + sigma = stdev(data) + + # standardize data + return [round((x - mu) / (sigma), 3) for x in data] From 94a8533807159aeb046616ab9269b90edbcb1eb3 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Fri, 10 Jul 2020 15:17:27 +0200 Subject: [PATCH 2/2] Implement ndigits for rounding --- machine_learning/data_transformations.py | 68 +++++++++++------------- 1 file changed, 30 insertions(+), 38 deletions(-) diff --git a/machine_learning/data_transformations.py b/machine_learning/data_transformations.py index 7a4c94657902..9e0d747e93fa 100644 --- 
a/machine_learning/data_transformations.py +++ b/machine_learning/data_transformations.py @@ -1,45 +1,39 @@ -from statistics import mean, stdev """ - Normalization Wikipedia: https://en.wikipedia.org/wiki/Normalization - Standardization Wikipedia: https://en.wikipedia.org/wiki/Standardization - - Normalization is the process of converting numerical data to a standard - range of values. This range is typically between [0, 1] or [-1, 1]. - The equation for normalization is x_norm = (x - x_min)/(x_max - x_min) - where x_norm is the normalized value, x is the value, x_min is the - minimum value within the column or list of data, and x_max is the - maximum value within the column or list of data. Normalization is - used to speed up the training of data and put all of the data - on a similar scale. This is useful because variance in the range of - values of a dataset can heavily impact optimization - (particularly Gradient Descent). - - Standardization is the process of converting numerical data to a normally - distributed range of values. This range will have a mean of 0 and standard - deviation of 1. This is also known as z-score normalization. The equation for - standardization is x_std = (x - mu)/(sigma) where mu is the mean of the - column or list of values and sigma is the standard deviation of the column - or list of values. - - Choosing between Normalization & Standardization is more of an art of a science, - but it is often recommended to run experiments with both to see which performs - better. Additionally, a few rules of thumb are: - 1. gaussian (normal) distributions work better with standardization - 2. non-gaussian (non-normal) distributions work better with normalization - 3. If a column or list of values has extreme values / outliers, use - standardization +Normalization Wikipedia: https://en.wikipedia.org/wiki/Normalization +Normalization is the process of converting numerical data to a standard range of values. 
+This range is typically between [0, 1] or [-1, 1]. The equation for normalization is +x_norm = (x - x_min)/(x_max - x_min) where x_norm is the normalized value, x is the +value, x_min is the minimum value within the column or list of data, and x_max is the +maximum value within the column or list of data. Normalization is used to speed up the +training of data and put all of the data on a similar scale. This is useful because +variance in the range of values of a dataset can heavily impact optimization +(particularly Gradient Descent). + +Standardization Wikipedia: https://en.wikipedia.org/wiki/Standardization +Standardization is the process of converting numerical data to a normally distributed +range of values. This range will have a mean of 0 and standard deviation of 1. This is +also known as z-score normalization. The equation for standardization is +x_std = (x - mu)/(sigma) where mu is the mean of the column or list of values and sigma +is the standard deviation of the column or list of values. + +Choosing between Normalization & Standardization is more of an art than a science, but it +is often recommended to run experiments with both to see which performs better. +Additionally, a few rules of thumb are: +    1. gaussian (normal) distributions work better with standardization +    2. non-gaussian (non-normal) distributions work better with normalization +    3. 
If a column or list of values has extreme values / outliers, use standardization """ +from statistics import mean, stdev -def normalization(data : list) -> list: +def normalization(data: list, ndigits: int = 3) -> list: """ Returns a normalized list of values @params: data, a list of values to normalize - @returns: a list of normalized values (rounded to 3 decimals) + @returns: a list of normalized values (rounded to ndigits decimal places) @examples: >>> normalization([2, 7, 10, 20, 30, 50]) [0.0, 0.104, 0.167, 0.375, 0.583, 1.0] - >>> normalization([5, 10, 15, 20, 25]) [0.0, 0.25, 0.5, 0.75, 1.0] """ @@ -47,24 +41,22 @@ def normalization(data : list) -> list: x_min = min(data) x_max = max(data) # normalize data - return [round((x - x_min) / (x_max - x_min), 3) for x in data] + return [round((x - x_min) / (x_max - x_min), ndigits) for x in data] -def standardization(data : list) -> list: +def standardization(data: list, ndigits: int = 3) -> list: """ Returns a standardized list of values @params: data, a list of values to standardize - @returns: a list of standardized values (rounded to 3 decimals) + @returns: a list of standardized values (rounded to ndigits decimal places) @examples: >>> standardization([2, 7, 10, 20, 30, 50]) [-0.999, -0.719, -0.551, 0.009, 0.57, 1.69] - >>> standardization([5, 10, 15, 20, 25]) [-1.265, -0.632, 0.0, 0.632, 1.265] """ # variables for calculation mu = mean(data) sigma = stdev(data) - # standardize data - return [round((x - mu) / (sigma), 3) for x in data] + return [round((x - mu) / (sigma), ndigits) for x in data]