From 5f122045c0ced182dec3d6b2370f8bf28825a2a2 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Thu, 9 Jul 2020 16:37:59 -0400 Subject: [PATCH 1/2] Added Standardization and Normalization algorithms with built-in stats --- machine_learning/data_transformations.py | 70 ++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 machine_learning/data_transformations.py diff --git a/machine_learning/data_transformations.py b/machine_learning/data_transformations.py new file mode 100644 index 000000000000..7a4c94657902 --- /dev/null +++ b/machine_learning/data_transformations.py @@ -0,0 +1,70 @@ +from statistics import mean, stdev +""" + Normalization Wikipedia: https://en.wikipedia.org/wiki/Normalization + Standardization Wikipedia: https://en.wikipedia.org/wiki/Standardization + + Normalization is the process of converting numerical data to a standard + range of values. This range is typically between [0, 1] or [-1, 1]. + The equation for normalization is x_norm = (x - x_min)/(x_max - x_min) + where x_norm is the normalized value, x is the value, x_min is the + minimum value within the column or list of data, and x_max is the + maximum value within the column or list of data. Normalization is + used to speed up the training of data and put all of the data + on a similar scale. This is useful because variance in the range of + values of a dataset can heavily impact optimization + (particularly Gradient Descent). + + Standardization is the process of converting numerical data to a normally + distributed range of values. This range will have a mean of 0 and standard + deviation of 1. This is also known as z-score normalization. The equation for + standardization is x_std = (x - mu)/(sigma) where mu is the mean of the + column or list of values and sigma is the standard deviation of the column + or list of values. 
+ + Choosing between Normalization & Standardization is more of an art of a science, + but it is often recommended to run experiments with both to see which performs + better. Additionally, a few rules of thumb are: + 1. gaussian (normal) distributions work better with standardization + 2. non-gaussian (non-normal) distributions work better with normalization + 3. If a column or list of values has extreme values / outliers, use + standardization +""" + + +def normalization(data : list) -> list: + """ + Returns a normalized list of values + @params: data, a list of values to normalize + @returns: a list of normalized values (rounded to 3 decimals) + @examples: + >>> normalization([2, 7, 10, 20, 30, 50]) + [0.0, 0.104, 0.167, 0.375, 0.583, 1.0] + + >>> normalization([5, 10, 15, 20, 25]) + [0.0, 0.25, 0.5, 0.75, 1.0] + """ + # variables for calculation + x_min = min(data) + x_max = max(data) + # normalize data + return [round((x - x_min) / (x_max - x_min), 3) for x in data] + + +def standardization(data : list) -> list: + """ + Returns a standardized list of values + @params: data, a list of values to standardize + @returns: a list of standardized values (rounded to 3 decimals) + @examples: + >>> standardization([2, 7, 10, 20, 30, 50]) + [-0.999, -0.719, -0.551, 0.009, 0.57, 1.69] + + >>> standardization([5, 10, 15, 20, 25]) + [-1.265, -0.632, 0.0, 0.632, 1.265] + """ + # variables for calculation + mu = mean(data) + sigma = stdev(data) + + # standardize data + return [round((x - mu) / (sigma), 3) for x in data] From 94a8533807159aeb046616ab9269b90edbcb1eb3 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Fri, 10 Jul 2020 15:17:27 +0200 Subject: [PATCH 2/2] Implement ndigits for rounding --- machine_learning/data_transformations.py | 68 +++++++++++------------- 1 file changed, 30 insertions(+), 38 deletions(-) diff --git a/machine_learning/data_transformations.py b/machine_learning/data_transformations.py index 7a4c94657902..9e0d747e93fa 100644 --- 
a/machine_learning/data_transformations.py +++ b/machine_learning/data_transformations.py @@ -1,45 +1,39 @@ -from statistics import mean, stdev """ - Normalization Wikipedia: https://en.wikipedia.org/wiki/Normalization - Standardization Wikipedia: https://en.wikipedia.org/wiki/Standardization - - Normalization is the process of converting numerical data to a standard - range of values. This range is typically between [0, 1] or [-1, 1]. - The equation for normalization is x_norm = (x - x_min)/(x_max - x_min) - where x_norm is the normalized value, x is the value, x_min is the - minimum value within the column or list of data, and x_max is the - maximum value within the column or list of data. Normalization is - used to speed up the training of data and put all of the data - on a similar scale. This is useful because variance in the range of - values of a dataset can heavily impact optimization - (particularly Gradient Descent). - - Standardization is the process of converting numerical data to a normally - distributed range of values. This range will have a mean of 0 and standard - deviation of 1. This is also known as z-score normalization. The equation for - standardization is x_std = (x - mu)/(sigma) where mu is the mean of the - column or list of values and sigma is the standard deviation of the column - or list of values. - - Choosing between Normalization & Standardization is more of an art of a science, - but it is often recommended to run experiments with both to see which performs - better. Additionally, a few rules of thumb are: - 1. gaussian (normal) distributions work better with standardization - 2. non-gaussian (non-normal) distributions work better with normalization - 3. If a column or list of values has extreme values / outliers, use - standardization +Normalization Wikipedia: https://en.wikipedia.org/wiki/Normalization +Normalization is the process of converting numerical data to a standard range of values. 
+This range is typically between [0, 1] or [-1, 1]. The equation for normalization is +x_norm = (x - x_min)/(x_max - x_min) where x_norm is the normalized value, x is the +value, x_min is the minimum value within the column or list of data, and x_max is the +maximum value within the column or list of data. Normalization is used to speed up the +training of data and put all of the data on a similar scale. This is useful because +variance in the range of values of a dataset can heavily impact optimization +(particularly Gradient Descent). + +Standardization Wikipedia: https://en.wikipedia.org/wiki/Standardization +Standardization is the process of converting numerical data to a normally distributed +range of values. This range will have a mean of 0 and standard deviation of 1. This is +also known as z-score normalization. The equation for standardization is +x_std = (x - mu)/(sigma) where mu is the mean of the column or list of values and sigma +is the standard deviation of the column or list of values. + +Choosing between Normalization & Standardization is more of an art than a science, but it +is often recommended to run experiments with both to see which performs better. +Additionally, a few rules of thumb are: +    1. gaussian (normal) distributions work better with standardization +    2. non-gaussian (non-normal) distributions work better with normalization +    3. 
If a column or list of values has extreme values / outliers, use standardization """ +from statistics import mean, stdev -def normalization(data : list) -> list: +def normalization(data: list, ndigits: int = 3) -> list: """ Returns a normalized list of values @params: data, a list of values to normalize - @returns: a list of normalized values (rounded to 3 decimals) + @returns: a list of normalized values (rounded to ndigits decimal places) @examples: >>> normalization([2, 7, 10, 20, 30, 50]) [0.0, 0.104, 0.167, 0.375, 0.583, 1.0] - >>> normalization([5, 10, 15, 20, 25]) [0.0, 0.25, 0.5, 0.75, 1.0] """ @@ -47,24 +41,22 @@ def normalization(data : list) -> list: x_min = min(data) x_max = max(data) # normalize data - return [round((x - x_min) / (x_max - x_min), 3) for x in data] + return [round((x - x_min) / (x_max - x_min), ndigits) for x in data] -def standardization(data : list) -> list: +def standardization(data: list, ndigits: int = 3) -> list: """ Returns a standardized list of values @params: data, a list of values to standardize - @returns: a list of standardized values (rounded to 3 decimals) + @returns: a list of standardized values (rounded to ndigits decimal places) @examples: >>> standardization([2, 7, 10, 20, 30, 50]) [-0.999, -0.719, -0.551, 0.009, 0.57, 1.69] - >>> standardization([5, 10, 15, 20, 25]) [-1.265, -0.632, 0.0, 0.632, 1.265] """ # variables for calculation mu = mean(data) sigma = stdev(data) - # standardize data - return [round((x - mu) / (sigma), 3) for x in data] + return [round((x - mu) / (sigma), ndigits) for x in data]