From 40a65035da1cf824b98130fca7f5a265a2a1e350 Mon Sep 17 00:00:00 2001 From: SteveKimSR Date: Thu, 5 Nov 2020 18:51:46 +0900 Subject: [PATCH 1/8] add similarity_search.py in machine_learning adding similarity_search algorithm in machine_learning --- machine_learning/similarity_search.py | 109 ++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 machine_learning/similarity_search.py diff --git a/machine_learning/similarity_search.py b/machine_learning/similarity_search.py new file mode 100644 index 000000000000..db28e3090a53 --- /dev/null +++ b/machine_learning/similarity_search.py @@ -0,0 +1,109 @@ +""" +Simularity search is a search algorithm for finding the nearest vector from +vectors, used in natural language processing. +In this algorithm, it calculates distance with euclidean distance and +returns a list containing two data for each vector: + 1. the nearest vector + 2. distance between the vector and the nearest vector +""" +import numpy as np +import math + + +def euclidean(input_a, input_b): + """ + Calculates euclidean distance between two data. The result should be float. + >>> euclidean(0, 1) + 1.0 + >>> euclidean(np.array([0, 1]),np.array([1, 1])) + 1.0 + >>> euclidean(np.array([0, 0, 0]), np.array([0, 0, 1])) + 1.0 + """ + dist = 0 + + if type(input_a) == type(input_b): + if type(input_a) != np.ndarray: + dist = pow(input_a - input_b, 2) + else: + for index in range(len(input_a)): + dist += pow(input_a[index] - input_b[index], 2) + return math.sqrt(dist) + return None + + +def similarity_search(dataset: np, value: np) -> list: + """ + :param dataset: Set containing the vectors. + :param value: vector/vectors we want to know the nearest vector from dataset. + Result will be a list containing 1. the nearest vector, 2. distance from the vector + >>> a = np.array([0, 1, 2]) + >>> b = np.array([0]) + >>> similarity_search(a, b) + [[0, 0.0]] + + >>> a = np.array([[0, 0], [1, 1], [2, 2]]) + >>> b = np.array([[0, 1]]) + >>> similarity_search(a, b) + [[[0, 0], 1.0]] + + >>> a = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]]) + >>> b = np.array([[0, 0, 1]]) + >>> similarity_search(a, b) + [[[0, 0, 0], 1.0]] + >>> a = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]]) + >>> b = np.array([[0, 0, 0], [0, 0, 1]]) + >>> similarity_search(a, b) + [[[0, 0, 0], 0.0], [[0, 0, 0], 1.0]] + """ + + if dataset.ndim != value.ndim: + raise TypeError( + "Wrong input data's dimensions... dataset : ", + dataset.ndim, + ", value : ", + value.ndim, + ) + + try: + if dataset.shape[1] != value.shape[1]: + raise TypeError( + "Wrong input data's shape... dataset : ", + dataset.shape[1], + ", value : ", + value.shape[1], + ) + except IndexError: + if (dataset.ndim == value.ndim) != 1: + raise TypeError("Wrong type") + + if dataset.dtype != value.dtype: + raise TypeError( + "Input datas have different datatype... dataset : ", + dataset.dtype, + ", value : ", + value.dtype, + ) + + answer = [] + + for index in range(len(value)): + dist = euclidean(value[index], dataset[0]) + vector = dataset[0].tolist() + + for index2 in range(1, len(dataset)): + temp_dist = euclidean(value[index], dataset[index2]) + + if dist > temp_dist: + dist = temp_dist + vector = dataset[index2].tolist() + + answer.append([vector, dist]) + + return answer + + +if __name__ == "__main__": + import doctest + + doctest.testmod() From 09caa3b580f08403b475396aeddcf818117a320d Mon Sep 17 00:00:00 2001 From: SteveKimSR Date: Fri, 6 Nov 2020 11:05:52 +0900 Subject: [PATCH 2/8] fix pre-commit test, apply feedback isort, codespell changed. applied feedback(np -> np.ndarray) --- machine_learning/similarity_search.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/machine_learning/similarity_search.py b/machine_learning/similarity_search.py index db28e3090a53..9db964e9c924 100644 --- a/machine_learning/similarity_search.py +++ b/machine_learning/similarity_search.py @@ -6,16 +6,17 @@ 1. the nearest vector 2. distance between the vector and the nearest vector """ -import numpy as np import math +import numpy as np + def euclidean(input_a, input_b): """ Calculates euclidean distance between two data. The result should be float. >>> euclidean(0, 1) 1.0 - >>> euclidean(np.array([0, 1]),np.array([1, 1])) + >>> euclidean(np.array([0, 1]), np.array([1, 1])) 1.0 >>> euclidean(np.array([0, 0, 0]), np.array([0, 0, 1])) 1.0 @@ -32,7 +33,7 @@ def euclidean(input_a, input_b): return None -def similarity_search(dataset: np, value: np) -> list: +def similarity_search(dataset: np.ndarray, value: np.ndarray) -> list: """ :param dataset: Set containing the vectors. :param value: vector/vectors we want to know the nearest vector from dataset. @@ -79,7 +80,7 @@ def similarity_search(dataset: np, value: np) -> list: if dataset.dtype != value.dtype: raise TypeError( - "Input datas have different datatype... dataset : ", + "Input data have different datatype... dataset : ", dataset.dtype, ", value : ", value.dtype, From 7ce2cce7b272270a24875f7004518ab1587153b1 Mon Sep 17 00:00:00 2001 From: SteveKimSR Date: Fri, 6 Nov 2020 12:22:01 +0900 Subject: [PATCH 3/8] apply feedback add type hints to euclidean method --- machine_learning/similarity_search.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/machine_learning/similarity_search.py b/machine_learning/similarity_search.py index 9db964e9c924..da10f3540509 100644 --- a/machine_learning/similarity_search.py +++ b/machine_learning/similarity_search.py @@ -7,11 +7,14 @@ 2. distance between the vector and the nearest vector """ import math +from typing import Union import numpy as np +InputVal = Union[int, float, np.ndarray] -def euclidean(input_a, input_b): + +def euclidean(input_a: InputVal, input_b: InputVal): """ Calculates euclidean distance between two data. The result should be float. >>> euclidean(0, 1) From f38fb3e415142fd03c02027c89c7308363816ce9 Mon Sep 17 00:00:00 2001 From: SteveKimSR Date: Sat, 7 Nov 2020 11:36:40 +0900 Subject: [PATCH 4/8] apply feedback - changed euclidean's type hints - changed few TypeError to ValueError - changed range(len()) to enumerate() - changed error's strings to f-string - implemented without type() - add euclidean's explanation --- machine_learning/similarity_search.py | 72 +++++++++++++-------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/machine_learning/similarity_search.py b/machine_learning/similarity_search.py index da10f3540509..49bd365573b9 100644 --- a/machine_learning/similarity_search.py +++ b/machine_learning/similarity_search.py @@ -1,50 +1,55 @@ """ -Simularity search is a search algorithm for finding the nearest vector from +Similarity Search : https://en.wikipedia.org/wiki/Similarity_search +Similarity search is a search algorithm for finding the nearest vector from vectors, used in natural language processing. In this algorithm, it calculates distance with euclidean distance and returns a list containing two data for each vector: 1. the nearest vector - 2. distance between the vector and the nearest vector + 2. distance between the vector and the nearest vector (float) """ import math -from typing import Union import numpy as np -InputVal = Union[int, float, np.ndarray] - -def euclidean(input_a: InputVal, input_b: InputVal): +def euclidean(input_a: np.ndarray, input_b: np.ndarray): """ - Calculates euclidean distance between two data. The result should be float. - >>> euclidean(0, 1) + Calculates euclidean distance between two data. + :param input_a: ndarray of first vector. + :param input_b: ndarray of second vector. + :return: Euclidean distance of input_a and input_b. By using math.sqrt(), + result will be float. + + >>> euclidean(np.array([0]), np.array([1])) 1.0 >>> euclidean(np.array([0, 1]), np.array([1, 1])) 1.0 >>> euclidean(np.array([0, 0, 0]), np.array([0, 0, 1])) 1.0 """ + dist = 0 - if type(input_a) == type(input_b): - if type(input_a) != np.ndarray: - dist = pow(input_a - input_b, 2) - else: - for index in range(len(input_a)): - dist += pow(input_a[index] - input_b[index], 2) + try: + for index, v in enumerate(input_a): + dist += pow(input_a[index] - input_b[index], 2) return math.sqrt(dist) - return None + except TypeError: + raise TypeError("Euclidean's input types are not right ...") def similarity_search(dataset: np.ndarray, value: np.ndarray) -> list: """ - :param dataset: Set containing the vectors. + :param dataset: Set containing the vectors. Should be ndarray. :param value: vector/vectors we want to know the nearest vector from dataset. - Result will be a list containing 1. the nearest vector, 2. distance from the vector - >>> a = np.array([0, 1, 2]) - >>> b = np.array([0]) + :return: Result will be a list containing + 1. the nearest vector + 2. distance from the vector + + >>> a = np.array([[0], [1], [2]]) + >>> b = np.array([[0]]) >>> similarity_search(a, b) - [[0, 0.0]] + [[[0], 0.0]] >>> a = np.array([[0, 0], [1, 1], [2, 2]]) >>> b = np.array([[0, 1]]) @@ -55,6 +60,7 @@ def similarity_search(dataset: np.ndarray, value: np.ndarray) -> list: >>> b = np.array([[0, 0, 1]]) >>> similarity_search(a, b) [[[0, 0, 0], 1.0]] + >>> a = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]]) >>> b = np.array([[0, 0, 0], [0, 0, 1]]) >>> similarity_search(a, b) @@ -62,36 +68,30 @@ def similarity_search(dataset: np.ndarray, value: np.ndarray) -> list: """ if dataset.ndim != value.ndim: - raise TypeError( - "Wrong input data's dimensions... dataset : ", - dataset.ndim, - ", value : ", - value.ndim, + raise ValueError( + f"Wrong input data's dimensions... dataset : {dataset.ndim}, " + f"value: {value.ndim}" ) try: if dataset.shape[1] != value.shape[1]: - raise TypeError( - "Wrong input data's shape... dataset : ", - dataset.shape[1], - ", value : ", - value.shape[1], + raise ValueError( + f"Wrong input data's shape... dataset : {dataset.shape[1]}, " + f"value : {value.shape[1]}" ) except IndexError: - if (dataset.ndim == value.ndim) != 1: + if dataset.ndim != value.ndim: raise TypeError("Wrong type") if dataset.dtype != value.dtype: raise TypeError( - "Input data have different datatype... dataset : ", - dataset.dtype, - ", value : ", - value.dtype, + f"Input data have different datatype... dataset : {dataset.dtype}, " + f"value : {value.dtype}" ) answer = [] - for index in range(len(value)): + for index, v in enumerate(value): dist = euclidean(value[index], dataset[0]) vector = dataset[0].tolist() From ebfe05a7d60d025d9176926f8cc38fbf386a8f90 Mon Sep 17 00:00:00 2001 From: SteveKimSR Date: Fri, 13 Nov 2020 18:31:54 +0900 Subject: [PATCH 5/8] apply feedback - deleted try/catch in euclidean - added error tests - name change(value -> value_array) --- machine_learning/similarity_search.py | 96 +++++++++++++++++---------- 1 file changed, 62 insertions(+), 34 deletions(-) diff --git a/machine_learning/similarity_search.py b/machine_learning/similarity_search.py index 49bd365573b9..c5627ffd827f 100644 --- a/machine_learning/similarity_search.py +++ b/machine_learning/similarity_search.py @@ -12,7 +12,7 @@ import numpy as np -def euclidean(input_a: np.ndarray, input_b: np.ndarray): +def euclidean(input_a: np.ndarray, input_b: np.ndarray) -> float: """ Calculates euclidean distance between two data. :param input_a: ndarray of first vector. @@ -30,77 +30,105 @@ def euclidean(input_a: np.ndarray, input_b: np.ndarray): dist = 0 - try: - for index, v in enumerate(input_a): - dist += pow(input_a[index] - input_b[index], 2) - return math.sqrt(dist) - except TypeError: - raise TypeError("Euclidean's input types are not right ...") + for a, b in zip(input_a, input_b): + dist += pow(a - b, 2) + return math.sqrt(dist) -def similarity_search(dataset: np.ndarray, value: np.ndarray) -> list: +def similarity_search(dataset: np.ndarray, value_array: np.ndarray) -> list: """ :param dataset: Set containing the vectors. Should be ndarray. - :param value: vector/vectors we want to know the nearest vector from dataset. + :param value_array: vector/vectors we want to know the nearest vector from dataset. :return: Result will be a list containing 1. the nearest vector 2. distance from the vector - >>> a = np.array([[0], [1], [2]]) - >>> b = np.array([[0]]) - >>> similarity_search(a, b) + >>> dataset = np.array([[0], [1], [2]]) + >>> value_array = np.array([[0]]) + >>> similarity_search(dataset, value_array) [[[0], 0.0]] - >>> a = np.array([[0, 0], [1, 1], [2, 2]]) - >>> b = np.array([[0, 1]]) - >>> similarity_search(a, b) + >>> dataset = np.array([[0, 0], [1, 1], [2, 2]]) + >>> value_array = np.array([[0, 1]]) + >>> similarity_search(dataset, value_array) [[[0, 0], 1.0]] - >>> a = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]]) - >>> b = np.array([[0, 0, 1]]) - >>> similarity_search(a, b) + >>> dataset = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]]) + >>> value_array = np.array([[0, 0, 1]]) + >>> similarity_search(dataset, value_array) [[[0, 0, 0], 1.0]] - >>> a = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]]) - >>> b = np.array([[0, 0, 0], [0, 0, 1]]) - >>> similarity_search(a, b) + >>> dataset = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]]) + >>> value_array = np.array([[0, 0, 0], [0, 0, 1]]) + >>> similarity_search(dataset, value_array) [[[0, 0, 0], 0.0], [[0, 0, 0], 1.0]] + + These are the errors that might occur: + + 1. If dimensions are different. + For example, dataset has 2d array and value_array has 1d array: + >>> dataset = np.array([[1]]) + >>> value_array = np.array([1]) + >>> similarity_search(dataset, value_array) + Traceback (most recent call last): + ... + ValueError: Wrong input data's dimensions... dataset : 2, value_array : 1 + + 2. If data's shapes are different. + For example, dataset has shape of (3, 2) and value_array has (2, 3). + We are expecting same shapes of two arrays, so it is wrong. + >>> dataset = np.array([[0, 0], [1, 1], [2, 2]]) + >>> value_array = np.array([[0, 0, 0], [0, 0, 1]]) + >>> similarity_search(dataset, value_array) + Traceback (most recent call last): + ... + ValueError: Wrong input data's shape... dataset : 2, value_array : 3 + + 3. If data types are different. + When trying to compare, we are expecting same types so they should be same. + If not, it'll come up with errors. + >>> dataset = np.array([[0, 0], [1, 1], [2, 2]], dtype=np.float32) + >>> value_array = np.array([[0, 0], [0, 1]], dtype=np.int32) + >>> similarity_search(dataset, value_array) + Traceback (most recent call last): + ... + TypeError: Input data have different datatype... dataset : float32, value_array : int32 """ - if dataset.ndim != value.ndim: + if dataset.ndim != value_array.ndim: raise ValueError( f"Wrong input data's dimensions... dataset : {dataset.ndim}, " - f"value: {value.ndim}" + f"value_array : {value_array.ndim}" ) try: - if dataset.shape[1] != value.shape[1]: + if dataset.shape[1] != value_array.shape[1]: raise ValueError( f"Wrong input data's shape... dataset : {dataset.shape[1]}, " - f"value : {value.shape[1]}" + f"value_array : {value_array.shape[1]}" ) except IndexError: - if dataset.ndim != value.ndim: - raise TypeError("Wrong type") + if dataset.ndim != value_array.ndim: + raise TypeError("Wrong shape") - if dataset.dtype != value.dtype: + if dataset.dtype != value_array.dtype: raise TypeError( f"Input data have different datatype... dataset : {dataset.dtype}, " - f"value : {value.dtype}" + f"value_array : {value_array.dtype}" ) answer = [] - for index, v in enumerate(value): - dist = euclidean(value[index], dataset[0]) + for value in value_array: + dist = euclidean(value, dataset[0]) vector = dataset[0].tolist() - for index2 in range(1, len(dataset)): - temp_dist = euclidean(value[index], dataset[index2]) + for dataset_value in dataset[1:]: + temp_dist = euclidean(value, dataset_value) if dist > temp_dist: dist = temp_dist - vector = dataset[index2].tolist() + vector = dataset_value.tolist() answer.append([vector, dist]) From 2d2c6b82cacdf6b160bb97c3e36dfccf0b6d5d21 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Fri, 13 Nov 2020 15:17:28 +0100 Subject: [PATCH 6/8] # doctest: +NORMALIZE_WHITESPACE --- machine_learning/similarity_search.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/machine_learning/similarity_search.py b/machine_learning/similarity_search.py index c5627ffd827f..e2e60b905df1 100644 --- a/machine_learning/similarity_search.py +++ b/machine_learning/similarity_search.py @@ -89,10 +89,11 @@ def similarity_search(dataset: np.ndarray, value_array: np.ndarray) -> list: If not, it'll come up with errors. >>> dataset = np.array([[0, 0], [1, 1], [2, 2]], dtype=np.float32) >>> value_array = np.array([[0, 0], [0, 1]], dtype=np.int32) - >>> similarity_search(dataset, value_array) + >>> similarity_search(dataset, value_array) # doctest: +NORMALIZE_WHITESPACE Traceback (most recent call last): ... - TypeError: Input data have different datatype... dataset : float32, value_array : int32 + TypeError: Input data have different datatype... + dataset : float32, value_array : int32 """ if dataset.ndim != value_array.ndim: From 9637de7092ffbd0779520741ab7c3e26612d674b Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Fri, 13 Nov 2020 15:17:49 +0100 Subject: [PATCH 7/8] Update machine_learning/similarity_search.py --- machine_learning/similarity_search.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/machine_learning/similarity_search.py b/machine_learning/similarity_search.py index e2e60b905df1..2b402970d6d4 100644 --- a/machine_learning/similarity_search.py +++ b/machine_learning/similarity_search.py @@ -30,9 +30,7 @@ def euclidean(input_a: np.ndarray, input_b: np.ndarray) -> float: dist = 0 - for a, b in zip(input_a, input_b): - dist += pow(a - b, 2) - return math.sqrt(dist) + return math.sqrt(sum(pow(a - b, 2) for a, b in zip(input_a, input_b))) def similarity_search(dataset: np.ndarray, value_array: np.ndarray) -> list: From 6f7c9ce4f342ed56f9db4e1196b000e0e808561b Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Fri, 13 Nov 2020 15:20:37 +0100 Subject: [PATCH 8/8] placate flake8 --- machine_learning/similarity_search.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/machine_learning/similarity_search.py b/machine_learning/similarity_search.py index 2b402970d6d4..6bfb12ed88cb 100644 --- a/machine_learning/similarity_search.py +++ b/machine_learning/similarity_search.py @@ -27,10 +27,7 @@ def euclidean(input_a: np.ndarray, input_b: np.ndarray) -> float: >>> euclidean(np.array([0, 0, 0]), np.array([0, 0, 1])) 1.0 """ - - dist = 0 - - return math.sqrt(sum(pow(a - b, 2) for a, b in zip(input_a, input_b))) + return math.sqrt(sum(pow(a - b, 2) for a, b in zip(input_a, input_b))) def similarity_search(dataset: np.ndarray, value_array: np.ndarray) -> list: