From 40a65035da1cf824b98130fca7f5a265a2a1e350 Mon Sep 17 00:00:00 2001
From: SteveKimSR <kimsr96@naver.com>
Date: Thu, 5 Nov 2020 18:51:46 +0900
Subject: [PATCH 1/8] add similarity_search.py to machine_learning, adding
 the similarity search algorithm

---
 machine_learning/similarity_search.py | 109 ++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 machine_learning/similarity_search.py

diff --git a/machine_learning/similarity_search.py b/machine_learning/similarity_search.py
new file mode 100644
index 000000000000..db28e3090a53
--- /dev/null
+++ b/machine_learning/similarity_search.py
@@ -0,0 +1,109 @@
+"""
+Simularity search is a search algorithm for finding the nearest vector from
+vectors, used in natural language processing.
+In this algorithm, it calculates distance with euclidean distance and
+returns a list containing two data for each vector:
+    1. the nearest vector
+    2. distance between the vector and the nearest vector
+"""
+import numpy as np
+import math
+
+
+def euclidean(input_a, input_b):
+    """
+    Calculates euclidean distance between two data. The result should be float.
+    >>> euclidean(0, 1)
+    1.0
+    >>> euclidean(np.array([0, 1]),np.array([1, 1]))
+    1.0
+    >>> euclidean(np.array([0, 0, 0]), np.array([0, 0, 1]))
+    1.0
+    """
+    dist = 0
+
+    if type(input_a) == type(input_b):
+        if type(input_a) != np.ndarray:
+            dist = pow(input_a - input_b, 2)
+        else:
+            for index in range(len(input_a)):
+                dist += pow(input_a[index] - input_b[index], 2)
+        return math.sqrt(dist)
+    return None
+
+
+def similarity_search(dataset: np, value: np) -> list:
+    """
+    :param dataset: Set containing the vectors.
+    :param value: vector/vectors we want to know the nearest vector from dataset.
+    Result will be a list containing 1. the nearest vector, 2. distance from the vector
+    >>> a = np.array([0, 1, 2])
+    >>> b = np.array([0])
+    >>> similarity_search(a, b)
+    [[0, 0.0]]
+
+    >>> a = np.array([[0, 0], [1, 1], [2, 2]])
+    >>> b = np.array([[0, 1]])
+    >>> similarity_search(a, b)
+    [[[0, 0], 1.0]]
+
+    >>> a = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]])
+    >>> b = np.array([[0, 0, 1]])
+    >>> similarity_search(a, b)
+    [[[0, 0, 0], 1.0]]
+    >>> a = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]])
+    >>> b = np.array([[0, 0, 0], [0, 0, 1]])
+    >>> similarity_search(a, b)
+    [[[0, 0, 0], 0.0], [[0, 0, 0], 1.0]]
+    """
+
+    if dataset.ndim != value.ndim:
+        raise TypeError(
+            "Wrong input data's dimensions... dataset : ",
+            dataset.ndim,
+            ", value : ",
+            value.ndim,
+        )
+
+    try:
+        if dataset.shape[1] != value.shape[1]:
+            raise TypeError(
+                "Wrong input data's shape... dataset : ",
+                dataset.shape[1],
+                ", value : ",
+                value.shape[1],
+            )
+    except IndexError:
+        if (dataset.ndim == value.ndim) != 1:
+            raise TypeError("Wrong type")
+
+    if dataset.dtype != value.dtype:
+        raise TypeError(
+            "Input datas have different datatype... dataset : ",
+            dataset.dtype,
+            ", value : ",
+            value.dtype,
+        )
+
+    answer = []
+
+    for index in range(len(value)):
+        dist = euclidean(value[index], dataset[0])
+        vector = dataset[0].tolist()
+
+        for index2 in range(1, len(dataset)):
+            temp_dist = euclidean(value[index], dataset[index2])
+
+            if dist > temp_dist:
+                dist = temp_dist
+                vector = dataset[index2].tolist()
+
+        answer.append([vector, dist])
+
+    return answer
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod()

From 09caa3b580f08403b475396aeddcf818117a320d Mon Sep 17 00:00:00 2001
From: SteveKimSR <kimsr96@naver.com>
Date: Fri, 6 Nov 2020 11:05:52 +0900
Subject: [PATCH 2/8] fix pre-commit test, apply feedback

Fixed the isort and codespell findings.
Applied feedback (np -> np.ndarray in the type hints).
---
 machine_learning/similarity_search.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/machine_learning/similarity_search.py b/machine_learning/similarity_search.py
index db28e3090a53..9db964e9c924 100644
--- a/machine_learning/similarity_search.py
+++ b/machine_learning/similarity_search.py
@@ -6,16 +6,17 @@
     1. the nearest vector
     2. distance between the vector and the nearest vector
 """
-import numpy as np
 import math
 
+import numpy as np
+
 
 def euclidean(input_a, input_b):
     """
     Calculates euclidean distance between two data. The result should be float.
     >>> euclidean(0, 1)
     1.0
-    >>> euclidean(np.array([0, 1]),np.array([1, 1]))
+    >>> euclidean(np.array([0, 1]), np.array([1, 1]))
     1.0
     >>> euclidean(np.array([0, 0, 0]), np.array([0, 0, 1]))
     1.0
@@ -32,7 +33,7 @@ def euclidean(input_a, input_b):
     return None
 
 
-def similarity_search(dataset: np, value: np) -> list:
+def similarity_search(dataset: np.ndarray, value: np.ndarray) -> list:
     """
     :param dataset: Set containing the vectors.
     :param value: vector/vectors we want to know the nearest vector from dataset.
@@ -79,7 +80,7 @@ def similarity_search(dataset: np, value: np) -> list:
 
     if dataset.dtype != value.dtype:
         raise TypeError(
-            "Input datas have different datatype... dataset : ",
+            "Input data have different datatype... dataset : ",
             dataset.dtype,
             ", value : ",
             value.dtype,

From 7ce2cce7b272270a24875f7004518ab1587153b1 Mon Sep 17 00:00:00 2001
From: SteveKimSR <kimsr96@naver.com>
Date: Fri, 6 Nov 2020 12:22:01 +0900
Subject: [PATCH 3/8] apply feedback

add type hints to the euclidean function
---
 machine_learning/similarity_search.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/machine_learning/similarity_search.py b/machine_learning/similarity_search.py
index 9db964e9c924..da10f3540509 100644
--- a/machine_learning/similarity_search.py
+++ b/machine_learning/similarity_search.py
@@ -7,11 +7,14 @@
     2. distance between the vector and the nearest vector
 """
 import math
+from typing import Union
 
 import numpy as np
 
+InputVal = Union[int, float, np.ndarray]
 
-def euclidean(input_a, input_b):
+
+def euclidean(input_a: InputVal, input_b: InputVal):
     """
     Calculates euclidean distance between two data. The result should be float.
     >>> euclidean(0, 1)

From f38fb3e415142fd03c02027c89c7308363816ce9 Mon Sep 17 00:00:00 2001
From: SteveKimSR <kimsr96@naver.com>
Date: Sat, 7 Nov 2020 11:36:40 +0900
Subject: [PATCH 4/8] apply feedback

- changed euclidean's type hints
- changed a few TypeErrors to ValueErrors
- changed range(len()) to enumerate()
- changed the error strings to f-strings
- removed the type() checks from euclidean
- added parameter and return documentation to euclidean
---
 machine_learning/similarity_search.py | 72 +++++++++++++--------------
 1 file changed, 36 insertions(+), 36 deletions(-)

diff --git a/machine_learning/similarity_search.py b/machine_learning/similarity_search.py
index da10f3540509..49bd365573b9 100644
--- a/machine_learning/similarity_search.py
+++ b/machine_learning/similarity_search.py
@@ -1,50 +1,55 @@
 """
-Simularity search is a search algorithm for finding the nearest vector from
+Similarity Search : https://en.wikipedia.org/wiki/Similarity_search
+Similarity search is a search algorithm for finding the nearest vector from
 vectors, used in natural language processing.
 In this algorithm, it calculates distance with euclidean distance and
 returns a list containing two data for each vector:
     1. the nearest vector
-    2. distance between the vector and the nearest vector
+    2. distance between the vector and the nearest vector (float)
 """
 import math
-from typing import Union
 
 import numpy as np
 
-InputVal = Union[int, float, np.ndarray]
 
-
-def euclidean(input_a: InputVal, input_b: InputVal):
+def euclidean(input_a: np.ndarray, input_b: np.ndarray):
     """
-    Calculates euclidean distance between two data. The result should be float.
-    >>> euclidean(0, 1)
+    Calculates euclidean distance between two data.
+    :param input_a: ndarray of first vector.
+    :param input_b: ndarray of second vector.
+    :return: Euclidean distance of input_a and input_b. By using math.sqrt(),
+             result will be float.
+
+    >>> euclidean(np.array([0]), np.array([1]))
     1.0
     >>> euclidean(np.array([0, 1]), np.array([1, 1]))
     1.0
     >>> euclidean(np.array([0, 0, 0]), np.array([0, 0, 1]))
     1.0
     """
+
     dist = 0
 
-    if type(input_a) == type(input_b):
-        if type(input_a) != np.ndarray:
-            dist = pow(input_a - input_b, 2)
-        else:
-            for index in range(len(input_a)):
-                dist += pow(input_a[index] - input_b[index], 2)
+    try:
+        for index, v in enumerate(input_a):
+            dist += pow(input_a[index] - input_b[index], 2)
         return math.sqrt(dist)
-    return None
+    except TypeError:
+        raise TypeError("Euclidean's input types are not right ...")
 
 
 def similarity_search(dataset: np.ndarray, value: np.ndarray) -> list:
     """
-    :param dataset: Set containing the vectors.
+    :param dataset: Set containing the vectors. Should be ndarray.
     :param value: vector/vectors we want to know the nearest vector from dataset.
-    Result will be a list containing 1. the nearest vector, 2. distance from the vector
-    >>> a = np.array([0, 1, 2])
-    >>> b = np.array([0])
+    :return: Result will be a list containing
+            1. the nearest vector
+            2. distance from the vector
+
+    >>> a = np.array([[0], [1], [2]])
+    >>> b = np.array([[0]])
     >>> similarity_search(a, b)
-    [[0, 0.0]]
+    [[[0], 0.0]]
 
     >>> a = np.array([[0, 0], [1, 1], [2, 2]])
     >>> b = np.array([[0, 1]])
@@ -55,6 +60,7 @@ def similarity_search(dataset: np.ndarray, value: np.ndarray) -> list:
     >>> b = np.array([[0, 0, 1]])
     >>> similarity_search(a, b)
     [[[0, 0, 0], 1.0]]
+
     >>> a = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]])
     >>> b = np.array([[0, 0, 0], [0, 0, 1]])
     >>> similarity_search(a, b)
@@ -62,36 +68,30 @@ def similarity_search(dataset: np.ndarray, value: np.ndarray) -> list:
     """
 
     if dataset.ndim != value.ndim:
-        raise TypeError(
-            "Wrong input data's dimensions... dataset : ",
-            dataset.ndim,
-            ", value : ",
-            value.ndim,
+        raise ValueError(
+            f"Wrong input data's dimensions... dataset : {dataset.ndim}, "
+            f"value: {value.ndim}"
         )
 
     try:
         if dataset.shape[1] != value.shape[1]:
-            raise TypeError(
-                "Wrong input data's shape... dataset : ",
-                dataset.shape[1],
-                ", value : ",
-                value.shape[1],
+            raise ValueError(
+                f"Wrong input data's shape... dataset : {dataset.shape[1]}, "
+                f"value : {value.shape[1]}"
             )
     except IndexError:
-        if (dataset.ndim == value.ndim) != 1:
+        if dataset.ndim != value.ndim:
             raise TypeError("Wrong type")
 
     if dataset.dtype != value.dtype:
         raise TypeError(
-            "Input data have different datatype... dataset : ",
-            dataset.dtype,
-            ", value : ",
-            value.dtype,
+            f"Input data have different datatype... dataset : {dataset.dtype}, "
+            f"value : {value.dtype}"
         )
 
     answer = []
 
-    for index in range(len(value)):
+    for index, v in enumerate(value):
         dist = euclidean(value[index], dataset[0])
         vector = dataset[0].tolist()
 

From ebfe05a7d60d025d9176926f8cc38fbf386a8f90 Mon Sep 17 00:00:00 2001
From: SteveKimSR <kimsr96@naver.com>
Date: Fri, 13 Nov 2020 18:31:54 +0900
Subject: [PATCH 5/8] apply feedback

- removed the try/except in euclidean
- added doctests for the error cases
- renamed parameter (value -> value_array)
---
 machine_learning/similarity_search.py | 96 +++++++++++++++++----------
 1 file changed, 62 insertions(+), 34 deletions(-)

diff --git a/machine_learning/similarity_search.py b/machine_learning/similarity_search.py
index 49bd365573b9..c5627ffd827f 100644
--- a/machine_learning/similarity_search.py
+++ b/machine_learning/similarity_search.py
@@ -12,7 +12,7 @@
 import numpy as np
 
 
-def euclidean(input_a: np.ndarray, input_b: np.ndarray):
+def euclidean(input_a: np.ndarray, input_b: np.ndarray) -> float:
     """
     Calculates euclidean distance between two data.
     :param input_a: ndarray of first vector.
@@ -30,77 +30,105 @@ def euclidean(input_a: np.ndarray, input_b: np.ndarray):
 
     dist = 0
 
-    try:
-        for index, v in enumerate(input_a):
-            dist += pow(input_a[index] - input_b[index], 2)
-        return math.sqrt(dist)
-    except TypeError:
-        raise TypeError("Euclidean's input types are not right ...")
+    for a, b in zip(input_a, input_b):
+        dist += pow(a - b, 2)
+    return math.sqrt(dist)
 
 
-def similarity_search(dataset: np.ndarray, value: np.ndarray) -> list:
+def similarity_search(dataset: np.ndarray, value_array: np.ndarray) -> list:
     """
     :param dataset: Set containing the vectors. Should be ndarray.
-    :param value: vector/vectors we want to know the nearest vector from dataset.
+    :param value_array: vector/vectors we want to know the nearest vector from dataset.
     :return: Result will be a list containing
             1. the nearest vector
             2. distance from the vector
 
-    >>> a = np.array([[0], [1], [2]])
-    >>> b = np.array([[0]])
-    >>> similarity_search(a, b)
+    >>> dataset = np.array([[0], [1], [2]])
+    >>> value_array = np.array([[0]])
+    >>> similarity_search(dataset, value_array)
     [[[0], 0.0]]
 
-    >>> a = np.array([[0, 0], [1, 1], [2, 2]])
-    >>> b = np.array([[0, 1]])
-    >>> similarity_search(a, b)
+    >>> dataset = np.array([[0, 0], [1, 1], [2, 2]])
+    >>> value_array = np.array([[0, 1]])
+    >>> similarity_search(dataset, value_array)
     [[[0, 0], 1.0]]
 
-    >>> a = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]])
-    >>> b = np.array([[0, 0, 1]])
-    >>> similarity_search(a, b)
+    >>> dataset = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]])
+    >>> value_array = np.array([[0, 0, 1]])
+    >>> similarity_search(dataset, value_array)
     [[[0, 0, 0], 1.0]]
 
-    >>> a = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]])
-    >>> b = np.array([[0, 0, 0], [0, 0, 1]])
-    >>> similarity_search(a, b)
+    >>> dataset = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]])
+    >>> value_array = np.array([[0, 0, 0], [0, 0, 1]])
+    >>> similarity_search(dataset, value_array)
     [[[0, 0, 0], 0.0], [[0, 0, 0], 1.0]]
+
+    These are the errors that might occur:
+
+    1. If dimensions are different.
+    For example, dataset has 2d array and value_array has 1d array:
+    >>> dataset = np.array([[1]])
+    >>> value_array = np.array([1])
+    >>> similarity_search(dataset, value_array)
+    Traceback (most recent call last):
+    ...
+    ValueError: Wrong input data's dimensions... dataset : 2, value_array : 1
+
+    2. If data's shapes are different.
+    For example, dataset has shape of (3, 2) and value_array has (2, 3).
+    We are expecting same shapes of two arrays, so it is wrong.
+    >>> dataset = np.array([[0, 0], [1, 1], [2, 2]])
+    >>> value_array = np.array([[0, 0, 0], [0, 0, 1]])
+    >>> similarity_search(dataset, value_array)
+    Traceback (most recent call last):
+    ...
+    ValueError: Wrong input data's shape... dataset : 2, value_array : 3
+
+    3. If data types are different.
+    When trying to compare, we are expecting same types so they should be same.
+    If not, it'll come up with errors.
+    >>> dataset = np.array([[0, 0], [1, 1], [2, 2]], dtype=np.float32)
+    >>> value_array = np.array([[0, 0], [0, 1]], dtype=np.int32)
+    >>> similarity_search(dataset, value_array)
+    Traceback (most recent call last):
+    ...
+    TypeError: Input data have different datatype... dataset : float32, value_array : int32
     """
 
-    if dataset.ndim != value.ndim:
+    if dataset.ndim != value_array.ndim:
         raise ValueError(
             f"Wrong input data's dimensions... dataset : {dataset.ndim}, "
-            f"value: {value.ndim}"
+            f"value_array : {value_array.ndim}"
         )
 
     try:
-        if dataset.shape[1] != value.shape[1]:
+        if dataset.shape[1] != value_array.shape[1]:
             raise ValueError(
                 f"Wrong input data's shape... dataset : {dataset.shape[1]}, "
-                f"value : {value.shape[1]}"
+                f"value_array : {value_array.shape[1]}"
             )
     except IndexError:
-        if dataset.ndim != value.ndim:
-            raise TypeError("Wrong type")
+        if dataset.ndim != value_array.ndim:
+            raise TypeError("Wrong shape")
 
-    if dataset.dtype != value.dtype:
+    if dataset.dtype != value_array.dtype:
         raise TypeError(
             f"Input data have different datatype... dataset : {dataset.dtype}, "
-            f"value : {value.dtype}"
+            f"value_array : {value_array.dtype}"
         )
 
     answer = []
 
-    for index, v in enumerate(value):
-        dist = euclidean(value[index], dataset[0])
+    for value in value_array:
+        dist = euclidean(value, dataset[0])
         vector = dataset[0].tolist()
 
-        for index2 in range(1, len(dataset)):
-            temp_dist = euclidean(value[index], dataset[index2])
+        for dataset_value in dataset[1:]:
+            temp_dist = euclidean(value, dataset_value)
 
             if dist > temp_dist:
                 dist = temp_dist
-                vector = dataset[index2].tolist()
+                vector = dataset_value.tolist()
 
         answer.append([vector, dist])
 

From 2d2c6b82cacdf6b160bb97c3e36dfccf0b6d5d21 Mon Sep 17 00:00:00 2001
From: Christian Clauss <cclauss@me.com>
Date: Fri, 13 Nov 2020 15:17:28 +0100
Subject: [PATCH 6/8] # doctest: +NORMALIZE_WHITESPACE

---
 machine_learning/similarity_search.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/machine_learning/similarity_search.py b/machine_learning/similarity_search.py
index c5627ffd827f..e2e60b905df1 100644
--- a/machine_learning/similarity_search.py
+++ b/machine_learning/similarity_search.py
@@ -89,10 +89,11 @@ def similarity_search(dataset: np.ndarray, value_array: np.ndarray) -> list:
     If not, it'll come up with errors.
     >>> dataset = np.array([[0, 0], [1, 1], [2, 2]], dtype=np.float32)
     >>> value_array = np.array([[0, 0], [0, 1]], dtype=np.int32)
-    >>> similarity_search(dataset, value_array)
+    >>> similarity_search(dataset, value_array)  # doctest: +NORMALIZE_WHITESPACE
     Traceback (most recent call last):
     ...
-    TypeError: Input data have different datatype... dataset : float32, value_array : int32
+    TypeError: Input data have different datatype...
+    dataset : float32, value_array : int32
     """
 
     if dataset.ndim != value_array.ndim:

From 9637de7092ffbd0779520741ab7c3e26612d674b Mon Sep 17 00:00:00 2001
From: Christian Clauss <cclauss@me.com>
Date: Fri, 13 Nov 2020 15:17:49 +0100
Subject: [PATCH 7/8] Update machine_learning/similarity_search.py

---
 machine_learning/similarity_search.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/machine_learning/similarity_search.py b/machine_learning/similarity_search.py
index e2e60b905df1..2b402970d6d4 100644
--- a/machine_learning/similarity_search.py
+++ b/machine_learning/similarity_search.py
@@ -30,9 +30,7 @@ def euclidean(input_a: np.ndarray, input_b: np.ndarray) -> float:
 
     dist = 0
 
-    for a, b in zip(input_a, input_b):
-        dist += pow(a - b, 2)
-    return math.sqrt(dist)
+    return math.sqrt(sum(pow(a - b, 2)  for a, b in zip(input_a, input_b)))
 
 
 def similarity_search(dataset: np.ndarray, value_array: np.ndarray) -> list:

From 6f7c9ce4f342ed56f9db4e1196b000e0e808561b Mon Sep 17 00:00:00 2001
From: Christian Clauss <cclauss@me.com>
Date: Fri, 13 Nov 2020 15:20:37 +0100
Subject: [PATCH 8/8] placate flake8

---
 machine_learning/similarity_search.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/machine_learning/similarity_search.py b/machine_learning/similarity_search.py
index 2b402970d6d4..6bfb12ed88cb 100644
--- a/machine_learning/similarity_search.py
+++ b/machine_learning/similarity_search.py
@@ -27,10 +27,7 @@ def euclidean(input_a: np.ndarray, input_b: np.ndarray) -> float:
     >>> euclidean(np.array([0, 0, 0]), np.array([0, 0, 1]))
     1.0
     """
-
-    dist = 0
-
-    return math.sqrt(sum(pow(a - b, 2)  for a, b in zip(input_a, input_b)))
+    return math.sqrt(sum(pow(a - b, 2) for a, b in zip(input_a, input_b)))
 
 
 def similarity_search(dataset: np.ndarray, value_array: np.ndarray) -> list: