From 9909e3a24d81a0fe423619d37b97a4bf0e2a153e Mon Sep 17 00:00:00 2001
From: yyliu <yyliu@dmo-sys.com>
Date: Fri, 12 Oct 2018 19:53:25 +0800
Subject: [PATCH 1/4] feat:

---
 .idea/hydra.xml                         |  9 ++++
 .idea/misc.xml                          |  4 ++
 .idea/sbt.xml                           |  7 +++
 .idea/vcs.xml                           |  6 +++
 Classification/DataMining_KNN/KNN.py    | 62 +++++++++++++++++++++++++
 Classification/DataMining_KNN/KNN.scala |  0
 6 files changed, 88 insertions(+)
 create mode 100644 .idea/hydra.xml
 create mode 100644 .idea/misc.xml
 create mode 100644 .idea/sbt.xml
 create mode 100644 .idea/vcs.xml
 create mode 100644 Classification/DataMining_KNN/KNN.py
 create mode 100644 Classification/DataMining_KNN/KNN.scala
diff --git a/.idea/hydra.xml b/.idea/hydra.xml
new file mode 100644
index 0000000..123e89c
--- /dev/null
+++ b/.idea/hydra.xml
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="HydraSettings">
+    <option name="hydraStorePath" value="$PROJECT_DIR$/.hydra/idea" />
+    <option name="noOfCores" value="4" />
+    <option name="projectRoot" value="$PROJECT_DIR$" />
+    <option name="sourcePartitioner" value="auto" />
+  </component>
+</project>
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..99ae653
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="1.8" project-jdk-type="JavaSDK" />
+</project>
\ No newline at end of file
diff --git a/.idea/sbt.xml b/.idea/sbt.xml
new file mode 100644
index 0000000..45cd6b3
--- /dev/null
+++ b/.idea/sbt.xml
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ScalaSbtSettings">
+    <option name="customVMEnabled" value="true" />
+    <option name="customVMPath" value="/usr/lib/jvm/jdk1.8.0_171-amd64" />
+  </component>
+</project>
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="" vcs="Git" />
+  </component>
+</project>
\ No newline at end of file
diff --git a/Classification/DataMining_KNN/KNN.py b/Classification/DataMining_KNN/KNN.py
new file mode 100644
index 0000000..b9c17f7
--- /dev/null
+++ b/Classification/DataMining_KNN/KNN.py
@@ -0,0 +1,62 @@
+#!/usr/local/bin/python3
+
+'''
+ ********************************************
+ * Description : k-nearest-neighbour (k = 3) classification of integer vectors
+ * Date        : 2018-10-12
+ * Author      : liuyy
+ * E-mail      : yyliu@dmo-sys.com
+ ********************************************
+'''
+import math
+
+def readfile(f, hasType = False):
+    """Parse whitespace-separated integer rows from the text file `f`.
+
+    Each row becomes a list of ints; with `hasType` set, the first token is
+    kept as a string label and the row becomes (label, [ints]).  Blank lines
+    are skipped rather than raising IndexError or yielding an empty row.
+    """
+    res = []
+    with open(f) as fp:
+        for tmp in (line.split() for line in fp):
+            if not tmp:
+                continue  # blank or whitespace-only line
+            if hasType:
+                res.append((tmp[0], [int(i) for i in tmp[1:]]))
+            else:
+                res.append([int(i) for i in tmp])
+    return res
+
+# Load the labelled training rows and the unlabelled cases, then echo both.
+trained = readfile("trainInput.txt", hasType = True)
+tested = readfile("testInput.txt")
+print(tested, trained, sep = "\n")
+
+def classify(case, trained):
+    """Return (label, Euclidean distance to `case`) for every training row.
+
+    `trained` holds (label, vector) pairs; coordinates are paired with zip,
+    so a longer vector is truncated to the shorter one's length.
+    """
+    def dist(u, v):
+        return math.sqrt(sum(pow(a - b, 2) for a, b in zip(u, v)))
+    return [(row[0], dist(case, row[1])) for row in trained]
+
+def get_class(first_k):
+    """Return the (label, votes) pair with the most votes in `first_k`.
+
+    Ties go to the label seen first (stable sort); empty input raises IndexError."""
+    votes = {}
+    for neighbour in first_k:
+        votes[neighbour[0]] = votes.get(neighbour[0], 0) + 1
+    return sorted(votes.items(), key = lambda kv: kv[1], reverse = True)[0]
+
+for case in tested:  # vote among the 3 nearest training rows for each case
+    ct = classify(case, trained)
+    print(ct)
+    first_k = sorted(ct, key = lambda pair: pair[1])[:3]
+    print(case, get_class(first_k))
+
+
+
diff --git a/Classification/DataMining_KNN/KNN.scala b/Classification/DataMining_KNN/KNN.scala
new file mode 100644
index 0000000..e69de29

From 459f59a2542da1347679d63ba39feb86409ef521 Mon Sep 17 00:00:00 2001
From: yyliu <yyliu@dmo-sys.com>
Date: Mon, 15 Oct 2018 11:01:30 +0800
Subject: [PATCH 2/4] feat(add python version of NaiveBayes):

---
 Classification/DataMining_NaiveBayes/NB.py | 64 ++++++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 Classification/DataMining_NaiveBayes/NB.py

diff --git a/Classification/DataMining_NaiveBayes/NB.py b/Classification/DataMining_NaiveBayes/NB.py
new file mode 100644
index 0000000..912ea60
--- /dev/null
+++ b/Classification/DataMining_NaiveBayes/NB.py
@@ -0,0 +1,64 @@
+#!/usr/local/bin/python3
+
+'''
+ ********************************************
+ * Description : Naive Bayes classification of categorical records
+ * Date        : 2018-10-15
+ * Author      : liuyy
+ * E-mail      : yyliu@dmo-sys.com
+ ********************************************
+'''
+
+def initprob(f):
+    """Estimate naive-Bayes statistics from the whitespace-separated table `f`.
+
+    The first line (header) and the first column (row id) are ignored; the
+    last column is the class label and the columns in between are attributes.
+    Blank lines are skipped instead of raising IndexError.
+
+    Returns (classtypes, res): `classtypes` maps each class label to its row
+    count, and `res[i]` maps (value, label) to the fraction of rows of that
+    label whose i-th attribute equals value.  The class counts are also
+    printed, as before.
+    """
+    pre_pro = []
+    classtypes = {}
+
+    with open(f) as fp:
+        fp.readline()  # discard the header row
+        for line in fp:
+            tmp = line.split()[1:]  # drop the row-id column
+            if not tmp:
+                continue  # blank line (or a lone id with no data)
+            classtype = tmp.pop()
+            classtypes[classtype] = classtypes.get(classtype, 0) + 1
+
+            if not pre_pro:
+                pre_pro = [{} for _ in tmp]
+            for i, attr in enumerate(tmp):
+                k = (attr, classtype)
+                pre_pro[i][k] = pre_pro[i].get(k, 0) + 1
+
+    print(classtypes)
+    res = [{k: v/classtypes[k[1]] for k, v in i.items()} for i in pre_pro]
+    return (classtypes, res)
+
+(cts, pre_prob) = initprob("input.txt")  # class counts and per-attribute frequency tables
+for i in pre_prob:
+    print(i)  # one {(value, label): frequency} dict per attribute column
+
+def get_test(to_test, cts, pre_prob):
+    """Return the label maximising P(label) * prod_i P(value_i | label).
+
+    Priors come from the counts in `cts`; unseen (value, label) pairs score 0."""
+    values = to_test.split()
+    def _get_test(ct):
+        m = cts[ct] / sum(cts.values())  # class prior P(label)
+        for i, value in enumerate(values):
+            m *= pre_prob[i].get((value, ct), 0.0)
+        return m
+    return max(cts, key = _get_test)
+
+# Classify one hand-written sample and print it as "<attributes>: <label>".
+to_test = "Youth Medium Yes Fair"
+print(to_test, get_test(to_test, cts, pre_prob), sep = ": ")

From 66dafade444afd3235aaf047cb5e2e813fb21c91 Mon Sep 17 00:00:00 2001
From: yyliu <yyliu@dmo-sys.com>
Date: Mon, 15 Oct 2018 11:01:41 +0800
Subject: [PATCH 3/4] feat:

---
 .../DataMining_NaiveBayes/input.txt           | 30 +++++++++----------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/Classification/DataMining_NaiveBayes/input.txt b/Classification/DataMining_NaiveBayes/input.txt
index b5940a5..bc5286f 100644
--- a/Classification/DataMining_NaiveBayes/input.txt
+++ b/Classification/DataMining_NaiveBayes/input.txt
@@ -1,15 +1,15 @@
-Day OutLook Temperature Humidity Wind PlayTennis
-1 Sunny Hot High Weak No
-2 Sunny Hot High Strong No
-3 Overcast Hot High Weak Yes
-4 Rainy Mild High Weak Yes
-5 Rainy Cool Normal Weak Yes
-6 Rainy Cool Normal Strong No
-7 Overcast Cool Normal Strong Yes
-8 Sunny Mild High Weak No
-9 Sunny Cool Normal Weak Yes
-10 Rainy Mild Normal Weak Yes
-11 Sunny Mild Normal Strong Yes
-12 Overcast Mild High Strong Yes
-13 Overcast Hot Normal Weak Yes
-14 Rainy Mild High Strong No
\ No newline at end of file
+Rid Age Income Student CreditRating BuysComputer
+1 Youth High No Fair No
+2 Youth High No Excellent No
+3 MiddleAged High No Fair Yes
+4 Senior Medium No Fair Yes
+5 Senior Low Yes Fair Yes
+6 Senior Low Yes Excellent No
+7 MiddleAged Low Yes Excellent Yes
+8 Youth Medium No Fair No
+9 Youth Low Yes Fair Yes
+10 Senior Medium Yes Fair Yes
+11 Youth Medium Yes Excellent Yes
+12 MiddleAged Medium No Excellent Yes
+13 MiddleAged High Yes Fair Yes
+14 Senior Medium No Excellent No

From b1ffe6e73e4089411ade63ef0b7322362813f931 Mon Sep 17 00:00:00 2001
From: yyliu <yyliu@dmo-sys.com>
Date: Tue, 16 Oct 2018 17:11:52 +0800
Subject: [PATCH 4/4] feat(KMeans code): add KMeans implementation

---
 Clustering/DataMining_KMeans/KMeans.py | 61 ++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)
 create mode 100644 Clustering/DataMining_KMeans/KMeans.py

diff --git a/Clustering/DataMining_KMeans/KMeans.py b/Clustering/DataMining_KMeans/KMeans.py
new file mode 100644
index 0000000..4ed3a39
--- /dev/null
+++ b/Clustering/DataMining_KMeans/KMeans.py
@@ -0,0 +1,61 @@
+#!/usr/local/bin/python3
+
+'''
+ ********************************************
+ * Description : k-means clustering (k = 2, 10 rounds) of integer 2-D points
+ * Date        : 2018-10-16
+ * Author      : liuyy
+ * E-mail      : yyliu@dmo-sys.com
+ ********************************************
+'''
+
+from functools import reduce
+
+def calc_center(l):
+    """Return the (x, y) arithmetic mean of the non-empty point list `l`."""
+    return tuple(s / len(l) for s in reduce(lambda a, b: (a[0] + b[0], a[1] + b[1]), l)[:2])
+
+def dist(a, b):  # squared Euclidean distance between 2-D points (no sqrt taken)
+    return sum(d * d for d in (a[0] - b[0], a[1] - b[1]))
+
+def iterate(ps, points):
+    """Assign every point to the cluster whose centre is nearest.
+
+    `ps` is a list of clusters whose element 0 is the centre.  The result has
+    one list per cluster, [centre, member, member, ...]; a point equidistant
+    from several centres goes to the earliest of them.
+    """
+    res = [[cluster[0]] for cluster in ps]
+    for p in points:
+        nearest, best = res[0], dist(p, res[0][0])
+        for candidate in res[1:]:
+            d = dist(p, candidate[0])
+            if d < best:
+                nearest, best = candidate, d
+        nearest.append(p)
+    return res
+
+def init(f):
+    """Read one "x y" integer point per line from the text file `f`.
+
+    Returns the points as (x, y) int tuples in file order.  Blank lines are
+    skipped; any other malformed line still raises ValueError.
+    """
+    points = []
+    with open(f) as fp:
+        for line in fp:
+            if line.strip():  # a blank line used to abort the whole load
+                x, y = line.split()
+                points.append((int(x), int(y)))
+    return points
+
+first_ps = [[(0, 0)], [(10, 10)]]  # two hand-picked starting centres
+points = init("input.txt")
+res = []
+
+for i in range(10):  # fixed number of refinement rounds
+    res = iterate(first_ps, points)
+    # A cluster that attracted no points keeps its centre instead of crashing
+    # calc_center with an empty list.
+    first_ps = [[calc_center(c[1:]) if len(c) > 1 else c[0]] for c in res]
+print(res)