From 9909e3a24d81a0fe423619d37b97a4bf0e2a153e Mon Sep 17 00:00:00 2001 From: yyliu Date: Fri, 12 Oct 2018 19:53:25 +0800 Subject: [PATCH 1/4] feat: --- .idea/hydra.xml | 9 ++++ .idea/misc.xml | 4 ++ .idea/sbt.xml | 7 +++ .idea/vcs.xml | 6 +++ Classification/DataMining_KNN/KNN.py | 62 +++++++++++++++++++++++++ Classification/DataMining_KNN/KNN.scala | 0 6 files changed, 88 insertions(+) create mode 100644 .idea/hydra.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/sbt.xml create mode 100644 .idea/vcs.xml create mode 100644 Classification/DataMining_KNN/KNN.py create mode 100644 Classification/DataMining_KNN/KNN.scala diff --git a/.idea/hydra.xml b/.idea/hydra.xml new file mode 100644 index 0000000..123e89c --- /dev/null +++ b/.idea/hydra.xml @@ -0,0 +1,9 @@ + + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..99ae653 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/sbt.xml b/.idea/sbt.xml new file mode 100644 index 0000000..45cd6b3 --- /dev/null +++ b/.idea/sbt.xml @@ -0,0 +1,7 @@ + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/Classification/DataMining_KNN/KNN.py b/Classification/DataMining_KNN/KNN.py new file mode 100644 index 0000000..b9c17f7 --- /dev/null +++ b/Classification/DataMining_KNN/KNN.py @@ -0,0 +1,62 @@ +#!/usr/local/bin/python3 + +''' + ******************************************** + * Description : + * Date : 2018-10-12 + * Author : liuyy + * E-mail : yyliu@dmo-sys.com + ******************************************** +''' +import math + +def readfile(f, hasType = False): + res = [] + + with open(f) as fp: + line = fp.readline() + + while line: + line = line.rstrip() + tmp = line.split() + + if hasType: + res.append((tmp[0], [int(i) for i in tmp[1:]])) + else: + res.append([int(i) for i in tmp]) + + line = fp.readline() + return res + +trained=readfile("trainInput.txt", hasType = True) +tested=readfile("testInput.txt") +print(tested) +print(trained) + +def classify(case, trained): + def dist(i, j): + s = 0 + for v in zip(i, j): + s += pow(v[0] - v[1], 2) + + return math.sqrt(s) + + return [(i[0], dist(case, i[1])) for i in trained] + +def get_class(first_k): + res = {} + for i in first_k: + if i[0] not in res: + res[i[0]] = 0 + res[i[0]] += 1 + + return sorted(res.items(), key = lambda x: x[1], reverse = True)[0] + +for case in tested: + ct = classify(case, trained) + print(ct) + first_k = sorted(ct, key = lambda i: i[1])[:3] + print(str(case) + " " + str(get_class(first_k))) + + + diff --git a/Classification/DataMining_KNN/KNN.scala b/Classification/DataMining_KNN/KNN.scala new file mode 100644 index 0000000..e69de29 From 459f59a2542da1347679d63ba39feb86409ef521 Mon Sep 17 00:00:00 2001 From: yyliu Date: Mon, 15 Oct 2018 11:01:30 +0800 Subject: [PATCH 2/4] feat(add python version of NaiveBayes): --- Classification/DataMining_NaiveBayes/NB.py | 64 ++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 Classification/DataMining_NaiveBayes/NB.py diff --git a/Classification/DataMining_NaiveBayes/NB.py b/Classification/DataMining_NaiveBayes/NB.py new file mode 100644 index 0000000..912ea60 --- /dev/null +++ b/Classification/DataMining_NaiveBayes/NB.py @@ -0,0 +1,64 @@ +#!/usr/local/bin/python3 + +''' + ******************************************** + * Description : + * Date : 2018-10-15 + * Author : liuyy + * E-mail : yyliu@dmo-sys.com + ******************************************** +''' + +def initprob(f): + pre_pro = [] + classtypes = {} + + with open(f) as fp: + line = fp.readline() + line = fp.readline() + + while len(line) != 0: + tmp = line.rstrip().split()[1:] + + classtype = tmp[-1] + if classtype not in classtypes: + classtypes[classtype] = 0 + classtypes[classtype] += 1 + + tmp = tmp[:-1] + if len(pre_pro) == 0: + pre_pro = [{} for i in range(len(tmp))] + + for i in range(len(tmp)): + attr = tmp[i] + + k = (attr, classtype) + if k not in pre_pro[i]: + pre_pro[i][k] = 0 + pre_pro[i][k] += 1 + + line = fp.readline() + print(classtypes) + res = [{k: v/classtypes[k[1]] for k, v in i.items()} for i in pre_pro] + + return (classtypes, res) + +(cts, pre_prob) = initprob("input.txt") +for i in pre_prob: + print(i) + +def get_test(to_test, cts, pre_prob): + def _get_test(tmp, pre_prob, ct): + m = 1.0 + for i in range(len(tmp)): + m *= pre_prob[i].get((tmp[i], ct), 1.0) + + return m + + values = to_test.split() + re = [(_get_test(values, pre_prob, i), i) for i in cts.keys()] + return max(re, key = lambda x: x[0])[1] + +to_test = "Youth Medium Yes Fair" +print(to_test, end = ": ") +print(get_test(to_test, cts, pre_prob)) From 66dafade444afd3235aaf047cb5e2e813fb21c91 Mon Sep 17 00:00:00 2001 From: yyliu Date: Mon, 15 Oct 2018 11:01:41 +0800 Subject: [PATCH 3/4] feat: --- .../DataMining_NaiveBayes/input.txt | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/Classification/DataMining_NaiveBayes/input.txt b/Classification/DataMining_NaiveBayes/input.txt index b5940a5..bc5286f 100644 --- a/Classification/DataMining_NaiveBayes/input.txt +++ b/Classification/DataMining_NaiveBayes/input.txt @@ -1,15 +1,15 @@ -Day OutLook Temperature Humidity Wind PlayTennis -1 Sunny Hot High Weak No -2 Sunny Hot High Strong No -3 Overcast Hot High Weak Yes -4 Rainy Mild High Weak Yes -5 Rainy Cool Normal Weak Yes -6 Rainy Cool Normal Strong No -7 Overcast Cool Normal Strong Yes -8 Sunny Mild High Weak No -9 Sunny Cool Normal Weak Yes -10 Rainy Mild Normal Weak Yes -11 Sunny Mild Normal Strong Yes -12 Overcast Mild High Strong Yes -13 Overcast Hot Normal Weak Yes -14 Rainy Mild High Strong No \ No newline at end of file +Rid Age Income Student CreditRating BuysComputer +1 Youth High No Fair No +2 Youth High No Excellent No +3 MiddleAged High No Fair Yes +4 Senior Medium No Fair Yes +5 Senior Low Yes Fair Yes +6 Senior Low Yes Excellent No +7 MiddleAged Low Yes Excellent Yes +8 Youth Medium No Fair No +9 Youth Low Yes Fair Yes +10 Senior Medium Yes Fair Yes +11 Youth Medium Yes Excellent Yes +12 MiddleAged Medium No Excellent Yes +13 MiddleAged High Yes Fair Yes +14 Senior Medium No Excellent No From b1ffe6e73e4089411ade63ef0b7322362813f931 Mon Sep 17 00:00:00 2001 From: yyliu Date: Tue, 16 Oct 2018 17:11:52 +0800 Subject: [PATCH 4/4] feat(KMeans code): add Kemans implementation --- Clustering/DataMining_KMeans/KMeans.py | 61 ++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 Clustering/DataMining_KMeans/KMeans.py diff --git a/Clustering/DataMining_KMeans/KMeans.py b/Clustering/DataMining_KMeans/KMeans.py new file mode 100644 index 0000000..4ed3a39 --- /dev/null +++ b/Clustering/DataMining_KMeans/KMeans.py @@ -0,0 +1,61 @@ +#!/usr/local/bin/python3 + +''' + ******************************************** + * Description : + * Date : 2018-10-16 + * Author : liuyy + * E-mail : yyliu@dmo-sys.com + ******************************************** +''' + +from functools import reduce + +def calc_center(l): + p = reduce(lambda x, y: (x[0] + y[0], x[1] + y[1]), l) + return (p[0] / len(l), p[1] / len(l)) + +def dist(a, b): + return (a[0] - b[0]) * (a[0] - b[0]) + (a[1] - b[1]) * (a[1] - b[1]) + +def iterate(ps, points): + res = [[i[0]] for i in ps] + + for p in points: + min_dist = dist(p, res[0][0]) + min_class = res[0] + + for i in res: + d = dist(p, i[0]) + if d < min_dist: + min_dist = d + min_class = i + + min_class.append(p) + + return res + +def init(f): + points = [] + + with open(f) as fp: + line = fp.readline() + + while len(line) != 0: + (x, y) = line.rstrip().split() + points.append((int(x), int(y))) + + line = fp.readline() + + return points + +first_ps = [[(0, 0)], [(10, 10)]] +points = init("input.txt") +res = [] + +for i in range(10): + res = iterate(first_ps, points) + first_ps = [[calc_center(i[1:])] for i in res] + + +print(res)