From 25d15a06d0dd20807a9a2dfd9908a7dbc4450ab7 Mon Sep 17 00:00:00 2001 From: MRHC <1932405808@qq.com> Date: Sun, 28 Jan 2018 15:59:59 +0800 Subject: [PATCH 01/17] DBSCAN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 添加 DBSCAN的一种实现 --- DBSCAN/dbscan.py | 201 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 201 insertions(+) create mode 100644 DBSCAN/dbscan.py diff --git a/DBSCAN/dbscan.py b/DBSCAN/dbscan.py new file mode 100644 index 0000000..3efc858 --- /dev/null +++ b/DBSCAN/dbscan.py @@ -0,0 +1,201 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# 实现了DBSCAN算法 + +__author__ = 'ZYC@BUPT' +import jieba +import os +import sys +import json +jieba.load_userdict("newword.dict") +sys.setdefaultencoding("utf-8") +from sklearn import feature_extraction +from sklearn.feature_extraction.text import TfidfTransformer +from sklearn.feature_extraction.text import CountVectorizer +import numpy as np +import matplotlib.pyplot as plt +import math +import time +UNCLASSIFIED = False +NOISE = 0 + +def Test2(rootDir): + for lists in os.listdir(rootDir): + path = os.path.join(rootDir, lists) + # print path.decode('gb2312') + if path.find(".txt")!=-1: + Participle(path) + if os.path.isdir(path): + Test2(path) + +def Participle(path): + try: + fp = open(path, "r") + ad = fp.readline().strip('\n') + na = fp.readline().strip('\n') + ti = fp.readline().strip('\n')#time + si = fp.readline().strip('\n') + cont = na+fp.read() + fp.close() + except IOError: + return 0 + + try: + insi = {} + insi['time'] = ti + print(ti) + insi['url'] = ad + insi['title'] = na + insi['site'] = si#decode("gb2312").encode("utf-8") + global fnum + global segcont + global doc + seg_list = jieba.lcut(cont, cut_all=False) + stline = "" + for word in seg_list: + if ((word in d) is False) and word != '\n': + stline = stline + " " + word + segcont.append(stline) + print (str(fnum) + " 分词") + doc[fnum] = insi + fnum = fnum + 1 + except UnicodeError: + return 0 + +def loadDataSet(splitChar=','): + dataSet = [] + global we + dataSet=we + del we + return dataSet + +def region_query(data, pointId, eps): + nPoints = data.shape[1] + seeds = [] + for i in range(nPoints): + if eps_neighbor(data[:, pointId], data[:, i], eps): + seeds.append(i) + return seeds + +def tstore(clusters,clusterNum):#测试使用 + global doc + fpath="./test_res/" + i=0 + wr=[] + while i<=clusterNum: + path=fpath+str(i)+".txt" + fp=open(path,'w') + wr.append(fp) + i+=1 + i=1 + for cl in clusters: + enstr="" + enstr=doc[i]['title']+doc[i]['url'] + wr[cl].write(enstr+'\n') + i+=1 + +def expand_cluster(data, clusterResult, pointId, clusterId, eps, minPts): + seeds = region_query(data, pointId, eps) + if len(seeds) < minPts: # 不满足minPts条件的为噪声点 + clusterResult[pointId] = NOISE + return False + else: + clusterResult[pointId] = clusterId # 划分到该簇 + for seedId in seeds: + clusterResult[seedId] = clusterId + while len(seeds) > 0: # 扩张 + currentPoint = seeds[0] + queryResults = region_query(data, currentPoint, eps) + if len(queryResults) >= minPts: + for i in range(len(queryResults)): + resultPoint = queryResults[i] + if clusterResult[resultPoint] == UNCLASSIFIED: + seeds.append(resultPoint) + clusterResult[resultPoint] = clusterId + elif clusterResult[resultPoint] == NOISE: + clusterResult[resultPoint] = clusterId + seeds = seeds[1:] + + return True + +def dbscan(data, eps, minPts): + clusterId = 1 + nPoints = data.shape[1] + clusterResult = [UNCLASSIFIED] * nPoints + for pointId in range(nPoints): + # print "point :"+str(pointId) + 
point = data[:, pointId] + if clusterResult[pointId] == UNCLASSIFIED: + if expand_cluster(data, clusterResult, pointId, clusterId, eps, minPts): + clusterId = clusterId + 1 + return clusterResult, clusterId - 1 + + +def eps_neighbor(a, b, eps): + dis=math.sqrt(np.power(a - b, 2).sum()) + print(dis) + return dis < eps + +def main(): + dataSet = loadDataSet(splitChar=',') + dataSet = np.mat(dataSet).transpose() + # print(dataSet) + clusters, clusterNum = dbscan(dataSet, 1.37, 5)################################# + print("cluster Numbers = ", clusterNum) + # print(clusters) + #store(clusters, clusterNum) + tstore(clusters, clusterNum) + + +def TFIDF(): + global segcont + global weight + vectorizer = CountVectorizer() + transformer = TfidfTransformer() + tfidf = transformer.fit_transform(vectorizer.fit_transform(segcont)) + word = vectorizer.get_feature_names() # 所有文本的关键字 + weight = tfidf.toarray() # 对应的tfidf矩阵 + del segcont + + seg = [] + for i in range(len(weight)): + enstr = "" + for j in range(len(word)): + if weight[i][j] >= 0.1:##################################### + enstr = enstr + " " + word[j] + seg.append(enstr) + + del weight + vec = CountVectorizer() + tra = TfidfTransformer() + tidf = tra.fit_transform(vec.fit_transform(seg)) + wo = vec.get_feature_names() + we = tidf.toarray() + + global we + +def dbs(): + global fnum,doc,segcont,d + fnum = 1 + segcont = [] + doc = {} + stfp = open("stop.txt", "r") + stcont = stfp.read() + list_a = jieba.lcut(stcont, cut_all=False) + d = set([]) + for li in list_a: + d.add(li) + stfp.close() + Test2('./sou1') + TFIDF() + start = time.clock() + main() + end = time.clock() + print('finish all in %s' % str(end - start)) + +dbs() + + + + From f148034deac0847253b92e283768a3c45b8483ae Mon Sep 17 00:00:00 2001 From: csuldw Date: Sun, 27 Jan 2019 23:22:44 +0800 Subject: [PATCH 02/17] add doc2vector --- draft/contentBasedRec.py | 5 + draft/data/artice1.txt | 2 + draft/data/article2.txt | 1 + draft/data/corpus_test/t1.txt | 2 + draft/data/corpus_test/t2.txt | 1 + draft/data/corpus_words/artice1.txt | 1 + draft/data/corpus_words/article2.txt | 1 + draft/doc2vector.py | 203 +++++++++++++++++++++++++++ 8 files changed, 216 insertions(+) create mode 100644 draft/data/artice1.txt create mode 100644 draft/data/article2.txt create mode 100644 draft/data/corpus_test/t1.txt create mode 100644 draft/data/corpus_test/t2.txt create mode 100644 draft/data/corpus_words/artice1.txt create mode 100644 draft/data/corpus_words/article2.txt create mode 100644 draft/doc2vector.py diff --git a/draft/contentBasedRec.py b/draft/contentBasedRec.py index 5bf333a..55e7b1d 100644 --- a/draft/contentBasedRec.py +++ b/draft/contentBasedRec.py @@ -30,3 +30,8 @@ from sklearn.metrics.pairwise import cosine_similarity cos = cosine_similarity(weight[0:1], weight) recommendations = cos[0].argsort()[-4:][::-1] + + +a = list(weight[0:1][0]) +aaa = pd.DataFrame(weight) +cosine_similarity(np.array([a]), aaa) \ No newline at end of file diff --git a/draft/data/artice1.txt b/draft/data/artice1.txt new file mode 100644 index 0000000..9769e91 --- /dev/null +++ b/draft/data/artice1.txt @@ -0,0 +1,2 @@ +然而,即使上述模型对词向量进行平均处理,我们仍然忽略了单词之间的排列顺序对情感分析的影响。即上述的word2vec只是基于词的维度进行"语义分析"的,而并不具有上下文的"语义分析"能力。 +作为一个处理可变长度文本的总结性方法,Quoc Le 和 Tomas Mikolov 提出了 Doc2Vec方法。除了增加一个段落向量以外,这个方法几乎等同于 Word2Vec。和 Word2Vec 一样,该模型也存在两种方法:Distributed Memory(DM) 和 Distributed Bag of Words(DBOW)。DM 试图在给定上下文和段落向量的情况下预测单词的概率。在一个句子或者文档的训练过程中,段落 ID 保持不变,共享着同一个段落向量。DBOW 则在仅给定段落向量的情况下预测段落中一组随机单词的概率。 \ No newline at end of file diff --git 
a/draft/data/article2.txt b/draft/data/article2.txt new file mode 100644 index 0000000..3d264a8 --- /dev/null +++ b/draft/data/article2.txt @@ -0,0 +1 @@ +训练过程中新增了paragraph id,即训练语料中每个句子都有一个唯一的id。paragraph id和普通的word一样,也是先映射成一个向量,即paragraph vector。paragraph vector与word vector的维数虽一样,但是来自于两个不同的向量空间。在之后的计算里,paragraph vector和word vector累加或者连接起来,作为输出层softmax的输入。在一个句子或者文档的训练过程中,paragraph id保持不变,共享着同一个paragraph vector,相当于每次在预测单词的概率时,都利用了整个句子的语义。 \ No newline at end of file diff --git a/draft/data/corpus_test/t1.txt b/draft/data/corpus_test/t1.txt new file mode 100644 index 0000000..9769e91 --- /dev/null +++ b/draft/data/corpus_test/t1.txt @@ -0,0 +1,2 @@ +然而,即使上述模型对词向量进行平均处理,我们仍然忽略了单词之间的排列顺序对情感分析的影响。即上述的word2vec只是基于词的维度进行"语义分析"的,而并不具有上下文的"语义分析"能力。 +作为一个处理可变长度文本的总结性方法,Quoc Le 和 Tomas Mikolov 提出了 Doc2Vec方法。除了增加一个段落向量以外,这个方法几乎等同于 Word2Vec。和 Word2Vec 一样,该模型也存在两种方法:Distributed Memory(DM) 和 Distributed Bag of Words(DBOW)。DM 试图在给定上下文和段落向量的情况下预测单词的概率。在一个句子或者文档的训练过程中,段落 ID 保持不变,共享着同一个段落向量。DBOW 则在仅给定段落向量的情况下预测段落中一组随机单词的概率。 \ No newline at end of file diff --git a/draft/data/corpus_test/t2.txt b/draft/data/corpus_test/t2.txt new file mode 100644 index 0000000..3d264a8 --- /dev/null +++ b/draft/data/corpus_test/t2.txt @@ -0,0 +1 @@ +训练过程中新增了paragraph id,即训练语料中每个句子都有一个唯一的id。paragraph id和普通的word一样,也是先映射成一个向量,即paragraph vector。paragraph vector与word vector的维数虽一样,但是来自于两个不同的向量空间。在之后的计算里,paragraph vector和word vector累加或者连接起来,作为输出层softmax的输入。在一个句子或者文档的训练过程中,paragraph id保持不变,共享着同一个paragraph vector,相当于每次在预测单词的概率时,都利用了整个句子的语义。 \ No newline at end of file diff --git a/draft/data/corpus_words/artice1.txt b/draft/data/corpus_words/artice1.txt new file mode 100644 index 0000000..4b65fdd --- /dev/null +++ b/draft/data/corpus_words/artice1.txt @@ -0,0 +1 @@ +然而 , 即使 上述 模型 对词 向量 进行 平均 处理 , 我们 仍然 忽略 了 单词 之间 的 排列 顺序 对 情感 分析 的 影响 。 即 上述 的 word2vec 只是 基于 词 的 维度 进行 " 语义 分析 " 的 , 而 并 不 具有 上下文 的 " 语义 分析 " 能力 。 作为 一个 处理 可变 长度 文本 的 总结性 方法 , QuocLe 和 TomasMikolov 提出 了 Doc2Vec 方法 。 除了 增加 一个 段落 向量 以外 , 这个 方法 几乎 等同于 Word2Vec 。 和 Word2Vec 一样 , 该 模型 也 存在 两种 方法 : DistributedMemory ( DM ) 和 DistributedBagofWords ( DBOW ) 。 DM 试图 在 给定 上下文 和 段落 向量 的 情况 下 预测 单词 的 概率 。 在 一个 句子 或者 文档 的 训练 过程 中 , 段落 ID 保持 不变 , 共享 着 同一个 段落 向量 。 DBOW 则 在 仅 给定 段落 向量 的 情况 下 预测 段落 中 一组 随机 单词 的 概率 。 然而 , 即使 上述 模型 对词 向量 进行 平均 处理 , 我们 仍然 忽略 了 单词 之间 的 排列 顺序 对 情感 分析 的 影响 。 即 上述 的 word2vec 只是 基于 词 的 维度 进行 " 语义 分析 " 的 , 而 并 不 具有 上下文 的 " 语义 分析 " 能力 。 作为 一个 处理 可变 长度 文本 的 总结性 方法 , QuocLe 和 TomasMikolov 提出 了 Doc2Vec 方法 。 除了 增加 一个 段落 向量 以外 , 这个 方法 几乎 等同于 Word2Vec 。 和 Word2Vec 一样 , 该 模型 也 存在 两种 方法 : DistributedMemory ( DM ) 和 DistributedBagofWords ( DBOW ) 。 DM 试图 在 给定 上下文 和 段落 向量 的 情况 下 预测 单词 的 概率 。 在 一个 句子 或者 文档 的 训练 过程 中 , 段落 ID 保持 不变 , 共享 着 同一个 段落 向量 。 DBOW 则 在 仅 给定 段落 向量 的 情况 下 预测 段落 中 一组 随机 单词 的 概率 。 \ No newline at end of file diff --git a/draft/data/corpus_words/article2.txt b/draft/data/corpus_words/article2.txt new file mode 100644 index 0000000..a34839c --- /dev/null +++ b/draft/data/corpus_words/article2.txt @@ -0,0 +1 @@ +训练 过程 中 新增 了 paragraphid , 即 训练 语料 中 每个 句子 都 有 一个 唯一 的 id 。 paragraphid 和 普通 的 word 一样 , 也 是 先 映射 成 一个 向量 , 即 paragraphvector 。 paragraphvector 与 wordvector 的 维数 虽 一样 , 但是 来自 于 两个 不同 的 向量 空间 。 在 之后 的 计算 里 , paragraphvector 和 wordvector 累加 或者 连接起来 , 作为 输出 层 softmax 的 输入 。 在 一个 句子 或者 文档 的 训练 过程 中 , paragraphid 保持 不变 , 共享 着 同一个 paragraphvector , 相当于 每次 在 预测 单词 的 概率 时 , 都 利用 了 整个 句子 的 语义 。 \ No newline at end of file diff --git a/draft/doc2vector.py b/draft/doc2vector.py new file mode 100644 index 0000000..0492268 --- /dev/null +++ b/draft/doc2vector.py @@ 
-0,0 +1,203 @@ +# -*- coding: utf-8 -*- +""" +Created on Sun Jan 27 14:20:16 2019 + +@author: liudiwei +""" + +# -*- coding: utf-8 -*- +import codecs +import re +from os import listdir + +import gensim +import jieba +import numpy as np +import pandas as pd +import os + +def doc_segment(doc_path, corpus_path): + """save word segment + """ + # 先把所有文档的路径存进一个 array 中,docLabels: + doc_lists = [file for file in listdir(doc_path) if file.endswith('.txt')] + + for doc in doc_lists: + try: + ws = codecs.open(doc_path + "/" + doc, encoding="utf8").read() + doc_words = segment(ws) + if not os.path.exists(corpus_path): + os.mkdir(corpus_path) + with codecs.open(corpus_path + "/{}".format(doc), "a", encoding="UTF-8") as f: + f.write(" ".join(doc_words)) + except: + print(doc) + +def segment(doc: str, stopword_file=None): + """中文分词 + parameter: + doc : str, input text + return: + [type] --- [description] + """ + # 停用词 + if stopword_file != None: + stop_words = pd.read_csv(stopword_file, + index_col=False, + quoting=3, + names=['stopword'], + sep="\n", + encoding='utf-8') + + stop_words = list(stop_words.stopword) + else: + stop_words = [] + reg_html = re.compile(r'<[^>]+>', re.S) + doc = reg_html.sub('', doc) + doc = re.sub('[0-9]', '', doc) + doc = re.sub('\s', '', doc) + word_list = list(jieba.cut(doc)) + out_str = '' + for word in word_list: + if word not in stop_words: + out_str += word + out_str += ' ' + segments = out_str.split(sep=" ") + return segments + +def build_corpus(corpus_path): + """build word corpus: list of list + """ + doc_labels = [f for f in os.listdir(corpus_path) if f.endswith('.txt')] + + corpus = [] + for doc in doc_labels: + ws = open(corpus_path + "/" + doc, 'r', encoding='UTF-8').read() + corpus.append(ws) + + print("corpus size: ", len(corpus)) + return corpus, doc_labels + +############################## build model #################################### + +def train_model(corpus, doc_labels, model_path, model_name="doc2vec.model"): + """training model + parameter: + - courpus: [[...], [....]] + - doc_labels: [...] 
+ - model_path + - model_name: default value "doc2vec.model" + return: + - model: model + - model_file: model_path + "/" + model_name + """ + # training doc2vec model and save model to local disk: + sentences = LabeledLineSentence(corpus, doc_labels) + # an empty model + model = gensim.models.Doc2Vec(vector_size=256, + window=10, + min_count=5, + workers=4, + alpha=0.025, + min_alpha=0.025, + epochs=12) + model.build_vocab(sentences) + + print("start training...") + model.train(sentences, total_examples = model.corpus_count, epochs=12) + + if not os.path.exists(model_path): + os.mkdir(model_path) + model_file = model_path + "/" + model_name + model.save(model_file) + print("Model saved") + return model, model_file + +def test_model(model_file, file1, file2): + print("Loading Model.") + model = gensim.models.Doc2Vec.load(model_file) + + sentence1 = open(file1, 'r', encoding='UTF-8').read() + sentence2 = open(file2, 'r', encoding='UTF-8').read() + + # 分词 + print("start to segment") + words1 = segment(sentence1) + words2 = segment(sentence2) + + # 转成句子向量 + vector1 = sent2vec(model, words1) + vector2 = sent2vec(model, words2) + + import sys + print(sys.getsizeof(vector1)) + print(sys.getsizeof(vector2)) + + cos = similarity(vector1, vector2) + print("相似度:{:.4f}".format(cos)) + + +def similarity(a_vect, b_vect): + """计算两个向量余弦值 + parameter: + a_vect {[type]} -- a 向量 + b_vect {[type]} -- b 向量 + + return: + [type] -- [description] + """ + + dot_val = 0.0 + a_norm = 0.0 + b_norm = 0.0 + cos = None + for a, b in zip(a_vect, b_vect): + dot_val += a*b + a_norm += a**2 + b_norm += b**2 + if a_norm == 0.0 or b_norm == 0.0: + cos = -1 + else: + cos = dot_val / ((a_norm*b_norm)**0.5) + + return cos + + +def sent2vec(model, words): + """sentence2vector + parameter: + model {[type]} -- Doc2Vec 模型 + words {[type]} -- 分词后的文本 + return: + [type] -- 向量数组 + """ + vect_list = [] + for w in words: + try: + vect_list.append(model.wv[w]) + except: + continue + vect_list = np.array(vect_list) + vect = vect_list.sum(axis=0) + return vect / np.sqrt((vect ** 2).sum()) + + + +class LabeledLineSentence(object): + def __init__(self, doc_list, labels_list): + self.labels_list = labels_list + self.doc_list = doc_list + + def __iter__(self): + for idx, doc in enumerate(self.doc_list): + yield gensim.models.doc2vec.TaggedDocument(words=doc.split(), tags=[self.labels_list[idx]]) + + +if __name__ == '__main__': + doc_path = "./data/" + corpus_path = "data/corpus_words" + model_path = "./models" + #doc_segment(data_dir) + corpus, doc_labels = build_corpus(corpus_path) + model, model_file = train_model(corpus, doc_labels, model_path) + test_model(model_file, './data/corpus_test/t2.txt', './data/corpus_test/t1.txt') From 053a428e5d5c7d171b05096b47ed50be87142da6 Mon Sep 17 00:00:00 2001 From: csuldw Date: Sun, 27 Jan 2019 23:32:13 +0800 Subject: [PATCH 03/17] update readme --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index a74bc1b..f9e0e34 100644 --- a/README.md +++ b/README.md @@ -38,10 +38,9 @@ ## Contributor -- 刘帝伟, 中南大学2014级硕士,[HomePage](http://www.csuldw.com). +- 刘帝伟, 中南大学14级硕士,关注AI、机器学习、深度学习方向,[HomePage](http://www.csuldw.com). 
## Contact -- QQ: 466454368 - E-mail: csu.ldw@csu.edu.cn From f0985fc6125abefdb4888e24e411ec728a2848ac Mon Sep 17 00:00:00 2001 From: csuldw Date: Sun, 27 Jan 2019 23:44:39 +0800 Subject: [PATCH 04/17] add fm_demo --- draft/jupyter/fm_demo.ipynb | 555 ++++++++++++++++++++++++++++++++++++ 1 file changed, 555 insertions(+) create mode 100644 draft/jupyter/fm_demo.ipynb diff --git a/draft/jupyter/fm_demo.ipynb b/draft/jupyter/fm_demo.ipynb new file mode 100644 index 0000000..f288396 --- /dev/null +++ b/draft/jupyter/fm_demo.ipynb @@ -0,0 +1,555 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.chdir(r\"F:\\CSU\\Github\\nlp_exp\\fm_exp\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from itertools import count \n", + "from collections import defaultdict\n", + "from scipy.sparse import csr \n", + "from __future__ import print_function \n", + "\n", + "def vectorize_dic(dic, ix=None, p=None):\n", + " \"\"\" \n", + " Creates a scipy csr matrix from a list of lists (each inner list is a set of values corresponding to a feature) \n", + " \n", + " parameters:\n", + " -----------\n", + " dic -- dictionary of feature lists. Keys are the name of features\n", + " ix -- index generator (default None)\n", + " p -- dimension of featrure space (number of columns in the sparse matrix) (default None)\n", + " \"\"\"\n", + " if (ix == None):\n", + " d = count(0)\n", + " ix = defaultdict(lambda: next(d)) \n", + " \n", + " n = len(list(dic.values())[0]) # num samples\n", + " g = len(list(dic.keys())) # num groups\n", + " nz = n * g # number of non-zeros\n", + "\n", + " col_ix = np.empty(nz, dtype=int) \n", + " \n", + " i = 0\n", + " for k, lis in dic.items(): \n", + " # append index el with k in order to prevet mapping different columns with same id to same index\n", + " col_ix[i::g] = [ix[str(el) + str(k)] for el in lis]\n", + " i += 1\n", + " \n", + " row_ix = np.repeat(np.arange(0, n), g) \n", + " data = np.ones(nz)\n", + " \n", + " if (p == None):\n", + " p = len(ix)\n", + " \n", + " ixx = np.where(col_ix < p)\n", + "\n", + " return csr.csr_matrix((data[ixx],(row_ix[ixx], col_ix[ixx])), shape=(n, p)), ix" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 加载数据集" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " user item rating timestamp\n", + "0 1 1 5 874965758\n", + "1 1 2 3 876893171\n", + "2 1 3 4 878542960\n", + "3 1 4 3 876893119\n", + "4 1 5 3 889751712\n", + "5 1 6 5 887431973\n", + "6 1 7 4 875071561\n", + "7 1 8 1 875072484\n", + "8 1 9 5 878543541\n", + "9 1 10 3 875693118" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.feature_extraction import DictVectorizer\n", + "\n", + "# laod data with pandas\n", + "cols = ['user', 'item', 'rating', 'timestamp']\n", + "train = pd.read_csv('data/ua.base', delimiter='\\t', names=cols)\n", + "test = pd.read_csv('data/ua.test', delimiter='\\t', names=cols)\n", + "train.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### vectorize data and convert them to csr matrix" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, ix = vectorize_dic({'users': train.user.values, 'items': train.item.values})\n", + "X_test, ix = vectorize_dic({'users': test.user.values, 'items': test.item.values}, ix, X_train.shape[1])\n", + "y_train = train.rating.values\n", + "y_test= test.rating.values" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(90570, 2623)\n", + "(9430, 2623)\n" + ] + } + ], + "source": [ + "X_train = X_train.todense()\n", + "X_test = X_test.todense()\n", + "\n", + "# print shape of data\n", + "print(X_train.shape)\n", + "print(X_test.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define FM Model with tensorflow" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "\n", + "#get x_train shape\n", + "n, p = X_train.shape\n", + "\n", + "# number of latent factors\n", + "k = 10\n", + "\n", + "# design matrix\n", + "X = tf.placeholder('float', shape=[None, p])\n", + "# target vector\n", + "y = tf.placeholder('float', shape=[None, 1])\n", + "\n", + "# bias and weights\n", + "w0 = tf.Variable(tf.zeros([1]))\n", + "W = tf.Variable(tf.zeros([p]))\n", + "\n", + "# interaction factors, randomly initialized \n", + "V = tf.Variable(tf.random_normal([k, p], stddev=0.01))\n", + "\n", + "# estimate of y, initialized to 0.\n", + "y_hat = tf.Variable(tf.zeros([n, 1]))" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/latex": [ + "$$\\hat{y}(\\mathbf{x}) = w_0 + \\sum_{j=1}^{p}w_jx_j + \\frac{1}{2} \\sum_{f=1}^{k} ((\\sum_{j=1}^{p}v_{j,f}x_j)^2-\\sum_{j=1}^{p}v_{j,f}^2 x_j^2)$$" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from IPython.display import display, Math, Latex\n", + "\n", + "display(Math(r'\\hat{y}(\\mathbf{x}) = w_0 + \\sum_{j=1}^{p}w_jx_j + \\frac{1}{2} \\sum_{f=1}^{k} ((\\sum_{j=1}^{p}v_{j,f}x_j)^2-\\sum_{j=1}^{p}v_{j,f}^2 x_j^2)'))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# Calculate output with FM equation\n", + "linear_terms = tf.add(w0, tf.reduce_sum(tf.multiply(W, X), 1, keep_dims=True))\n", + "pair_interactions = (tf.multiply(0.5,\n", + " tf.reduce_sum(\n", + " tf.subtract(\n", + " tf.pow( tf.matmul(X, tf.transpose(V)), 2),\n", + " 
tf.matmul(tf.pow(X, 2), tf.transpose(tf.pow(V, 2)))),\n", + " 1, keep_dims=True)))\n", + "y_hat = tf.add(linear_terms, pair_interactions)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/latex": [ + "$$L = \\sum_{i=1}^{n} (y_i - \\hat{y}_i)^2 + \\lambda_w ||W||^2 + \\lambda_v ||V||^2$$" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(Math(r'L = \\sum_{i=1}^{n} (y_i - \\hat{y}_i)^2 + \\lambda_w ||W||^2 + \\lambda_v ||V||^2'))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "# L2 regularized sum of squares loss function over W and V\n", + "lambda_w = tf.constant(0.001, name='lambda_w')\n", + "lambda_v = tf.constant(0.001, name='lambda_v')\n", + "\n", + "l2_norm = (tf.reduce_sum(\n", + " tf.add(\n", + " tf.multiply(lambda_w, tf.pow(W, 2)),\n", + " tf.multiply(lambda_v, tf.pow(V, 2)))))\n", + "\n", + "error = tf.reduce_mean(tf.square(tf.subtract(y, y_hat)))\n", + "loss = tf.add(error, l2_norm)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/latex": [ + "$$\\Theta_{i+1} = \\Theta_{i} - \\eta \\frac{\\delta L}{\\delta \\Theta}$$" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(Math(r'\\Theta_{i+1} = \\Theta_{i} - \\eta \\frac{\\delta L}{\\delta \\Theta}'))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(loss)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "def batcher(X_, y_=None, batch_size=-1):\n", + " n_samples = X_.shape[0]\n", + "\n", + " if batch_size == -1:\n", + " batch_size = n_samples\n", + " if batch_size < 1:\n", + " raise ValueError('Parameter batch_size={} is unsupported'.format(batch_size))\n", + "\n", + " for i in range(0, n_samples, batch_size):\n", + " upper_bound = min(i + batch_size, n_samples)\n", + " ret_x = X_[i:upper_bound]\n", + " ret_y = None\n", + " if y_ is not None:\n", + " ret_y = y_[i:i + batch_size]\n", + " yield (ret_x, ret_y)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a8247d26eae64d09b463eab3875f13c5", + "version_major": 2, + "version_minor": 0 + }, + "text/html": [ + "

\n" + ], + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=10), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "from tqdm import tqdm_notebook as tqdm\n", + "\n", + "epochs = 10\n", + "batch_size = 1000\n", + "\n", + "# Launch the graph\n", + "init = tf.global_variables_initializer()\n", + "sess = tf.Session()\n", + "\n", + "sess.run(init)\n", + "\n", + "for epoch in tqdm(range(epochs), unit='epoch'):\n", + " perm = np.random.permutation(X_train.shape[0])\n", + " # iterate over batches\n", + " for bX, bY in batcher(X_train[perm], y_train[perm], batch_size):\n", + " sess.run(optimizer, feed_dict={X: bX.reshape(-1, p), y: bY.reshape(-1, 1)})" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.113785\n" + ] + } + ], + "source": [ + "errors = []\n", + "for bX, bY in batcher(X_test, y_test):\n", + " errors.append(sess.run(error, feed_dict={X: bX.reshape(-1, p), y: bY.reshape(-1, 1)}))\n", + "\n", + "RMSE = np.sqrt(np.array(errors).mean())\n", + "print(RMSE)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "sess.close()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[1.2405171]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "errors" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 0f1a7ff8bbebdc925462e37b67e927d23e555a02 Mon Sep 17 00:00:00 2001 From: csuldw Date: Mon, 28 Jan 2019 00:25:48 +0800 Subject: [PATCH 05/17] add pyfm_demo --- draft/pyfm_demo.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 draft/pyfm_demo.py diff --git a/draft/pyfm_demo.py b/draft/pyfm_demo.py new file mode 100644 index 0000000..5c22b1a --- /dev/null +++ b/draft/pyfm_demo.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +""" +Created on Sun Jan 27 23:49:24 2019 +pyfm安装:将https://github.com/coreylynch/pyFM 下载到本地,去掉setup.py里面的 +libraries=["m"],然后安装即可. 
+@author: liudiwei +""" +from sklearn.datasets import load_iris +from sklearn.model_selection import train_test_split +from pyfm import pylibfm +from sklearn.feature_extraction import DictVectorizer + +iris_data = load_iris() +X = iris_data['data'] +y = iris_data['target'] == 2 +data = [ {v: k for k, v in dict(zip(i, range(len(i)))).items()} for i in X] + +X_train,X_test,y_train, y_test = train_test_split(data,y,test_size=0.3,random_state=0) + +v = DictVectorizer() +X_train = v.fit_transform(X_train) +X_test = v.transform(X_test) + +fm = pylibfm.FM(num_factors=50, + num_iter=1000, + verbose=True, + task="classification", + initial_learning_rate=0.0001, + learning_rate_schedule="optimal") + +fm.fit(X_train, y_train) + + +y_preds = fm.predict(X_test) +from sklearn.metrics import log_loss +print ("Validation log loss: %.4f" % log_loss(y_test, y_preds)) + From 49cb0eca8b5c881ddf6f80d3910edf47a173bc4d Mon Sep 17 00:00:00 2001 From: csuldw Date: Mon, 28 Jan 2019 23:56:34 +0800 Subject: [PATCH 06/17] update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f9e0e34..6ed389f 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ ## MachineLearning -机器学习算法代码及个人总结整理,对于算法实现部分,在相应目录中都包含有源码和数据以及测试实例,内容正在不断完善中!如有错误,望不吝指教。PS:所有代码均符合我们整理出来的这份[编码规范](https://github.com/csuldw/MachineLearning/blob/master/Python-coding-standards.md). +机器学习算法代码及个人总结整理,对于算法实现部分,在相应目录中都包含有源码和数据以及测试实例,内容正在不断完善中!如有错误,还望读者指出,非常感谢,若您觉得对你有帮助,可以在右上角给个star哈(#^.^#)。PS:所有代码均符合我们整理出来的这份[编码规范](https://github.com/csuldw/MachineLearning/blob/master/Python-coding-standards.md). ## Contents From baacdbe9d9a17b302b0ef177e53b4b2a89bee4ed Mon Sep 17 00:00:00 2001 From: csuldw Date: Sun, 17 Feb 2019 00:36:01 +0800 Subject: [PATCH 07/17] add rs --- {draft => Recommendation System}/data_process/user_keywords.csv | 0 {draft => Recommendation System}/recommend.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename {draft => Recommendation System}/data_process/user_keywords.csv (100%) rename {draft => Recommendation System}/recommend.py (100%) diff --git a/draft/data_process/user_keywords.csv b/Recommendation System/data_process/user_keywords.csv similarity index 100% rename from draft/data_process/user_keywords.csv rename to Recommendation System/data_process/user_keywords.csv diff --git a/draft/recommend.py b/Recommendation System/recommend.py similarity index 100% rename from draft/recommend.py rename to Recommendation System/recommend.py From b89e2af410e7cc86303f89eaa037b48564219c19 Mon Sep 17 00:00:00 2001 From: csuldw Date: Sun, 17 Feb 2019 19:32:31 +0800 Subject: [PATCH 08/17] move pyFM demo --- Recommendation System/pyfm_demo.py | 47 ++++++++++++++++++++++++++++++ draft/pyfm_demo.py | 37 ----------------------- 2 files changed, 47 insertions(+), 37 deletions(-) create mode 100644 Recommendation System/pyfm_demo.py delete mode 100644 draft/pyfm_demo.py diff --git a/Recommendation System/pyfm_demo.py b/Recommendation System/pyfm_demo.py new file mode 100644 index 0000000..3787809 --- /dev/null +++ b/Recommendation System/pyfm_demo.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +""" +Created on Sun Jan 27 23:49:24 2019 + +pyfm install:将https://github.com/coreylynch/pyFM 下载到本地,去掉setup.py里面的 +libraries=["m"],然后安装即可. 
+ +@author: liudiwei +""" +from sklearn.datasets import load_iris +from sklearn.model_selection import train_test_split +from pyfm import pylibfm +from sklearn.feature_extraction import DictVectorizer + +def load_data(): + """ + 调用sklearn的iris数据集,筛选正负样本并构造切分训练测试数据集 + """ + iris_data = load_iris() + X = iris_data['data'] + y = iris_data['target'] == 2 + data = [ {v: k for k, v in dict(zip(i, range(len(i)))).items()} for i in X] + + X_train,X_test,y_train, y_test = train_test_split(data, y, test_size=0.3, random_state=0) + return X_train,X_test,y_train, y_test + +X_train,X_test,y_train, y_test = load_data() + +v = DictVectorizer() +X_train = v.fit_transform(X_train) +X_test = v.transform(X_test) + +fm = pylibfm.FM(num_factors=100, + num_iter=200, + verbose=True, + task="classification", + initial_learning_rate=0.001, + learning_rate_schedule="optimal") + +fm.fit(X_train, y_train) + +y_preds = fm.predict(X_test) +y_preds_label = y_preds > 0.5 +from sklearn.metrics import log_loss,accuracy_score +print ("Validation log loss: %.4f" % log_loss(y_test, y_preds)) +print ("accuracy: %.4f" % accuracy_score(y_test, y_preds_label)) + diff --git a/draft/pyfm_demo.py b/draft/pyfm_demo.py deleted file mode 100644 index 5c22b1a..0000000 --- a/draft/pyfm_demo.py +++ /dev/null @@ -1,37 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Sun Jan 27 23:49:24 2019 -pyfm安装:将https://github.com/coreylynch/pyFM 下载到本地,去掉setup.py里面的 -libraries=["m"],然后安装即可. -@author: liudiwei -""" -from sklearn.datasets import load_iris -from sklearn.model_selection import train_test_split -from pyfm import pylibfm -from sklearn.feature_extraction import DictVectorizer - -iris_data = load_iris() -X = iris_data['data'] -y = iris_data['target'] == 2 -data = [ {v: k for k, v in dict(zip(i, range(len(i)))).items()} for i in X] - -X_train,X_test,y_train, y_test = train_test_split(data,y,test_size=0.3,random_state=0) - -v = DictVectorizer() -X_train = v.fit_transform(X_train) -X_test = v.transform(X_test) - -fm = pylibfm.FM(num_factors=50, - num_iter=1000, - verbose=True, - task="classification", - initial_learning_rate=0.0001, - learning_rate_schedule="optimal") - -fm.fit(X_train, y_train) - - -y_preds = fm.predict(X_test) -from sklearn.metrics import log_loss -print ("Validation log loss: %.4f" % log_loss(y_test, y_preds)) - From 95802acb59603ac1bc3885e3db7d50715e58bca3 Mon Sep 17 00:00:00 2001 From: csuldw Date: Sun, 17 Feb 2019 19:39:07 +0800 Subject: [PATCH 09/17] update pyFM demo --- Recommendation System/pyfm_demo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Recommendation System/pyfm_demo.py b/Recommendation System/pyfm_demo.py index 3787809..1ea5d9e 100644 --- a/Recommendation System/pyfm_demo.py +++ b/Recommendation System/pyfm_demo.py @@ -30,8 +30,8 @@ def load_data(): X_train = v.fit_transform(X_train) X_test = v.transform(X_test) -fm = pylibfm.FM(num_factors=100, - num_iter=200, +fm = pylibfm.FM(num_factors=1, + num_iter=500, verbose=True, task="classification", initial_learning_rate=0.001, From 66bffed11fadef19b2d4996c5ef01b8de31cd51c Mon Sep 17 00:00:00 2001 From: csuldw Date: Sun, 17 Feb 2019 20:02:01 +0800 Subject: [PATCH 10/17] update pyFM demo --- Recommendation System/pyfm_demo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Recommendation System/pyfm_demo.py b/Recommendation System/pyfm_demo.py index 1ea5d9e..7c634f5 100644 --- a/Recommendation System/pyfm_demo.py +++ b/Recommendation System/pyfm_demo.py @@ -30,8 +30,8 @@ def load_data(): X_train = 
v.fit_transform(X_train) X_test = v.transform(X_test) -fm = pylibfm.FM(num_factors=1, - num_iter=500, +fm = pylibfm.FM(num_factors=2, + num_iter=200, verbose=True, task="classification", initial_learning_rate=0.001, From 897267b8609d6bc6b5815a3eb80c5f9f8ccb197f Mon Sep 17 00:00:00 2001 From: csuldw Date: Sun, 1 Sep 2019 14:26:19 +0800 Subject: [PATCH 11/17] add wx photo --- .DS_Store | Bin 0 -> 6148 bytes README.md | 8 +++++++- ...xperiments with a New Boosting Algorithm.pdf} | Bin 3 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 .DS_Store rename doc/{boostingexperiments.pdf => 1996 Experiments with a New Boosting Algorithm.pdf} (100%) diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..bffc98b573f7e579f5881366afafadada05bde95 GIT binary patch literal 6148 zcmeHK%}N6?5T3ME)2i5mpx*Z4tq0eC55iLG!JDw62Nm75i!RiSbhj3*m3!kIdeM-HUn%KAZ1yWFRDwu-iEtXb5?h@`!%QMx?9*;U)${W%W`4a-95OxeR%9W^`Bo1%>utj zEprwJ@QTKdJHGeZNlPWy7&FKp%m6bmJ`C6!PEL={mUt)305kAQ4AA}Hpb~l( zbA$Toz=nPwX}m;8f;PP+2px-_#oQo{pa_$SXi|lJVhEFte#gdn7IT9p9fV#P=dmjb z`$7?Vb@V$r9E4|(TV{Y67-pbox>c(G`#-<`hm*L+3@`&@#egU^{6-zOWNYil=BU;> tsF$cD6qg(POhH4JVvMCyyojm={SFz3p2gfCdQkXBK-0htGw`Pjd;qX|ReS&d literal 0 HcmV?d00001 diff --git a/README.md b/README.md index 6ed389f..b158257 100644 --- a/README.md +++ b/README.md @@ -38,9 +38,15 @@ ## Contributor -- 刘帝伟, 中南大学14级硕士,关注AI、机器学习、深度学习方向,[HomePage](http://www.csuldw.com). +- 刘帝伟, CSU硕士毕业,关注AI、机器学习、深度学习方向,[HomePage](http://www.csuldw.com). ## Contact +如果有任何疑问,可在我的微信公众号后台留言: + +![](http://www.csuldw.com/assets/articleImg/2019/code-main-fun.png) + +或是发邮件吧: + - E-mail: csu.ldw@csu.edu.cn diff --git a/doc/boostingexperiments.pdf b/doc/1996 Experiments with a New Boosting Algorithm.pdf similarity index 100% rename from doc/boostingexperiments.pdf rename to doc/1996 Experiments with a New Boosting Algorithm.pdf From f9cf3f2582d1bfbb18dcc25bc1b1d1a67822a8e9 Mon Sep 17 00:00:00 2001 From: csuldw Date: Sun, 1 Sep 2019 14:28:33 +0800 Subject: [PATCH 12/17] add wx photo --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index b158257..5b452ae 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,9 @@ ![](http://www.csuldw.com/assets/articleImg/2019/code-main-fun.png) + +
Fig 1:推荐系统整体结构.
+ 或是发邮件吧: - E-mail: csu.ldw@csu.edu.cn From 44cf0c0f576e550ff6f17ba854ff71c82f9b4ebb Mon Sep 17 00:00:00 2001 From: csuldw Date: Sun, 1 Sep 2019 14:29:40 +0800 Subject: [PATCH 13/17] add wx photo --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 5b452ae..0f9f865 100644 --- a/README.md +++ b/README.md @@ -45,10 +45,10 @@ 如果有任何疑问,可在我的微信公众号后台留言: -![](http://www.csuldw.com/assets/articleImg/2019/code-main-fun.png) - + +
-
Fig 1:推荐系统整体结构.
+
或是发邮件吧: From 97c5aa0811e58f8ff635797b2382fa9949f01491 Mon Sep 17 00:00:00 2001 From: csuldw Date: Sun, 1 Sep 2019 14:31:08 +0800 Subject: [PATCH 14/17] add wx photo --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0f9f865..a47b09c 100644 --- a/README.md +++ b/README.md @@ -46,9 +46,10 @@ 如果有任何疑问,可在我的微信公众号后台留言: -
+ +
-
+ 或是发邮件吧: From a93595a55f860fcc91acbc35d40836d275cdcee7 Mon Sep 17 00:00:00 2001 From: csuldw Date: Sun, 1 Sep 2019 14:32:12 +0800 Subject: [PATCH 15/17] add wx photo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a47b09c..7831611 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@
- +
或是发邮件吧: From 9a5daa063034878b7fd7c594a48aed0fc8a23606 Mon Sep 17 00:00:00 2001 From: csuldw Date: Wed, 16 Oct 2019 00:32:12 +0800 Subject: [PATCH 16/17] add stacking model draft --- draft/stacking.py | 181 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 181 insertions(+) create mode 100644 draft/stacking.py diff --git a/draft/stacking.py b/draft/stacking.py new file mode 100644 index 0000000..915d778 --- /dev/null +++ b/draft/stacking.py @@ -0,0 +1,181 @@ +from sklearn.model_selection import KFold +from sklearn.model_selection import train_test_split +from sklearn.datasets import load_digits +import numpy as np +from sklearn.svm import SVC +from sklearn import metrics +from sklearn.ensemble import RandomForestClassifier +from sklearn import preprocessing +import pandas as pd +from functools import reduce +from sklearn.metrics import confusion_matrix, classification_report + +class StackingClassifier(object): + + def __init__(self, modellist=[], meta_classifier=None): + self.modellist = modellist + if meta_classifier == None: + from sklearn.linear_model import LogisticRegression + meta_classifier = LogisticRegression() + self.meta_classifier = meta_classifier + + def SelectModel(self, modelname): + + if modelname == "SVM": + from sklearn.svm import SVC + model = SVC(kernel='rbf', C=16, gamma=0.125,probability=True) + + elif modelname == "lr": + from sklearn.linear_model import LogisticRegression + model = LogisticRegression() + + elif modelname == "GBDT": + from sklearn.ensemble import GradientBoostingClassifier + model = GradientBoostingClassifier() + + elif modelname == "RF": + from sklearn.ensemble import RandomForestClassifier + model = RandomForestClassifier() + + elif modelname == "xgboost": + from xgboost import XGBClassifier + model = XGBClassifier( + learning_rate=0.01, + n_estimators=1000, + max_depth=4, + min_child_weight=3, + gamma=0.1, + subsample=0.8, + colsample_bytree=0.8, + reg_alpha=1, + objective='binary:logistic', #multi:softmax + nthread=8, + scale_pos_weight=1, + seed=27, + random_state=27 + ) + elif modelname == "KNN": + from sklearn.neighbors import KNeighborsClassifier as knn + model = knn() + + elif modelname == "MNB": + from sklearn.naive_bayes import MultinomialNB + model = MultinomialNB() + else: + pass + return model + + def get_oof(self, clf, n_folds, X_train, y_train, X_test): + ntrain = X_train.shape[0] + ntest = X_test.shape[0] + print("kfolds: ", ntrain, ntest) + classnum = len(np.unique(y_train)) + kf = KFold(n_splits=n_folds,random_state=1) + oof_train = np.zeros((ntrain,classnum)) + oof_test = np.zeros((ntest,classnum)) + + for i,(train_index, test_index) in enumerate(kf.split(X_train)): + kf_X_train = X_train[train_index] # 数据 + kf_y_train = y_train[train_index] # 标签 + + kf_X_test = X_train[test_index] # k-fold的验证集 + + clf.fit(kf_X_train, kf_y_train) + oof_train[test_index] = clf.predict_proba(kf_X_test) + # print("shape of oof_train:", oof_train[test_index].shape) + + print("fold{i}: oof_train: {a}, oof_test:{b}".format(i=i, a=oof_train.shape, b=oof_test.shape)) + oof_test += clf.predict_proba(X_test) + oof_test = oof_test/float(n_folds) + print("oof_train: {a}, oof_test:{b}".format(a=oof_train.shape, b=oof_test.shape)) + return oof_train, oof_test + + def first_layer(self, X_train, y_train, X_test, modellist=None): + """modellist 需要重新修改 + """ + newfeature_list = [] + newtestdata_list = [] + for modelname in self.modellist: + sub_clf = self.SelectModel(modelname) + oof_train_, oof_test_= self.get_oof(clf=sub_clf, + n_folds=5, + 
X_train=X_train, + y_train=y_train, + X_test=X_test) + print("oof_train: ", oof_train_.shape) + print("model-{}".format(modelname),len(oof_train_), len(oof_test_)) + newfeature_list.append(oof_train_) + print("newfeature_list: ", len(newfeature_list)) + newtestdata_list.append(oof_test_) + + # 特征组合 + X_train_stacking = reduce(lambda x,y:np.concatenate((x,y),axis=1),newfeature_list) + X_test_stacking = reduce(lambda x,y:np.concatenate((x,y),axis=1),newtestdata_list) + + return X_train_stacking, X_test_stacking + + def fit(self, X_train, y_train, clf=None): + if clf != None: + self.meta_classifier = clf + self.meta_classifier.fit(X_train, y_train) + return self.meta_classifier + + #second_layer + def second_layer(self, X_train, y_train, clf=None): + return self.fit(X_train, y_train, clf) + + def predict(self, X_test, clf=None, type="label"): + if clf == None: + clf = self.meta_classifier + if type == "proba": + return clf.predict_proba(X_test) + elif type == "label": + return clf.predict(X_test) + + def get_accuracy(self, y_true, y_pred): + accuracy = metrics.accuracy_score(y_true, y_pred)*100 + return accuracy + + def performance(self, y_true, y_pred): + accuracy = self.get_accuracy(y_true, y_pred) + confusion = confusion_matrix(y_true, y_pred) + report = classification_report(y_true, y_pred) + print("多模型融合预测accuracy:{}".format(accuracy)) + print("混淆矩阵:\n{}".format(confusion)) + print("预测结果:\n{}".format(report)) + return confusion, report + + +# 使用stacking方法的时候 +# 第一级,重构特征当做第二级的训练集 +if __name__ == "__main__": + # 导入数据集切割训练与测试数据 + data = load_digits() + data_D = preprocessing.StandardScaler().fit_transform(data.data) + data_L = data.target + X_train, X_test, y_train, y_test = train_test_split(data_D,data_L,random_state=100,test_size=0.7) + print(set(y_train)) + + # 单纯使用一个分类器的时候 + clf_meta = RandomForestClassifier() + clf_meta.fit(X_train, y_train) + pred = clf_meta.predict(X_test) + accuracy = metrics.accuracy_score(y_test, pred)*100 + print ("====================", accuracy) + # 91.0969793323 + + #layer 1:多模型融合 + modelist = ['SVM', 'GBDT', 'RF', 'KNN'] + stacking_clf = StackingClassifier(modelist) + X_train_stacking, X_test_stacking = stacking_clf.first_layer(X_train, y_train, X_test) + print("shape of X_train_stacking {}".format(X_train_stacking.shape)) + print("shape of X_test_stacking {}".format(X_test_stacking.shape)) + + #layer 2: 单模型训练 + RF = stacking_clf.SelectModel(modelname="RF") + clf = stacking_clf.second_layer(X_train_stacking, y_train, clf=RF) + pred = stacking_clf.predict(X_test_stacking) + + #模型评估 + stacking_clf.performance(y_test, pred) + # 96.4228934817 From 5f1569ad6a3ca015b8c07f76a19e8b44462bb6cd Mon Sep 17 00:00:00 2001 From: csuldw Date: Sat, 19 Oct 2019 21:52:26 +0800 Subject: [PATCH 17/17] add stacking --- .DS_Store | Bin 6148 -> 6148 bytes stacking/stacking.py | 241 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 241 insertions(+) create mode 100644 stacking/stacking.py diff --git a/.DS_Store b/.DS_Store index bffc98b573f7e579f5881366afafadada05bde95..bc267768983cb470703ff7c8ba16d687b7b1fb40 100644 GIT binary patch delta 51 zcmZoMXfc@J&&a$nU^gQp^W=X_GMhD+7#RgQ7>XH67!nzh8L}BN8S)s?H%l>}W!cQm H@s}R}WbzGn delta 38 ucmZoMXfc@J&&aefU^nAr0}+
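The hand-rolled two-layer flow in stacking/stacking.py (out-of-fold predict_proba features from the first-layer models, then a single second-layer learner) can be cross-checked against scikit-learn's built-in StackingClassifier. The sketch below is an illustration only and is not part of the patch series above: it assumes scikit-learn >= 0.22 (where StackingClassifier was added) and reuses the same digits dataset, the same four base learners from SelectModel(), and a random-forest meta-learner, mirroring the __main__ block of stacking.py.

# Minimal cross-check of the stacking approach with sklearn's StackingClassifier.
# Assumption: scikit-learn >= 0.22; hyperparameters mirror SelectModel() in stacking.py.
from sklearn import metrics, preprocessing
from sklearn.datasets import load_digits
from sklearn.ensemble import (GradientBoostingClassifier, RandomForestClassifier,
                              StackingClassifier)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

data = load_digits()
X = preprocessing.StandardScaler().fit_transform(data.data)
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100, test_size=0.7)

# First layer: the same four base learners used in first_layer(); their
# out-of-fold class probabilities become the features of the second layer.
base_learners = [
    ("svm", SVC(kernel="rbf", C=16, gamma=0.125, probability=True)),
    ("gbdt", GradientBoostingClassifier()),
    ("rf", RandomForestClassifier()),
    ("knn", KNeighborsClassifier()),
]

# Second layer: a random forest trained on the stacked probability features,
# built from 5-fold out-of-fold predictions -- the same split as get_oof(n_folds=5).
stack = StackingClassifier(estimators=base_learners,
                           final_estimator=RandomForestClassifier(),
                           stack_method="predict_proba",
                           cv=5)
stack.fit(X_train, y_train)
pred = stack.predict(X_test)
print("stacking accuracy: %.4f" % metrics.accuracy_score(y_test, pred))

Using stack_method="predict_proba" matches what get_oof() feeds into the second layer; the printed accuracy should land in the same ballpark as the ~96% noted at the end of stacking.py, though exact numbers vary with the random splits and forest seeds.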