From 25d15a06d0dd20807a9a2dfd9908a7dbc4450ab7 Mon Sep 17 00:00:00 2001 From: MRHC <1932405808@qq.com> Date: Sun, 28 Jan 2018 15:59:59 +0800 Subject: [PATCH 01/17] DBSCAN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 添加 DBSCAN的一种实现 --- DBSCAN/dbscan.py | 201 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 201 insertions(+) create mode 100644 DBSCAN/dbscan.py diff --git a/DBSCAN/dbscan.py b/DBSCAN/dbscan.py new file mode 100644 index 0000000..3efc858 --- /dev/null +++ b/DBSCAN/dbscan.py @@ -0,0 +1,201 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# 实现了DBSCAN算法 + +__author__ = 'ZYC@BUPT' +import jieba +import os +import sys +import json +jieba.load_userdict("newword.dict") +sys.setdefaultencoding("utf-8") +from sklearn import feature_extraction +from sklearn.feature_extraction.text import TfidfTransformer +from sklearn.feature_extraction.text import CountVectorizer +import numpy as np +import matplotlib.pyplot as plt +import math +import time +UNCLASSIFIED = False +NOISE = 0 + +def Test2(rootDir): + for lists in os.listdir(rootDir): + path = os.path.join(rootDir, lists) + # print path.decode('gb2312') + if path.find(".txt")!=-1: + Participle(path) + if os.path.isdir(path): + Test2(path) + +def Participle(path): + try: + fp = open(path, "r") + ad = fp.readline().strip('\n') + na = fp.readline().strip('\n') + ti = fp.readline().strip('\n')#time + si = fp.readline().strip('\n') + cont = na+fp.read() + fp.close() + except IOError: + return 0 + + try: + insi = {} + insi['time'] = ti + print(ti) + insi['url'] = ad + insi['title'] = na + insi['site'] = si#decode("gb2312").encode("utf-8") + global fnum + global segcont + global doc + seg_list = jieba.lcut(cont, cut_all=False) + stline = "" + for word in seg_list: + if ((word in d) is False) and word != '\n': + stline = stline + " " + word + segcont.append(stline) + print (str(fnum) + " 分词") + doc[fnum] = insi + fnum = fnum + 1 + except UnicodeError: + return 0 + +def loadDataSet(splitChar=','): + dataSet = [] + global we + dataSet=we + del we + return dataSet + +def region_query(data, pointId, eps): + nPoints = data.shape[1] + seeds = [] + for i in range(nPoints): + if eps_neighbor(data[:, pointId], data[:, i], eps): + seeds.append(i) + return seeds + +def tstore(clusters,clusterNum):#测试使用 + global doc + fpath="./test_res/" + i=0 + wr=[] + while i<=clusterNum: + path=fpath+str(i)+".txt" + fp=open(path,'w') + wr.append(fp) + i+=1 + i=1 + for cl in clusters: + enstr="" + enstr=doc[i]['title']+doc[i]['url'] + wr[cl].write(enstr+'\n') + i+=1 + +def expand_cluster(data, clusterResult, pointId, clusterId, eps, minPts): + seeds = region_query(data, pointId, eps) + if len(seeds) < minPts: # 不满足minPts条件的为噪声点 + clusterResult[pointId] = NOISE + return False + else: + clusterResult[pointId] = clusterId # 划分到该簇 + for seedId in seeds: + clusterResult[seedId] = clusterId + while len(seeds) > 0: # 扩张 + currentPoint = seeds[0] + queryResults = region_query(data, currentPoint, eps) + if len(queryResults) >= minPts: + for i in range(len(queryResults)): + resultPoint = queryResults[i] + if clusterResult[resultPoint] == UNCLASSIFIED: + seeds.append(resultPoint) + clusterResult[resultPoint] = clusterId + elif clusterResult[resultPoint] == NOISE: + clusterResult[resultPoint] = clusterId + seeds = seeds[1:] + + return True + +def dbscan(data, eps, minPts): + clusterId = 1 + nPoints = data.shape[1] + clusterResult = [UNCLASSIFIED] * nPoints + for pointId in range(nPoints): + # print "point :"+str(pointId) + 
point = data[:, pointId] + if clusterResult[pointId] == UNCLASSIFIED: + if expand_cluster(data, clusterResult, pointId, clusterId, eps, minPts): + clusterId = clusterId + 1 + return clusterResult, clusterId - 1 + + +def eps_neighbor(a, b, eps): + dis=math.sqrt(np.power(a - b, 2).sum()) + print(dis) + return dis < eps + +def main(): + dataSet = loadDataSet(splitChar=',') + dataSet = np.mat(dataSet).transpose() + # print(dataSet) + clusters, clusterNum = dbscan(dataSet, 1.37, 5)################################# + print("cluster Numbers = ", clusterNum) + # print(clusters) + #store(clusters, clusterNum) + tstore(clusters, clusterNum) + + +def TFIDF(): + global segcont + global weight + vectorizer = CountVectorizer() + transformer = TfidfTransformer() + tfidf = transformer.fit_transform(vectorizer.fit_transform(segcont)) + word = vectorizer.get_feature_names() # 所有文本的关键字 + weight = tfidf.toarray() # 对应的tfidf矩阵 + del segcont + + seg = [] + for i in range(len(weight)): + enstr = "" + for j in range(len(word)): + if weight[i][j] >= 0.1:##################################### + enstr = enstr + " " + word[j] + seg.append(enstr) + + del weight + vec = CountVectorizer() + tra = TfidfTransformer() + tidf = tra.fit_transform(vec.fit_transform(seg)) + wo = vec.get_feature_names() + we = tidf.toarray() + + global we + +def dbs(): + global fnum,doc,segcont,d + fnum = 1 + segcont = [] + doc = {} + stfp = open("stop.txt", "r") + stcont = stfp.read() + list_a = jieba.lcut(stcont, cut_all=False) + d = set([]) + for li in list_a: + d.add(li) + stfp.close() + Test2('./sou1') + TFIDF() + start = time.clock() + main() + end = time.clock() + print('finish all in %s' % str(end - start)) + +dbs() + + + + From f148034deac0847253b92e283768a3c45b8483ae Mon Sep 17 00:00:00 2001 From: csuldw Date: Sun, 27 Jan 2019 23:22:44 +0800 Subject: [PATCH 02/17] add doc2vector --- draft/contentBasedRec.py | 5 + draft/data/artice1.txt | 2 + draft/data/article2.txt | 1 + draft/data/corpus_test/t1.txt | 2 + draft/data/corpus_test/t2.txt | 1 + draft/data/corpus_words/artice1.txt | 1 + draft/data/corpus_words/article2.txt | 1 + draft/doc2vector.py | 203 +++++++++++++++++++++++++++ 8 files changed, 216 insertions(+) create mode 100644 draft/data/artice1.txt create mode 100644 draft/data/article2.txt create mode 100644 draft/data/corpus_test/t1.txt create mode 100644 draft/data/corpus_test/t2.txt create mode 100644 draft/data/corpus_words/artice1.txt create mode 100644 draft/data/corpus_words/article2.txt create mode 100644 draft/doc2vector.py diff --git a/draft/contentBasedRec.py b/draft/contentBasedRec.py index 5bf333a..55e7b1d 100644 --- a/draft/contentBasedRec.py +++ b/draft/contentBasedRec.py @@ -30,3 +30,8 @@ from sklearn.metrics.pairwise import cosine_similarity cos = cosine_similarity(weight[0:1], weight) recommendations = cos[0].argsort()[-4:][::-1] + + +a = list(weight[0:1][0]) +aaa = pd.DataFrame(weight) +cosine_similarity(np.array([a]), aaa) \ No newline at end of file diff --git a/draft/data/artice1.txt b/draft/data/artice1.txt new file mode 100644 index 0000000..9769e91 --- /dev/null +++ b/draft/data/artice1.txt @@ -0,0 +1,2 @@ +然而,即使上述模型对词向量进行平均处理,我们仍然忽略了单词之间的排列顺序对情感分析的影响。即上述的word2vec只是基于词的维度进行"语义分析"的,而并不具有上下文的"语义分析"能力。 +作为一个处理可变长度文本的总结性方法,Quoc Le 和 Tomas Mikolov 提出了 Doc2Vec方法。除了增加一个段落向量以外,这个方法几乎等同于 Word2Vec。和 Word2Vec 一样,该模型也存在两种方法:Distributed Memory(DM) 和 Distributed Bag of Words(DBOW)。DM 试图在给定上下文和段落向量的情况下预测单词的概率。在一个句子或者文档的训练过程中,段落 ID 保持不变,共享着同一个段落向量。DBOW 则在仅给定段落向量的情况下预测段落中一组随机单词的概率。 \ No newline at end of file diff --git 
a/draft/data/article2.txt b/draft/data/article2.txt new file mode 100644 index 0000000..3d264a8 --- /dev/null +++ b/draft/data/article2.txt @@ -0,0 +1 @@ +训练过程中新增了paragraph id,即训练语料中每个句子都有一个唯一的id。paragraph id和普通的word一样,也是先映射成一个向量,即paragraph vector。paragraph vector与word vector的维数虽一样,但是来自于两个不同的向量空间。在之后的计算里,paragraph vector和word vector累加或者连接起来,作为输出层softmax的输入。在一个句子或者文档的训练过程中,paragraph id保持不变,共享着同一个paragraph vector,相当于每次在预测单词的概率时,都利用了整个句子的语义。 \ No newline at end of file diff --git a/draft/data/corpus_test/t1.txt b/draft/data/corpus_test/t1.txt new file mode 100644 index 0000000..9769e91 --- /dev/null +++ b/draft/data/corpus_test/t1.txt @@ -0,0 +1,2 @@ +然而,即使上述模型对词向量进行平均处理,我们仍然忽略了单词之间的排列顺序对情感分析的影响。即上述的word2vec只是基于词的维度进行"语义分析"的,而并不具有上下文的"语义分析"能力。 +作为一个处理可变长度文本的总结性方法,Quoc Le 和 Tomas Mikolov 提出了 Doc2Vec方法。除了增加一个段落向量以外,这个方法几乎等同于 Word2Vec。和 Word2Vec 一样,该模型也存在两种方法:Distributed Memory(DM) 和 Distributed Bag of Words(DBOW)。DM 试图在给定上下文和段落向量的情况下预测单词的概率。在一个句子或者文档的训练过程中,段落 ID 保持不变,共享着同一个段落向量。DBOW 则在仅给定段落向量的情况下预测段落中一组随机单词的概率。 \ No newline at end of file diff --git a/draft/data/corpus_test/t2.txt b/draft/data/corpus_test/t2.txt new file mode 100644 index 0000000..3d264a8 --- /dev/null +++ b/draft/data/corpus_test/t2.txt @@ -0,0 +1 @@ +训练过程中新增了paragraph id,即训练语料中每个句子都有一个唯一的id。paragraph id和普通的word一样,也是先映射成一个向量,即paragraph vector。paragraph vector与word vector的维数虽一样,但是来自于两个不同的向量空间。在之后的计算里,paragraph vector和word vector累加或者连接起来,作为输出层softmax的输入。在一个句子或者文档的训练过程中,paragraph id保持不变,共享着同一个paragraph vector,相当于每次在预测单词的概率时,都利用了整个句子的语义。 \ No newline at end of file diff --git a/draft/data/corpus_words/artice1.txt b/draft/data/corpus_words/artice1.txt new file mode 100644 index 0000000..4b65fdd --- /dev/null +++ b/draft/data/corpus_words/artice1.txt @@ -0,0 +1 @@ +然而 , 即使 上述 模型 对词 向量 进行 平均 处理 , 我们 仍然 忽略 了 单词 之间 的 排列 顺序 对 情感 分析 的 影响 。 即 上述 的 word2vec 只是 基于 词 的 维度 进行 " 语义 分析 " 的 , 而 并 不 具有 上下文 的 " 语义 分析 " 能力 。 作为 一个 处理 可变 长度 文本 的 总结性 方法 , QuocLe 和 TomasMikolov 提出 了 Doc2Vec 方法 。 除了 增加 一个 段落 向量 以外 , 这个 方法 几乎 等同于 Word2Vec 。 和 Word2Vec 一样 , 该 模型 也 存在 两种 方法 : DistributedMemory ( DM ) 和 DistributedBagofWords ( DBOW ) 。 DM 试图 在 给定 上下文 和 段落 向量 的 情况 下 预测 单词 的 概率 。 在 一个 句子 或者 文档 的 训练 过程 中 , 段落 ID 保持 不变 , 共享 着 同一个 段落 向量 。 DBOW 则 在 仅 给定 段落 向量 的 情况 下 预测 段落 中 一组 随机 单词 的 概率 。 然而 , 即使 上述 模型 对词 向量 进行 平均 处理 , 我们 仍然 忽略 了 单词 之间 的 排列 顺序 对 情感 分析 的 影响 。 即 上述 的 word2vec 只是 基于 词 的 维度 进行 " 语义 分析 " 的 , 而 并 不 具有 上下文 的 " 语义 分析 " 能力 。 作为 一个 处理 可变 长度 文本 的 总结性 方法 , QuocLe 和 TomasMikolov 提出 了 Doc2Vec 方法 。 除了 增加 一个 段落 向量 以外 , 这个 方法 几乎 等同于 Word2Vec 。 和 Word2Vec 一样 , 该 模型 也 存在 两种 方法 : DistributedMemory ( DM ) 和 DistributedBagofWords ( DBOW ) 。 DM 试图 在 给定 上下文 和 段落 向量 的 情况 下 预测 单词 的 概率 。 在 一个 句子 或者 文档 的 训练 过程 中 , 段落 ID 保持 不变 , 共享 着 同一个 段落 向量 。 DBOW 则 在 仅 给定 段落 向量 的 情况 下 预测 段落 中 一组 随机 单词 的 概率 。 \ No newline at end of file diff --git a/draft/data/corpus_words/article2.txt b/draft/data/corpus_words/article2.txt new file mode 100644 index 0000000..a34839c --- /dev/null +++ b/draft/data/corpus_words/article2.txt @@ -0,0 +1 @@ +训练 过程 中 新增 了 paragraphid , 即 训练 语料 中 每个 句子 都 有 一个 唯一 的 id 。 paragraphid 和 普通 的 word 一样 , 也 是 先 映射 成 一个 向量 , 即 paragraphvector 。 paragraphvector 与 wordvector 的 维数 虽 一样 , 但是 来自 于 两个 不同 的 向量 空间 。 在 之后 的 计算 里 , paragraphvector 和 wordvector 累加 或者 连接起来 , 作为 输出 层 softmax 的 输入 。 在 一个 句子 或者 文档 的 训练 过程 中 , paragraphid 保持 不变 , 共享 着 同一个 paragraphvector , 相当于 每次 在 预测 单词 的 概率 时 , 都 利用 了 整个 句子 的 语义 。 \ No newline at end of file diff --git a/draft/doc2vector.py b/draft/doc2vector.py new file mode 100644 index 0000000..0492268 --- /dev/null +++ b/draft/doc2vector.py @@ 
-0,0 +1,203 @@ +# -*- coding: utf-8 -*- +""" +Created on Sun Jan 27 14:20:16 2019 + +@author: liudiwei +""" + +# -*- coding: utf-8 -*- +import codecs +import re +from os import listdir + +import gensim +import jieba +import numpy as np +import pandas as pd +import os + +def doc_segment(doc_path, corpus_path): + """save word segment + """ + # 先把所有文档的路径存进一个 array 中,docLabels: + doc_lists = [file for file in listdir(doc_path) if file.endswith('.txt')] + + for doc in doc_lists: + try: + ws = codecs.open(doc_path + "/" + doc, encoding="utf8").read() + doc_words = segment(ws) + if not os.path.exists(corpus_path): + os.mkdir(corpus_path) + with codecs.open(corpus_path + "/{}".format(doc), "a", encoding="UTF-8") as f: + f.write(" ".join(doc_words)) + except: + print(doc) + +def segment(doc: str, stopword_file=None): + """中文分词 + parameter: + doc : str, input text + return: + [type] --- [description] + """ + # 停用词 + if stopword_file != None: + stop_words = pd.read_csv(stopword_file, + index_col=False, + quoting=3, + names=['stopword'], + sep="\n", + encoding='utf-8') + + stop_words = list(stop_words.stopword) + else: + stop_words = [] + reg_html = re.compile(r'<[^>]+>', re.S) + doc = reg_html.sub('', doc) + doc = re.sub('[0-9]', '', doc) + doc = re.sub('\s', '', doc) + word_list = list(jieba.cut(doc)) + out_str = '' + for word in word_list: + if word not in stop_words: + out_str += word + out_str += ' ' + segments = out_str.split(sep=" ") + return segments + +def build_corpus(corpus_path): + """build word corpus: list of list + """ + doc_labels = [f for f in os.listdir(corpus_path) if f.endswith('.txt')] + + corpus = [] + for doc in doc_labels: + ws = open(corpus_path + "/" + doc, 'r', encoding='UTF-8').read() + corpus.append(ws) + + print("corpus size: ", len(corpus)) + return corpus, doc_labels + +############################## build model #################################### + +def train_model(corpus, doc_labels, model_path, model_name="doc2vec.model"): + """training model + parameter: + - courpus: [[...], [....]] + - doc_labels: [...] 
+ - model_path + - model_name: default value "doc2vec.model" + return: + - model: model + - model_file: model_path + "/" + model_name + """ + # training doc2vec model and save model to local disk: + sentences = LabeledLineSentence(corpus, doc_labels) + # an empty model + model = gensim.models.Doc2Vec(vector_size=256, + window=10, + min_count=5, + workers=4, + alpha=0.025, + min_alpha=0.025, + epochs=12) + model.build_vocab(sentences) + + print("start training...") + model.train(sentences, total_examples = model.corpus_count, epochs=12) + + if not os.path.exists(model_path): + os.mkdir(model_path) + model_file = model_path + "/" + model_name + model.save(model_file) + print("Model saved") + return model, model_file + +def test_model(model_file, file1, file2): + print("Loading Model.") + model = gensim.models.Doc2Vec.load(model_file) + + sentence1 = open(file1, 'r', encoding='UTF-8').read() + sentence2 = open(file2, 'r', encoding='UTF-8').read() + + # 分词 + print("start to segment") + words1 = segment(sentence1) + words2 = segment(sentence2) + + # 转成句子向量 + vector1 = sent2vec(model, words1) + vector2 = sent2vec(model, words2) + + import sys + print(sys.getsizeof(vector1)) + print(sys.getsizeof(vector2)) + + cos = similarity(vector1, vector2) + print("相似度:{:.4f}".format(cos)) + + +def similarity(a_vect, b_vect): + """计算两个向量余弦值 + parameter: + a_vect {[type]} -- a 向量 + b_vect {[type]} -- b 向量 + + return: + [type] -- [description] + """ + + dot_val = 0.0 + a_norm = 0.0 + b_norm = 0.0 + cos = None + for a, b in zip(a_vect, b_vect): + dot_val += a*b + a_norm += a**2 + b_norm += b**2 + if a_norm == 0.0 or b_norm == 0.0: + cos = -1 + else: + cos = dot_val / ((a_norm*b_norm)**0.5) + + return cos + + +def sent2vec(model, words): + """sentence2vector + parameter: + model {[type]} -- Doc2Vec 模型 + words {[type]} -- 分词后的文本 + return: + [type] -- 向量数组 + """ + vect_list = [] + for w in words: + try: + vect_list.append(model.wv[w]) + except: + continue + vect_list = np.array(vect_list) + vect = vect_list.sum(axis=0) + return vect / np.sqrt((vect ** 2).sum()) + + + +class LabeledLineSentence(object): + def __init__(self, doc_list, labels_list): + self.labels_list = labels_list + self.doc_list = doc_list + + def __iter__(self): + for idx, doc in enumerate(self.doc_list): + yield gensim.models.doc2vec.TaggedDocument(words=doc.split(), tags=[self.labels_list[idx]]) + + +if __name__ == '__main__': + doc_path = "./data/" + corpus_path = "data/corpus_words" + model_path = "./models" + #doc_segment(data_dir) + corpus, doc_labels = build_corpus(corpus_path) + model, model_file = train_model(corpus, doc_labels, model_path) + test_model(model_file, './data/corpus_test/t2.txt', './data/corpus_test/t1.txt') From 053a428e5d5c7d171b05096b47ed50be87142da6 Mon Sep 17 00:00:00 2001 From: csuldw Date: Sun, 27 Jan 2019 23:32:13 +0800 Subject: [PATCH 03/17] update readme --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index a74bc1b..f9e0e34 100644 --- a/README.md +++ b/README.md @@ -38,10 +38,9 @@ ## Contributor -- 刘帝伟, 中南大学2014级硕士,[HomePage](http://www.csuldw.com). +- 刘帝伟, 中南大学14级硕士,关注AI、机器学习、深度学习方向,[HomePage](http://www.csuldw.com). 
## Contact -- QQ: 466454368 - E-mail: csu.ldw@csu.edu.cn From f0985fc6125abefdb4888e24e411ec728a2848ac Mon Sep 17 00:00:00 2001 From: csuldw Date: Sun, 27 Jan 2019 23:44:39 +0800 Subject: [PATCH 04/17] add fm_demo --- draft/jupyter/fm_demo.ipynb | 555 ++++++++++++++++++++++++++++++++++++ 1 file changed, 555 insertions(+) create mode 100644 draft/jupyter/fm_demo.ipynb diff --git a/draft/jupyter/fm_demo.ipynb b/draft/jupyter/fm_demo.ipynb new file mode 100644 index 0000000..f288396 --- /dev/null +++ b/draft/jupyter/fm_demo.ipynb @@ -0,0 +1,555 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.chdir(r\"F:\\CSU\\Github\\nlp_exp\\fm_exp\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from itertools import count \n", + "from collections import defaultdict\n", + "from scipy.sparse import csr \n", + "from __future__ import print_function \n", + "\n", + "def vectorize_dic(dic, ix=None, p=None):\n", + " \"\"\" \n", + " Creates a scipy csr matrix from a list of lists (each inner list is a set of values corresponding to a feature) \n", + " \n", + " parameters:\n", + " -----------\n", + " dic -- dictionary of feature lists. Keys are the name of features\n", + " ix -- index generator (default None)\n", + " p -- dimension of featrure space (number of columns in the sparse matrix) (default None)\n", + " \"\"\"\n", + " if (ix == None):\n", + " d = count(0)\n", + " ix = defaultdict(lambda: next(d)) \n", + " \n", + " n = len(list(dic.values())[0]) # num samples\n", + " g = len(list(dic.keys())) # num groups\n", + " nz = n * g # number of non-zeros\n", + "\n", + " col_ix = np.empty(nz, dtype=int) \n", + " \n", + " i = 0\n", + " for k, lis in dic.items(): \n", + " # append index el with k in order to prevet mapping different columns with same id to same index\n", + " col_ix[i::g] = [ix[str(el) + str(k)] for el in lis]\n", + " i += 1\n", + " \n", + " row_ix = np.repeat(np.arange(0, n), g) \n", + " data = np.ones(nz)\n", + " \n", + " if (p == None):\n", + " p = len(ix)\n", + " \n", + " ixx = np.where(col_ix < p)\n", + "\n", + " return csr.csr_matrix((data[ixx],(row_ix[ixx], col_ix[ixx])), shape=(n, p)), ix" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 加载数据集" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " user item rating timestamp\n", + "0 1 1 5 874965758\n", + "1 1 2 3 876893171\n", + "2 1 3 4 878542960\n", + "3 1 4 3 876893119\n", + "4 1 5 3 889751712\n", + "5 1 6 5 887431973\n", + "6 1 7 4 875071561\n", + "7 1 8 1 875072484\n", + "8 1 9 5 878543541\n", + "9 1 10 3 875693118" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.feature_extraction import DictVectorizer\n", + "\n", + "# laod data with pandas\n", + "cols = ['user', 'item', 'rating', 'timestamp']\n", + "train = pd.read_csv('data/ua.base', delimiter='\\t', names=cols)\n", + "test = pd.read_csv('data/ua.test', delimiter='\\t', names=cols)\n", + "train.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### vectorize data and convert them to csr matrix" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, ix = vectorize_dic({'users': train.user.values, 'items': train.item.values})\n", + "X_test, ix = vectorize_dic({'users': test.user.values, 'items': test.item.values}, ix, X_train.shape[1])\n", + "y_train = train.rating.values\n", + "y_test= test.rating.values" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(90570, 2623)\n", + "(9430, 2623)\n" + ] + } + ], + "source": [ + "X_train = X_train.todense()\n", + "X_test = X_test.todense()\n", + "\n", + "# print shape of data\n", + "print(X_train.shape)\n", + "print(X_test.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define FM Model with tensorflow" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "\n", + "#get x_train shape\n", + "n, p = X_train.shape\n", + "\n", + "# number of latent factors\n", + "k = 10\n", + "\n", + "# design matrix\n", + "X = tf.placeholder('float', shape=[None, p])\n", + "# target vector\n", + "y = tf.placeholder('float', shape=[None, 1])\n", + "\n", + "# bias and weights\n", + "w0 = tf.Variable(tf.zeros([1]))\n", + "W = tf.Variable(tf.zeros([p]))\n", + "\n", + "# interaction factors, randomly initialized \n", + "V = tf.Variable(tf.random_normal([k, p], stddev=0.01))\n", + "\n", + "# estimate of y, initialized to 0.\n", + "y_hat = tf.Variable(tf.zeros([n, 1]))" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/latex": [ + "$$\\hat{y}(\\mathbf{x}) = w_0 + \\sum_{j=1}^{p}w_jx_j + \\frac{1}{2} \\sum_{f=1}^{k} ((\\sum_{j=1}^{p}v_{j,f}x_j)^2-\\sum_{j=1}^{p}v_{j,f}^2 x_j^2)$$" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from IPython.display import display, Math, Latex\n", + "\n", + "display(Math(r'\\hat{y}(\\mathbf{x}) = w_0 + \\sum_{j=1}^{p}w_jx_j + \\frac{1}{2} \\sum_{f=1}^{k} ((\\sum_{j=1}^{p}v_{j,f}x_j)^2-\\sum_{j=1}^{p}v_{j,f}^2 x_j^2)'))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# Calculate output with FM equation\n", + "linear_terms = tf.add(w0, tf.reduce_sum(tf.multiply(W, X), 1, keep_dims=True))\n", + "pair_interactions = (tf.multiply(0.5,\n", + " tf.reduce_sum(\n", + " tf.subtract(\n", + " tf.pow( tf.matmul(X, tf.transpose(V)), 2),\n", + " 
tf.matmul(tf.pow(X, 2), tf.transpose(tf.pow(V, 2)))),\n", + " 1, keep_dims=True)))\n", + "y_hat = tf.add(linear_terms, pair_interactions)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/latex": [ + "$$L = \\sum_{i=1}^{n} (y_i - \\hat{y}_i)^2 + \\lambda_w ||W||^2 + \\lambda_v ||V||^2$$" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(Math(r'L = \\sum_{i=1}^{n} (y_i - \\hat{y}_i)^2 + \\lambda_w ||W||^2 + \\lambda_v ||V||^2'))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "# L2 regularized sum of squares loss function over W and V\n", + "lambda_w = tf.constant(0.001, name='lambda_w')\n", + "lambda_v = tf.constant(0.001, name='lambda_v')\n", + "\n", + "l2_norm = (tf.reduce_sum(\n", + " tf.add(\n", + " tf.multiply(lambda_w, tf.pow(W, 2)),\n", + " tf.multiply(lambda_v, tf.pow(V, 2)))))\n", + "\n", + "error = tf.reduce_mean(tf.square(tf.subtract(y, y_hat)))\n", + "loss = tf.add(error, l2_norm)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/latex": [ + "$$\\Theta_{i+1} = \\Theta_{i} - \\eta \\frac{\\delta L}{\\delta \\Theta}$$" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(Math(r'\\Theta_{i+1} = \\Theta_{i} - \\eta \\frac{\\delta L}{\\delta \\Theta}'))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(loss)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "def batcher(X_, y_=None, batch_size=-1):\n", + " n_samples = X_.shape[0]\n", + "\n", + " if batch_size == -1:\n", + " batch_size = n_samples\n", + " if batch_size < 1:\n", + " raise ValueError('Parameter batch_size={} is unsupported'.format(batch_size))\n", + "\n", + " for i in range(0, n_samples, batch_size):\n", + " upper_bound = min(i + batch_size, n_samples)\n", + " ret_x = X_[i:upper_bound]\n", + " ret_y = None\n", + " if y_ is not None:\n", + " ret_y = y_[i:i + batch_size]\n", + " yield (ret_x, ret_y)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a8247d26eae64d09b463eab3875f13c5", + "version_major": 2, + "version_minor": 0 + }, + "text/html": [ + "

\n" + ], + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=10), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "from tqdm import tqdm_notebook as tqdm\n", + "\n", + "epochs = 10\n", + "batch_size = 1000\n", + "\n", + "# Launch the graph\n", + "init = tf.global_variables_initializer()\n", + "sess = tf.Session()\n", + "\n", + "sess.run(init)\n", + "\n", + "for epoch in tqdm(range(epochs), unit='epoch'):\n", + " perm = np.random.permutation(X_train.shape[0])\n", + " # iterate over batches\n", + " for bX, bY in batcher(X_train[perm], y_train[perm], batch_size):\n", + " sess.run(optimizer, feed_dict={X: bX.reshape(-1, p), y: bY.reshape(-1, 1)})" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.113785\n" + ] + } + ], + "source": [ + "errors = []\n", + "for bX, bY in batcher(X_test, y_test):\n", + " errors.append(sess.run(error, feed_dict={X: bX.reshape(-1, p), y: bY.reshape(-1, 1)}))\n", + "\n", + "RMSE = np.sqrt(np.array(errors).mean())\n", + "print(RMSE)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "sess.close()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[1.2405171]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "errors" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 0f1a7ff8bbebdc925462e37b67e927d23e555a02 Mon Sep 17 00:00:00 2001 From: csuldw Date: Mon, 28 Jan 2019 00:25:48 +0800 Subject: [PATCH 05/17] add pyfm_demo --- draft/pyfm_demo.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 draft/pyfm_demo.py diff --git a/draft/pyfm_demo.py b/draft/pyfm_demo.py new file mode 100644 index 0000000..5c22b1a --- /dev/null +++ b/draft/pyfm_demo.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +""" +Created on Sun Jan 27 23:49:24 2019 +pyfm安装:将https://github.com/coreylynch/pyFM 下载到本地,去掉setup.py里面的 +libraries=["m"],然后安装即可. 
+@author: liudiwei +""" +from sklearn.datasets import load_iris +from sklearn.model_selection import train_test_split +from pyfm import pylibfm +from sklearn.feature_extraction import DictVectorizer + +iris_data = load_iris() +X = iris_data['data'] +y = iris_data['target'] == 2 +data = [ {v: k for k, v in dict(zip(i, range(len(i)))).items()} for i in X] + +X_train,X_test,y_train, y_test = train_test_split(data,y,test_size=0.3,random_state=0) + +v = DictVectorizer() +X_train = v.fit_transform(X_train) +X_test = v.transform(X_test) + +fm = pylibfm.FM(num_factors=50, + num_iter=1000, + verbose=True, + task="classification", + initial_learning_rate=0.0001, + learning_rate_schedule="optimal") + +fm.fit(X_train, y_train) + + +y_preds = fm.predict(X_test) +from sklearn.metrics import log_loss +print ("Validation log loss: %.4f" % log_loss(y_test, y_preds)) + From 49cb0eca8b5c881ddf6f80d3910edf47a173bc4d Mon Sep 17 00:00:00 2001 From: csuldw Date: Mon, 28 Jan 2019 23:56:34 +0800 Subject: [PATCH 06/17] update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f9e0e34..6ed389f 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ ## MachineLearning -机器学习算法代码及个人总结整理,对于算法实现部分,在相应目录中都包含有源码和数据以及测试实例,内容正在不断完善中!如有错误,望不吝指教。PS:所有代码均符合我们整理出来的这份[编码规范](https://github.com/csuldw/MachineLearning/blob/master/Python-coding-standards.md). +机器学习算法代码及个人总结整理,对于算法实现部分,在相应目录中都包含有源码和数据以及测试实例,内容正在不断完善中!如有错误,还望读者指出,非常感谢,若您觉得对你有帮助,可以在右上角给个star哈(#^.^#)。PS:所有代码均符合我们整理出来的这份[编码规范](https://github.com/csuldw/MachineLearning/blob/master/Python-coding-standards.md). ## Contents From baacdbe9d9a17b302b0ef177e53b4b2a89bee4ed Mon Sep 17 00:00:00 2001 From: csuldw Date: Sun, 17 Feb 2019 00:36:01 +0800 Subject: [PATCH 07/17] add rs --- {draft => Recommendation System}/data_process/user_keywords.csv | 0 {draft => Recommendation System}/recommend.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename {draft => Recommendation System}/data_process/user_keywords.csv (100%) rename {draft => Recommendation System}/recommend.py (100%) diff --git a/draft/data_process/user_keywords.csv b/Recommendation System/data_process/user_keywords.csv similarity index 100% rename from draft/data_process/user_keywords.csv rename to Recommendation System/data_process/user_keywords.csv diff --git a/draft/recommend.py b/Recommendation System/recommend.py similarity index 100% rename from draft/recommend.py rename to Recommendation System/recommend.py From b89e2af410e7cc86303f89eaa037b48564219c19 Mon Sep 17 00:00:00 2001 From: csuldw Date: Sun, 17 Feb 2019 19:32:31 +0800 Subject: [PATCH 08/17] move pyFM demo --- Recommendation System/pyfm_demo.py | 47 ++++++++++++++++++++++++++++++ draft/pyfm_demo.py | 37 ----------------------- 2 files changed, 47 insertions(+), 37 deletions(-) create mode 100644 Recommendation System/pyfm_demo.py delete mode 100644 draft/pyfm_demo.py diff --git a/Recommendation System/pyfm_demo.py b/Recommendation System/pyfm_demo.py new file mode 100644 index 0000000..3787809 --- /dev/null +++ b/Recommendation System/pyfm_demo.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +""" +Created on Sun Jan 27 23:49:24 2019 + +pyfm install:将https://github.com/coreylynch/pyFM 下载到本地,去掉setup.py里面的 +libraries=["m"],然后安装即可. 
+ +@author: liudiwei +""" +from sklearn.datasets import load_iris +from sklearn.model_selection import train_test_split +from pyfm import pylibfm +from sklearn.feature_extraction import DictVectorizer + +def load_data(): + """ + 调用sklearn的iris数据集,筛选正负样本并构造切分训练测试数据集 + """ + iris_data = load_iris() + X = iris_data['data'] + y = iris_data['target'] == 2 + data = [ {v: k for k, v in dict(zip(i, range(len(i)))).items()} for i in X] + + X_train,X_test,y_train, y_test = train_test_split(data, y, test_size=0.3, random_state=0) + return X_train,X_test,y_train, y_test + +X_train,X_test,y_train, y_test = load_data() + +v = DictVectorizer() +X_train = v.fit_transform(X_train) +X_test = v.transform(X_test) + +fm = pylibfm.FM(num_factors=100, + num_iter=200, + verbose=True, + task="classification", + initial_learning_rate=0.001, + learning_rate_schedule="optimal") + +fm.fit(X_train, y_train) + +y_preds = fm.predict(X_test) +y_preds_label = y_preds > 0.5 +from sklearn.metrics import log_loss,accuracy_score +print ("Validation log loss: %.4f" % log_loss(y_test, y_preds)) +print ("accuracy: %.4f" % accuracy_score(y_test, y_preds_label)) + diff --git a/draft/pyfm_demo.py b/draft/pyfm_demo.py deleted file mode 100644 index 5c22b1a..0000000 --- a/draft/pyfm_demo.py +++ /dev/null @@ -1,37 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Sun Jan 27 23:49:24 2019 -pyfm安装:将https://github.com/coreylynch/pyFM 下载到本地,去掉setup.py里面的 -libraries=["m"],然后安装即可. -@author: liudiwei -""" -from sklearn.datasets import load_iris -from sklearn.model_selection import train_test_split -from pyfm import pylibfm -from sklearn.feature_extraction import DictVectorizer - -iris_data = load_iris() -X = iris_data['data'] -y = iris_data['target'] == 2 -data = [ {v: k for k, v in dict(zip(i, range(len(i)))).items()} for i in X] - -X_train,X_test,y_train, y_test = train_test_split(data,y,test_size=0.3,random_state=0) - -v = DictVectorizer() -X_train = v.fit_transform(X_train) -X_test = v.transform(X_test) - -fm = pylibfm.FM(num_factors=50, - num_iter=1000, - verbose=True, - task="classification", - initial_learning_rate=0.0001, - learning_rate_schedule="optimal") - -fm.fit(X_train, y_train) - - -y_preds = fm.predict(X_test) -from sklearn.metrics import log_loss -print ("Validation log loss: %.4f" % log_loss(y_test, y_preds)) - From 95802acb59603ac1bc3885e3db7d50715e58bca3 Mon Sep 17 00:00:00 2001 From: csuldw Date: Sun, 17 Feb 2019 19:39:07 +0800 Subject: [PATCH 09/17] update pyFM demo --- Recommendation System/pyfm_demo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Recommendation System/pyfm_demo.py b/Recommendation System/pyfm_demo.py index 3787809..1ea5d9e 100644 --- a/Recommendation System/pyfm_demo.py +++ b/Recommendation System/pyfm_demo.py @@ -30,8 +30,8 @@ def load_data(): X_train = v.fit_transform(X_train) X_test = v.transform(X_test) -fm = pylibfm.FM(num_factors=100, - num_iter=200, +fm = pylibfm.FM(num_factors=1, + num_iter=500, verbose=True, task="classification", initial_learning_rate=0.001, From 66bffed11fadef19b2d4996c5ef01b8de31cd51c Mon Sep 17 00:00:00 2001 From: csuldw Date: Sun, 17 Feb 2019 20:02:01 +0800 Subject: [PATCH 10/17] update pyFM demo --- Recommendation System/pyfm_demo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Recommendation System/pyfm_demo.py b/Recommendation System/pyfm_demo.py index 1ea5d9e..7c634f5 100644 --- a/Recommendation System/pyfm_demo.py +++ b/Recommendation System/pyfm_demo.py @@ -30,8 +30,8 @@ def load_data(): X_train = 
v.fit_transform(X_train) X_test = v.transform(X_test) -fm = pylibfm.FM(num_factors=1, - num_iter=500, +fm = pylibfm.FM(num_factors=2, + num_iter=200, verbose=True, task="classification", initial_learning_rate=0.001, From 897267b8609d6bc6b5815a3eb80c5f9f8ccb197f Mon Sep 17 00:00:00 2001 From: csuldw Date: Sun, 1 Sep 2019 14:26:19 +0800 Subject: [PATCH 11/17] add wx photo --- .DS_Store | Bin 0 -> 6148 bytes README.md | 8 +++++++- ...xperiments with a New Boosting Algorithm.pdf} | Bin 3 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 .DS_Store rename doc/{boostingexperiments.pdf => 1996 Experiments with a New Boosting Algorithm.pdf} (100%) diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..bffc98b573f7e579f5881366afafadada05bde95 GIT binary patch literal 6148 zcmeHK%}N6?5T3ME)2i5mpx*Z4tq0eC55iLG!JDw62Nm75i!RiSbhj3*m3!kIdeM-HUn%KAZ1yWFRDwu-iEtXb5?h@`!%QMx?9*;U)${W%W`4a-95OxeR%9W^`Bo1%>utj zEprwJ@QTKdJHGeZNlPWy7&FKp%m6bmJ`C6!PEL={mUt)305kAQ4AA}Hpb~l( zbA$Toz=nPwX}m;8f;PP+2px-_#oQo{pa_$SXi|lJVhEFte#gdn7IT9p9fV#P=dmjb z`$7?Vb@V$r9E4|(TV{Y67-pbox>c(G`#-<`hm*L+3@`&@#egU^{6-zOWNYil=BU;> tsF$cD6qg(POhH4JVvMCyyojm={SFz3p2gfCdQkXBK-0htGw`Pjd;qX|ReS&d literal 0 HcmV?d00001 diff --git a/README.md b/README.md index 6ed389f..b158257 100644 --- a/README.md +++ b/README.md @@ -38,9 +38,15 @@ ## Contributor -- 刘帝伟, 中南大学14级硕士,关注AI、机器学习、深度学习方向,[HomePage](http://www.csuldw.com). +- 刘帝伟, CSU硕士毕业,关注AI、机器学习、深度学习方向,[HomePage](http://www.csuldw.com). ## Contact +如果有任何疑问,可在我的微信公众号后台留言: + +![](http://www.csuldw.com/assets/articleImg/2019/code-main-fun.png) + +或是发邮件吧: + - E-mail: csu.ldw@csu.edu.cn diff --git a/doc/boostingexperiments.pdf b/doc/1996 Experiments with a New Boosting Algorithm.pdf similarity index 100% rename from doc/boostingexperiments.pdf rename to doc/1996 Experiments with a New Boosting Algorithm.pdf From f9cf3f2582d1bfbb18dcc25bc1b1d1a67822a8e9 Mon Sep 17 00:00:00 2001 From: csuldw Date: Sun, 1 Sep 2019 14:28:33 +0800 Subject: [PATCH 12/17] add wx photo --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index b158257..5b452ae 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,9 @@ ![](http://www.csuldw.com/assets/articleImg/2019/code-main-fun.png) + +
Fig 1:推荐系统整体结构.
+ 或是发邮件吧: - E-mail: csu.ldw@csu.edu.cn From 44cf0c0f576e550ff6f17ba854ff71c82f9b4ebb Mon Sep 17 00:00:00 2001 From: csuldw Date: Sun, 1 Sep 2019 14:29:40 +0800 Subject: [PATCH 13/17] add wx photo --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 5b452ae..0f9f865 100644 --- a/README.md +++ b/README.md @@ -45,10 +45,10 @@ 如果有任何疑问,可在我的微信公众号后台留言: -![](http://www.csuldw.com/assets/articleImg/2019/code-main-fun.png) - + +
-
Fig 1:推荐系统整体结构.
+
或是发邮件吧: From 97c5aa0811e58f8ff635797b2382fa9949f01491 Mon Sep 17 00:00:00 2001 From: csuldw Date: Sun, 1 Sep 2019 14:31:08 +0800 Subject: [PATCH 14/17] add wx photo --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0f9f865..a47b09c 100644 --- a/README.md +++ b/README.md @@ -46,9 +46,10 @@ 如果有任何疑问,可在我的微信公众号后台留言: -
+ +
-
+ 或是发邮件吧: From a93595a55f860fcc91acbc35d40836d275cdcee7 Mon Sep 17 00:00:00 2001 From: csuldw Date: Sun, 1 Sep 2019 14:32:12 +0800 Subject: [PATCH 15/17] add wx photo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a47b09c..7831611 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@
- +
或是发邮件吧: From 9a5daa063034878b7fd7c594a48aed0fc8a23606 Mon Sep 17 00:00:00 2001 From: csuldw Date: Wed, 16 Oct 2019 00:32:12 +0800 Subject: [PATCH 16/17] add stacking model draft --- draft/stacking.py | 181 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 181 insertions(+) create mode 100644 draft/stacking.py diff --git a/draft/stacking.py b/draft/stacking.py new file mode 100644 index 0000000..915d778 --- /dev/null +++ b/draft/stacking.py @@ -0,0 +1,181 @@ +from sklearn.model_selection import KFold +from sklearn.model_selection import train_test_split +from sklearn.datasets import load_digits +import numpy as np +from sklearn.svm import SVC +from sklearn import metrics +from sklearn.ensemble import RandomForestClassifier +from sklearn import preprocessing +import pandas as pd +from functools import reduce +from sklearn.metrics import confusion_matrix, classification_report + +class StackingClassifier(object): + + def __init__(self, modellist=[], meta_classifier=None): + self.modellist = modellist + if meta_classifier == None: + from sklearn.linear_model import LogisticRegression + meta_classifier = LogisticRegression() + self.meta_classifier = meta_classifier + + def SelectModel(self, modelname): + + if modelname == "SVM": + from sklearn.svm import SVC + model = SVC(kernel='rbf', C=16, gamma=0.125,probability=True) + + elif modelname == "lr": + from sklearn.linear_model import LogisticRegression + model = LogisticRegression() + + elif modelname == "GBDT": + from sklearn.ensemble import GradientBoostingClassifier + model = GradientBoostingClassifier() + + elif modelname == "RF": + from sklearn.ensemble import RandomForestClassifier + model = RandomForestClassifier() + + elif modelname == "xgboost": + from xgboost import XGBClassifier + model = XGBClassifier( + learning_rate=0.01, + n_estimators=1000, + max_depth=4, + min_child_weight=3, + gamma=0.1, + subsample=0.8, + colsample_bytree=0.8, + reg_alpha=1, + objective='binary:logistic', #multi:softmax + nthread=8, + scale_pos_weight=1, + seed=27, + random_state=27 + ) + elif modelname == "KNN": + from sklearn.neighbors import KNeighborsClassifier as knn + model = knn() + + elif modelname == "MNB": + from sklearn.naive_bayes import MultinomialNB + model = MultinomialNB() + else: + pass + return model + + def get_oof(self, clf, n_folds, X_train, y_train, X_test): + ntrain = X_train.shape[0] + ntest = X_test.shape[0] + print("kfolds: ", ntrain, ntest) + classnum = len(np.unique(y_train)) + kf = KFold(n_splits=n_folds,random_state=1) + oof_train = np.zeros((ntrain,classnum)) + oof_test = np.zeros((ntest,classnum)) + + for i,(train_index, test_index) in enumerate(kf.split(X_train)): + kf_X_train = X_train[train_index] # 数据 + kf_y_train = y_train[train_index] # 标签 + + kf_X_test = X_train[test_index] # k-fold的验证集 + + clf.fit(kf_X_train, kf_y_train) + oof_train[test_index] = clf.predict_proba(kf_X_test) + # print("shape of oof_train:", oof_train[test_index].shape) + + print("fold{i}: oof_train: {a}, oof_test:{b}".format(i=i, a=oof_train.shape, b=oof_test.shape)) + oof_test += clf.predict_proba(X_test) + oof_test = oof_test/float(n_folds) + print("oof_train: {a}, oof_test:{b}".format(a=oof_train.shape, b=oof_test.shape)) + return oof_train, oof_test + + def first_layer(self, X_train, y_train, X_test, modellist=None): + """modellist 需要重新修改 + """ + newfeature_list = [] + newtestdata_list = [] + for modelname in self.modellist: + sub_clf = self.SelectModel(modelname) + oof_train_, oof_test_= self.get_oof(clf=sub_clf, + n_folds=5, + 
X_train=X_train, + y_train=y_train, + X_test=X_test) + print("oof_train: ", oof_train_.shape) + print("model-{}".format(modelname),len(oof_train_), len(oof_test_)) + newfeature_list.append(oof_train_) + print("newfeature_list: ", len(newfeature_list)) + newtestdata_list.append(oof_test_) + + # 特征组合 + X_train_stacking = reduce(lambda x,y:np.concatenate((x,y),axis=1),newfeature_list) + X_test_stacking = reduce(lambda x,y:np.concatenate((x,y),axis=1),newtestdata_list) + + return X_train_stacking, X_test_stacking + + def fit(self, X_train, y_train, clf=None): + if clf != None: + self.meta_classifier = clf + self.meta_classifier.fit(X_train, y_train) + return self.meta_classifier + + #second_layer + def second_layer(self, X_train, y_train, clf=None): + return self.fit(X_train, y_train, clf) + + def predict(self, X_test, clf=None, type="label"): + if clf == None: + clf = self.meta_classifier + if type == "proba": + return clf.predict_proba(X_test) + elif type == "label": + return clf.predict(X_test) + + def get_accuracy(self, y_true, y_pred): + accuracy = metrics.accuracy_score(y_true, y_pred)*100 + return accuracy + + def performance(self, y_true, y_pred): + accuracy = self.get_accuracy(y_true, y_pred) + confusion = confusion_matrix(y_true, y_pred) + report = classification_report(y_true, y_pred) + print("多模型融合预测accuracy:{}".format(accuracy)) + print("混淆矩阵:\n{}".format(confusion)) + print("预测结果:\n{}".format(report)) + return confusion, report + + +# 使用stacking方法的时候 +# 第一级,重构特征当做第二级的训练集 +if __name__ == "__main__": + # 导入数据集切割训练与测试数据 + data = load_digits() + data_D = preprocessing.StandardScaler().fit_transform(data.data) + data_L = data.target + X_train, X_test, y_train, y_test = train_test_split(data_D,data_L,random_state=100,test_size=0.7) + print(set(y_train)) + + # 单纯使用一个分类器的时候 + clf_meta = RandomForestClassifier() + clf_meta.fit(X_train, y_train) + pred = clf_meta.predict(X_test) + accuracy = metrics.accuracy_score(y_test, pred)*100 + print ("====================", accuracy) + # 91.0969793323 + + #layer 1:多模型融合 + modelist = ['SVM', 'GBDT', 'RF', 'KNN'] + stacking_clf = StackingClassifier(modelist) + X_train_stacking, X_test_stacking = stacking_clf.first_layer(X_train, y_train, X_test) + print("shape of X_train_stacking {}".format(X_train_stacking.shape)) + print("shape of X_test_stacking {}".format(X_test_stacking.shape)) + + #layer 2: 单模型训练 + RF = stacking_clf.SelectModel(modelname="RF") + clf = stacking_clf.second_layer(X_train_stacking, y_train, clf=RF) + pred = stacking_clf.predict(X_test_stacking) + + #模型评估 + stacking_clf.performance(y_test, pred) + # 96.4228934817 From 5f1569ad6a3ca015b8c07f76a19e8b44462bb6cd Mon Sep 17 00:00:00 2001 From: csuldw Date: Sat, 19 Oct 2019 21:52:26 +0800 Subject: [PATCH 17/17] add stacking --- .DS_Store | Bin 6148 -> 6148 bytes stacking/stacking.py | 241 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 241 insertions(+) create mode 100644 stacking/stacking.py diff --git a/.DS_Store b/.DS_Store index bffc98b573f7e579f5881366afafadada05bde95..bc267768983cb470703ff7c8ba16d687b7b1fb40 100644 GIT binary patch delta 51 zcmZoMXfc@J&&a$nU^gQp^W=X_GMhD+7#RgQ7>XH67!nzh8L}BN8S)s?H%l>}W!cQm H@s}R}WbzGn delta 38 ucmZoMXfc@J&&aefU^nAr0}+
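The hand-rolled two-layer flow in stacking/stacking.py (out-of-fold predict_proba features from the first-layer models, then a single second-layer learner) can be cross-checked against scikit-learn's built-in StackingClassifier. The sketch below is an illustration only and is not part of the patch series above: it assumes scikit-learn >= 0.22 (where StackingClassifier was added) and reuses the same digits dataset, the same four base learners from SelectModel(), and a random-forest meta-learner, mirroring the __main__ block of stacking.py.

# Minimal cross-check of the stacking approach with sklearn's StackingClassifier.
# Assumption: scikit-learn >= 0.22; hyperparameters mirror SelectModel() in stacking.py.
from sklearn import metrics, preprocessing
from sklearn.datasets import load_digits
from sklearn.ensemble import (GradientBoostingClassifier, RandomForestClassifier,
                              StackingClassifier)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

data = load_digits()
X = preprocessing.StandardScaler().fit_transform(data.data)
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100, test_size=0.7)

# First layer: the same four base learners used in first_layer(); their
# out-of-fold class probabilities become the features of the second layer.
base_learners = [
    ("svm", SVC(kernel="rbf", C=16, gamma=0.125, probability=True)),
    ("gbdt", GradientBoostingClassifier()),
    ("rf", RandomForestClassifier()),
    ("knn", KNeighborsClassifier()),
]

# Second layer: a random forest trained on the stacked probability features,
# built from 5-fold out-of-fold predictions -- the same split as get_oof(n_folds=5).
stack = StackingClassifier(estimators=base_learners,
                           final_estimator=RandomForestClassifier(),
                           stack_method="predict_proba",
                           cv=5)
stack.fit(X_train, y_train)
pred = stack.predict(X_test)
print("stacking accuracy: %.4f" % metrics.accuracy_score(y_test, pred))

Using stack_method="predict_proba" matches what get_oof() feeds into the second layer; the printed accuracy should land in the same ballpark as the ~96% noted at the end of stacking.py, though exact numbers vary with the random splits and forest seeds.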