diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..bc26776
Binary files /dev/null and b/.DS_Store differ
diff --git a/DBSCAN/dbscan.py b/DBSCAN/dbscan.py
new file mode 100644
index 0000000..3efc858
--- /dev/null
+++ b/DBSCAN/dbscan.py
@@ -0,0 +1,201 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Implements DBSCAN clustering over TF-IDF vectors of jieba-segmented documents.
+
+__author__ = 'ZYC@BUPT'
+import os
+import math
+import time
+import numpy as np
+import jieba
+from sklearn.feature_extraction.text import TfidfTransformer
+from sklearn.feature_extraction.text import CountVectorizer
+
+jieba.load_userdict("newword.dict")
+
+# The two sentinels must not compare equal: with the previous UNCLASSIFIED = False,
+# `False == 0` made unvisited points indistinguishable from NOISE during expansion.
+UNCLASSIFIED = -1
+NOISE = 0
+
+def Test2(rootDir):
+    # Recursively walk rootDir and segment every .txt document found.
+    for lists in os.listdir(rootDir):
+        path = os.path.join(rootDir, lists)
+        if path.find(".txt") != -1:
+            Participle(path)
+        if os.path.isdir(path):
+            Test2(path)
+
+def Participle(path):
+    # One document per file: line 1 = url, line 2 = title, line 3 = time, line 4 = site.
+    try:
+        fp = open(path, "r")
+        ad = fp.readline().strip('\n')
+        na = fp.readline().strip('\n')
+        ti = fp.readline().strip('\n')  # time
+        si = fp.readline().strip('\n')
+        cont = na + fp.read()
+        fp.close()
+    except IOError:
+        return 0
+
+    try:
+        insi = {}
+        insi['time'] = ti
+        print(ti)
+        insi['url'] = ad
+        insi['title'] = na
+        insi['site'] = si
+        global fnum
+        global segcont
+        global doc
+        seg_list = jieba.lcut(cont, cut_all=False)
+        stline = ""
+        for word in seg_list:
+            if word not in d and word != '\n':  # d is the stop-word set built in dbs()
+                stline = stline + " " + word
+        segcont.append(stline)
+        print(str(fnum) + " segmented")
+        doc[fnum] = insi
+        fnum = fnum + 1
+    except UnicodeError:
+        return 0
+
+def loadDataSet(splitChar=','):
+    # The TF-IDF matrix is produced by TFIDF() and handed over via the module-level `we`.
+    global we
+    dataSet = we
+    del we
+    return dataSet
+
+def region_query(data, pointId, eps):
+    # Indices of all points within eps of pointId; points are the columns of data.
+    nPoints = data.shape[1]
+    seeds = []
+    for i in range(nPoints):
+        if eps_neighbor(data[:, pointId], data[:, i], eps):
+            seeds.append(i)
+    return seeds
+
+def tstore(clusters, clusterNum):  # for testing: dump each cluster's titles and urls to ./test_res/<id>.txt
+    global doc
+    fpath = "./test_res/"
+    wr = []
+    i = 0
+    while i <= clusterNum:
+        path = fpath + str(i) + ".txt"
+        wr.append(open(path, 'w'))
+        i += 1
+    i = 1
+    for cl in clusters:
+        enstr = doc[i]['title'] + doc[i]['url']
+        wr[cl].write(enstr + '\n')
+        i += 1
+    for fp in wr:
+        fp.close()
+
+def expand_cluster(data, clusterResult, pointId, clusterId, eps, minPts):
+    seeds = region_query(data, pointId, eps)
+    if len(seeds) < minPts:  # fails the minPts condition: mark as noise
+        clusterResult[pointId] = NOISE
+        return False
+    else:
+        clusterResult[pointId] = clusterId  # assign the seed region to this cluster
+        for seedId in seeds:
+            clusterResult[seedId] = clusterId
+        while len(seeds) > 0:  # grow the cluster outwards
+            currentPoint = seeds[0]
+            queryResults = region_query(data, currentPoint, eps)
+            if len(queryResults) >= minPts:
+                for i in range(len(queryResults)):
+                    resultPoint = queryResults[i]
+                    if clusterResult[resultPoint] == UNCLASSIFIED:
+                        seeds.append(resultPoint)
+                        clusterResult[resultPoint] = clusterId
+                    elif clusterResult[resultPoint] == NOISE:
+                        clusterResult[resultPoint] = clusterId
+            seeds = seeds[1:]
+        return True
+
+def dbscan(data, eps, minPts):
+    clusterId = 1
+    nPoints = data.shape[1]
+    clusterResult = [UNCLASSIFIED] * nPoints
+    for pointId in range(nPoints):
+        if clusterResult[pointId] == UNCLASSIFIED:
+            if expand_cluster(data, clusterResult, pointId, clusterId, eps, minPts):
+                clusterId = clusterId + 1
+    return clusterResult, clusterId - 1
+
+def eps_neighbor(a, b, eps):
+    dis = math.sqrt(np.power(a - b, 2).sum())
+    return dis < eps
+def main():
+    dataSet = loadDataSet(splitChar=',')
+    dataSet = np.mat(dataSet).transpose()  # one document per column
+    clusters, clusterNum = dbscan(dataSet, 1.37, 5)  # eps and minPts were hand-tuned
+    print("cluster Numbers = ", clusterNum)
+    tstore(clusters, clusterNum)
+
+
+def TFIDF():
+    global segcont
+    global weight
+    global we  # declared before assignment; declaring it after was a SyntaxError in Python 3
+    vectorizer = CountVectorizer()
+    transformer = TfidfTransformer()
+    tfidf = transformer.fit_transform(vectorizer.fit_transform(segcont))
+    word = vectorizer.get_feature_names()  # use get_feature_names_out() on scikit-learn >= 1.2
+    weight = tfidf.toarray()               # the corresponding TF-IDF matrix
+    del segcont
+
+    # Second pass: keep only terms whose first-pass weight reaches 0.1,
+    # then re-vectorize the pruned documents.
+    seg = []
+    for i in range(len(weight)):
+        enstr = ""
+        for j in range(len(word)):
+            if weight[i][j] >= 0.1:
+                enstr = enstr + " " + word[j]
+        seg.append(enstr)
+
+    del weight
+    vec = CountVectorizer()
+    tra = TfidfTransformer()
+    tidf = tra.fit_transform(vec.fit_transform(seg))
+    we = tidf.toarray()  # consumed by loadDataSet()
+
+def dbs():
+    global fnum, doc, segcont, d
+    fnum = 1
+    segcont = []
+    doc = {}
+    # Build the stop-word set from stop.txt.
+    stfp = open("stop.txt", "r")
+    stcont = stfp.read()
+    list_a = jieba.lcut(stcont, cut_all=False)
+    d = set([])
+    for li in list_a:
+        d.add(li)
+    stfp.close()
+    Test2('./sou1')
+    TFIDF()
+    start = time.time()  # time.clock() was removed in Python 3.8
+    main()
+    end = time.time()
+    print('finish all in %s' % str(end - start))
+
+if __name__ == "__main__":
+    dbs()
diff --git a/README.md b/README.md
index a74bc1b..7831611 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 ## MachineLearning
 
-Machine-learning algorithm implementations and personal notes. Each algorithm directory contains source code, data, and a runnable test example, and the collection keeps growing. Corrections are welcome. PS: all code follows the [coding conventions](https://github.com/csuldw/MachineLearning/blob/master/Python-coding-standards.md) we put together.
+Machine-learning algorithm implementations and personal notes. Each algorithm directory contains source code, data, and a runnable test example, and the collection keeps growing. If you spot an error, please point it out, many thanks! If the project helps you, a star in the upper-right corner is appreciated (#^.^#). PS: all code follows the [coding conventions](https://github.com/csuldw/MachineLearning/blob/master/Python-coding-standards.md) we put together.
 
 ## Contents
@@ -38,10 +38,19 @@
 
 ## Contributor
 
-- 刘帝伟, M.S. student (2014 intake), Central South University, [HomePage](http://www.csuldw.com).
+- 刘帝伟, M.S. graduate of CSU, focused on AI, machine learning, and deep learning, [HomePage](http://www.csuldw.com).
 
 ## Contact
 
-- QQ: 466454368
+If you have any questions, you can leave me a message via my WeChat official account:
+
+
+
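Reviewer note on `DBSCAN/dbscan.py` above: the clustering core is easy to smoke-test on synthetic points before wiring in real TF-IDF vectors. A minimal sketch, not part of the commit; it assumes the module is importable (the `__main__` guard above keeps `dbs()` from running on import, though `jieba.load_userdict("newword.dict")` still executes at import time), and the coordinates and expected labels are illustrative:

```python
import numpy as np
from dbscan import dbscan, NOISE  # assumes DBSCAN/dbscan.py is on the path

# Two tight blobs plus one far-away outlier; points are columns (d x n),
# matching the data[:, pointId] indexing used by region_query().
pts = np.array([[1.0, 1.1, 0.9, 8.0, 8.1, 7.9, 20.0],
                [1.0, 0.9, 1.1, 8.0, 8.2, 7.8, 20.0]])
labels, n_clusters = dbscan(pts, eps=0.5, minPts=3)
print(n_clusters)            # expect 2
print(labels[-1] == NOISE)   # the outlier should be flagged as noise
```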
+ (notebook output: first 10 rows of the ratings DataFrame)
+    user  item  rating  timestamp
+ 0     1     1       5  874965758
+ 1     1     2       3  876893171
+ 2     1     3       4  878542960
+ 3     1     4       3  876893119
+ 4     1     5       3  889751712
+ 5     1     6       5  887431973
+ 6     1     7       4  875071561
+ 7     1     8       1  875072484
+ 8     1     9       5  878543541
+ 9     1    10       3  875693118
+ (Jupyter widget output: training progress bar)
\n" + ], + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=10), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "from tqdm import tqdm_notebook as tqdm\n", + "\n", + "epochs = 10\n", + "batch_size = 1000\n", + "\n", + "# Launch the graph\n", + "init = tf.global_variables_initializer()\n", + "sess = tf.Session()\n", + "\n", + "sess.run(init)\n", + "\n", + "for epoch in tqdm(range(epochs), unit='epoch'):\n", + " perm = np.random.permutation(X_train.shape[0])\n", + " # iterate over batches\n", + " for bX, bY in batcher(X_train[perm], y_train[perm], batch_size):\n", + " sess.run(optimizer, feed_dict={X: bX.reshape(-1, p), y: bY.reshape(-1, 1)})" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.113785\n" + ] + } + ], + "source": [ + "errors = []\n", + "for bX, bY in batcher(X_test, y_test):\n", + " errors.append(sess.run(error, feed_dict={X: bX.reshape(-1, p), y: bY.reshape(-1, 1)}))\n", + "\n", + "RMSE = np.sqrt(np.array(errors).mean())\n", + "print(RMSE)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "sess.close()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[1.2405171]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "errors" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/draft/stacking.py b/draft/stacking.py new file mode 100644 index 0000000..915d778 --- /dev/null +++ b/draft/stacking.py @@ -0,0 +1,181 @@ +from sklearn.model_selection import KFold +from sklearn.model_selection import train_test_split +from sklearn.datasets import load_digits +import numpy as np +from sklearn.svm import SVC +from sklearn import metrics +from sklearn.ensemble import RandomForestClassifier +from sklearn import preprocessing +import pandas as pd +from functools import reduce +from sklearn.metrics import confusion_matrix, classification_report + +class StackingClassifier(object): + + def __init__(self, modellist=[], meta_classifier=None): + self.modellist = modellist + if meta_classifier == None: + from sklearn.linear_model import LogisticRegression + meta_classifier = LogisticRegression() + self.meta_classifier = meta_classifier + + def SelectModel(self, modelname): + + if modelname == "SVM": + from sklearn.svm import SVC + model = SVC(kernel='rbf', C=16, gamma=0.125,probability=True) + + elif modelname == "lr": + from sklearn.linear_model import LogisticRegression + model = LogisticRegression() + + elif modelname == "GBDT": + from sklearn.ensemble import GradientBoostingClassifier + model = GradientBoostingClassifier() + + elif modelname == "RF": + from sklearn.ensemble import RandomForestClassifier + model = RandomForestClassifier() + + elif modelname == "xgboost": + from xgboost import XGBClassifier + model = XGBClassifier( + learning_rate=0.01, + 
diff --git a/draft/stacking.py b/draft/stacking.py
new file mode 100644
index 0000000..915d778
--- /dev/null
+++ b/draft/stacking.py
@@ -0,0 +1,181 @@
+from sklearn.model_selection import KFold
+from sklearn.model_selection import train_test_split
+from sklearn.datasets import load_digits
+import numpy as np
+from sklearn.svm import SVC
+from sklearn import metrics
+from sklearn.ensemble import RandomForestClassifier
+from sklearn import preprocessing
+import pandas as pd
+from functools import reduce
+from sklearn.metrics import confusion_matrix, classification_report
+
+class StackingClassifier(object):
+
+    def __init__(self, modellist=None, meta_classifier=None):
+        # avoid a mutable default argument for modellist
+        self.modellist = modellist if modellist is not None else []
+        if meta_classifier is None:
+            from sklearn.linear_model import LogisticRegression
+            meta_classifier = LogisticRegression()
+        self.meta_classifier = meta_classifier
+
+    def SelectModel(self, modelname):
+        # simple name -> estimator factory
+        if modelname == "SVM":
+            from sklearn.svm import SVC
+            model = SVC(kernel='rbf', C=16, gamma=0.125, probability=True)
+
+        elif modelname == "lr":
+            from sklearn.linear_model import LogisticRegression
+            model = LogisticRegression()
+
+        elif modelname == "GBDT":
+            from sklearn.ensemble import GradientBoostingClassifier
+            model = GradientBoostingClassifier()
+
+        elif modelname == "RF":
+            from sklearn.ensemble import RandomForestClassifier
+            model = RandomForestClassifier()
+
+        elif modelname == "xgboost":
+            from xgboost import XGBClassifier
+            model = XGBClassifier(
+                learning_rate=0.01,
+                n_estimators=1000,
+                max_depth=4,
+                min_child_weight=3,
+                gamma=0.1,
+                subsample=0.8,
+                colsample_bytree=0.8,
+                reg_alpha=1,
+                objective='binary:logistic',  # multi:softmax for multiclass
+                nthread=8,
+                scale_pos_weight=1,
+                seed=27,
+                random_state=27
+            )
+        elif modelname == "KNN":
+            from sklearn.neighbors import KNeighborsClassifier as knn
+            model = knn()
+
+        elif modelname == "MNB":
+            from sklearn.naive_bayes import MultinomialNB
+            model = MultinomialNB()
+        else:
+            raise ValueError("unknown model name: {}".format(modelname))
+        return model
+
+    def get_oof(self, clf, n_folds, X_train, y_train, X_test):
+        ntrain = X_train.shape[0]
+        ntest = X_test.shape[0]
+        print("kfolds: ", ntrain, ntest)
+        classnum = len(np.unique(y_train))
+        kf = KFold(n_splits=n_folds, shuffle=True, random_state=1)  # random_state only takes effect with shuffle=True
+        oof_train = np.zeros((ntrain, classnum))
+        oof_test = np.zeros((ntest, classnum))
+
+        for i, (train_index, test_index) in enumerate(kf.split(X_train)):
+            kf_X_train = X_train[train_index]  # fold training data
+            kf_y_train = y_train[train_index]  # fold training labels
+
+            kf_X_test = X_train[test_index]    # the fold's held-out validation part
+
+            clf.fit(kf_X_train, kf_y_train)
+            oof_train[test_index] = clf.predict_proba(kf_X_test)
+
+            print("fold{i}: oof_train: {a}, oof_test:{b}".format(i=i, a=oof_train.shape, b=oof_test.shape))
+            oof_test += clf.predict_proba(X_test)
+        oof_test = oof_test / float(n_folds)   # average the test-set predictions over the folds
+        print("oof_train: {a}, oof_test:{b}".format(a=oof_train.shape, b=oof_test.shape))
+        return oof_train, oof_test
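Reviewer note: `get_oof` hand-rolls out-of-fold prediction. For the training-set half, `sklearn.model_selection.cross_val_predict` produces the same leakage-free layout in one call; a sketch (the fold-averaged test-set predictions above still need the manual loop):

```python
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestClassifier

# Each training row is predicted by a model fitted on folds that exclude it,
# matching the oof_train array built above.
oof_train = cross_val_predict(RandomForestClassifier(), X_train, y_train,
                              cv=5, method='predict_proba')
```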
+    def first_layer(self, X_train, y_train, X_test, modellist=None):
+        """Build level-one meta-features from each base model's out-of-fold predictions.
+        TODO: the modellist parameter is unused (self.modellist is used instead) and needs rework.
+        """
+        newfeature_list = []
+        newtestdata_list = []
+        for modelname in self.modellist:
+            sub_clf = self.SelectModel(modelname)
+            oof_train_, oof_test_ = self.get_oof(clf=sub_clf,
+                                                 n_folds=5,
+                                                 X_train=X_train,
+                                                 y_train=y_train,
+                                                 X_test=X_test)
+            print("oof_train: ", oof_train_.shape)
+            print("model-{}".format(modelname), len(oof_train_), len(oof_test_))
+            newfeature_list.append(oof_train_)
+            print("newfeature_list: ", len(newfeature_list))
+            newtestdata_list.append(oof_test_)
+
+        # concatenate the per-model features column-wise
+        X_train_stacking = reduce(lambda x, y: np.concatenate((x, y), axis=1), newfeature_list)
+        X_test_stacking = reduce(lambda x, y: np.concatenate((x, y), axis=1), newtestdata_list)
+
+        return X_train_stacking, X_test_stacking
+
+    def fit(self, X_train, y_train, clf=None):
+        if clf is not None:
+            self.meta_classifier = clf
+        self.meta_classifier.fit(X_train, y_train)
+        return self.meta_classifier
+
+    # second layer
+    def second_layer(self, X_train, y_train, clf=None):
+        return self.fit(X_train, y_train, clf)
+
+    def predict(self, X_test, clf=None, type="label"):
+        if clf is None:
+            clf = self.meta_classifier
+        if type == "proba":
+            return clf.predict_proba(X_test)
+        elif type == "label":
+            return clf.predict(X_test)
+
+    def get_accuracy(self, y_true, y_pred):
+        accuracy = metrics.accuracy_score(y_true, y_pred) * 100
+        return accuracy
+
+    def performance(self, y_true, y_pred):
+        accuracy = self.get_accuracy(y_true, y_pred)
+        confusion = confusion_matrix(y_true, y_pred)
+        report = classification_report(y_true, y_pred)
+        print("stacked-model accuracy: {}".format(accuracy))
+        print("confusion matrix:\n{}".format(confusion))
+        print("classification report:\n{}".format(report))
+        return confusion, report
+
+
+# With stacking, the level-one out-of-fold predictions become the
+# training features for the level-two (meta) classifier.
+if __name__ == "__main__":
+    # load the dataset and split train/test
+    data = load_digits()
+    data_D = preprocessing.StandardScaler().fit_transform(data.data)
+    data_L = data.target
+    X_train, X_test, y_train, y_test = train_test_split(data_D, data_L, random_state=100, test_size=0.7)
+    print(set(y_train))
+
+    # baseline: a single classifier on its own
+    clf_meta = RandomForestClassifier()
+    clf_meta.fit(X_train, y_train)
+    pred = clf_meta.predict(X_test)
+    accuracy = metrics.accuracy_score(y_test, pred) * 100
+    print("====================", accuracy)
+    # 91.0969793323
+
+    # layer 1: fuse several base models
+    modelist = ['SVM', 'GBDT', 'RF', 'KNN']
+    stacking_clf = StackingClassifier(modelist)
+    X_train_stacking, X_test_stacking = stacking_clf.first_layer(X_train, y_train, X_test)
+    print("shape of X_train_stacking {}".format(X_train_stacking.shape))
+    print("shape of X_test_stacking {}".format(X_test_stacking.shape))
+
+    # layer 2: train a single meta model
+    RF = stacking_clf.SelectModel(modelname="RF")
+    clf = stacking_clf.second_layer(X_train_stacking, y_train, clf=RF)
+    pred = stacking_clf.predict(X_test_stacking)
+
+    # evaluation
+    stacking_clf.performance(y_test, pred)
+    # 96.4228934817
modelname == "KNN": + from sklearn.neighbors import KNeighborsClassifier as knn + clf = knn() + + elif modelname == "MNB": + from sklearn.naive_bayes import MultinomialNB + clf = MultinomialNB() + else: + pass + return clf + + def performance(self, y_true, y_pred, modelname=""): + accuracy = metrics.accuracy_score(y_true, y_pred)*100 + confusion = confusion_matrix(y_true, y_pred) + report = classification_report(y_true, y_pred) + print("模型{}预测accuracy:{}".format(modelname, accuracy)) + print("混淆矩阵:\n{}".format(confusion)) + print("预测结果:\n{}".format(report)) + return confusion, report + + +class StackingClassifier(object): + + def __init__(self, classifiers, meta_classifier, + use_clones=True, n_folds=2, + n_classes=2, random_state=100, + sample_weight=None, use_probas=True): + + self.classifiers = classifiers + self.meta_classifier = meta_classifier + self.use_clones=use_clones + self.n_folds = n_folds + self.n_classes = n_classes + self.random_state = random_state + self.sample_weight = sample_weight + self.use_probas = use_probas + + def cross_valid_oof(self, clf, X, y, n_folds): + """返回CV预测结果 + """ + ntrain = X.shape[0] + n_classes = self.n_classes + random_state = self.random_state + oof_features = np.zeros((ntrain, n_classes)) + oof_pred = np.zeros(ntrain) + kf = KFold(n_splits=n_folds, random_state=random_state) + for i,(train_index, test_index) in enumerate(kf.split(X)): + kf_X_train = X[train_index] # 数据 + kf_y_train = y[train_index] # 标签 + + kf_X_test = X[test_index] # k-fold的验证集 + + clf.fit(kf_X_train, kf_y_train) + if not self.use_probas: + oof_features[test_index] = clf.predict(kf_X_test) + else: + oof_features[test_index] = clf.predict_proba(kf_X_test) + oof_pred[test_index] = clf.predict(kf_X_test) + print("fold-{i}: oof_features: {a}, cv-oof accuracy:{c}".format(i=i, + a=oof_features.shape, + c=self.get_accuracy(y[test_index], oof_pred[test_index]))) + return oof_features + + def fit(self, X, y): + self.clfs_ = self.classifiers + self.meta_clf_ = self.meta_classifier + + n_folds = self.n_folds + sample_weight = self.sample_weight + meta_features = None + + #feature layer + for name, sub_clf in self.clfs_.items(): + print("feature layer, current model: {}".format(name)) + meta_prediction = self.cross_valid_oof(sub_clf, X, y, n_folds) + if meta_features is None: + meta_features = meta_prediction + else: + meta_features = np.column_stack((meta_features, meta_prediction)) + + for name, model in self.clfs_.items(): + print("fit base model using all train set: {}".format(name)) + if sample_weight is None: + model.fit(X, y) + else: + model.fit(X, y, sample_weight=sample_weight) + + #meta layer + if sample_weight is None: + self.meta_clf_.fit(meta_features, y) + else: + self.meta_clf_.fit(meta_features, y, sample_weight=sample_weight) + + return self + + def predict_meta_features(self, X): + """ Get meta-features of test-data. 
+# Stacking usage example
+if __name__ == "__main__":
+    # load the dataset and split train/test
+    data = load_digits()
+    data_D = preprocessing.StandardScaler().fit_transform(data.data)
+    data_L = data.target
+    X_train, X_test, y_train, y_test = train_test_split(data_D, data_L, random_state=100, test_size=0.7)
+    print(set(y_train))
+
+    # layer 1: several base models to fuse
+    classifiers = {
+        'KNN': SubClassifier().SelectModel(modelname="KNN"),
+        'rf': SubClassifier().SelectModel(modelname="RF"),
+        'svm': SubClassifier().SelectModel(modelname="SVM"),
+        'GBDT': SubClassifier().SelectModel(modelname="GBDT")
+    }
+
+    meta_classifier = SubClassifier().SelectModel(modelname="RF")
+
+    stacking_clf = StackingClassifier(classifiers, meta_classifier, n_classes=10, n_folds=5)
+
+    stacking_clf.fit(X_train, y_train)
+    pred = stacking_clf.predict(X_test)
+
+    # model evaluation
+    stacking_clf.performance(y_test, pred)
+    # 96.4228934817
\ No newline at end of file
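A small follow-on usage sketch, reusing the names from the `__main__` block above (not part of the committed file): since the meta classifier is a random forest, the stack can also emit class probabilities, which `sklearn.metrics.log_loss` can score.

```python
from sklearn.metrics import log_loss

proba = stacking_clf.predict_prob(X_test)  # meta model must support predict_proba
print("test log-loss:", log_loss(y_test, proba))
```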