From 897267b8609d6bc6b5815a3eb80c5f9f8ccb197f Mon Sep 17 00:00:00 2001 From: csuldw Date: Sun, 1 Sep 2019 14:26:19 +0800 Subject: [PATCH 1/7] add wx photo --- .DS_Store | Bin 0 -> 6148 bytes README.md | 8 +++++++- ...xperiments with a New Boosting Algorithm.pdf} | Bin 3 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 .DS_Store rename doc/{boostingexperiments.pdf => 1996 Experiments with a New Boosting Algorithm.pdf} (100%) diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..bffc98b573f7e579f5881366afafadada05bde95 GIT binary patch literal 6148 zcmeHK%}N6?5T3ME)2i5mpx*Z4tq0eC55iLG!JDw62Nm75i!RiSbhj3*m3!kIdeM-HUn%KAZ1yWFRDwu-iEtXb5?h@`!%QMx?9*;U)${W%W`4a-95OxeR%9W^`Bo1%>utj zEprwJ@QTKdJHGeZNlPWy7&FKp%m6bmJ`C6!PEL={mUt)305kAQ4AA}Hpb~l( zbA$Toz=nPwX}m;8f;PP+2px-_#oQo{pa_$SXi|lJVhEFte#gdn7IT9p9fV#P=dmjb z`$7?Vb@V$r9E4|(TV{Y67-pbox>c(G`#-<`hm*L+3@`&@#egU^{6-zOWNYil=BU;> tsF$cD6qg(POhH4JVvMCyyojm={SFz3p2gfCdQkXBK-0htGw`Pjd;qX|ReS&d literal 0 HcmV?d00001 diff --git a/README.md b/README.md index 6ed389f..b158257 100644 --- a/README.md +++ b/README.md @@ -38,9 +38,15 @@ ## Contributor -- 刘帝伟, 中南大学14级硕士,关注AI、机器学习、深度学习方向,[HomePage](http://www.csuldw.com). +- 刘帝伟, CSU硕士毕业,关注AI、机器学习、深度学习方向,[HomePage](http://www.csuldw.com). ## Contact +如果有任何疑问,可在我的微信公众号后台留言: + +![](http://www.csuldw.com/assets/articleImg/2019/code-main-fun.png) + +或是发邮件吧: + - E-mail: csu.ldw@csu.edu.cn diff --git a/doc/boostingexperiments.pdf b/doc/1996 Experiments with a New Boosting Algorithm.pdf similarity index 100% rename from doc/boostingexperiments.pdf rename to doc/1996 Experiments with a New Boosting Algorithm.pdf From f9cf3f2582d1bfbb18dcc25bc1b1d1a67822a8e9 Mon Sep 17 00:00:00 2001 From: csuldw Date: Sun, 1 Sep 2019 14:28:33 +0800 Subject: [PATCH 2/7] add wx photo --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index b158257..5b452ae 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,9 @@ ![](http://www.csuldw.com/assets/articleImg/2019/code-main-fun.png) + +
Fig 1:推荐系统整体结构.
+ 或是发邮件吧: - E-mail: csu.ldw@csu.edu.cn From 44cf0c0f576e550ff6f17ba854ff71c82f9b4ebb Mon Sep 17 00:00:00 2001 From: csuldw Date: Sun, 1 Sep 2019 14:29:40 +0800 Subject: [PATCH 3/7] add wx photo --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 5b452ae..0f9f865 100644 --- a/README.md +++ b/README.md @@ -45,10 +45,10 @@ 如果有任何疑问,可在我的微信公众号后台留言: -![](http://www.csuldw.com/assets/articleImg/2019/code-main-fun.png) - + +
-
Fig 1:推荐系统整体结构.
+
或是发邮件吧: From 97c5aa0811e58f8ff635797b2382fa9949f01491 Mon Sep 17 00:00:00 2001 From: csuldw Date: Sun, 1 Sep 2019 14:31:08 +0800 Subject: [PATCH 4/7] add wx photo --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0f9f865..a47b09c 100644 --- a/README.md +++ b/README.md @@ -46,9 +46,10 @@ 如果有任何疑问,可在我的微信公众号后台留言: -
+ +
-
+ 或是发邮件吧: From a93595a55f860fcc91acbc35d40836d275cdcee7 Mon Sep 17 00:00:00 2001 From: csuldw Date: Sun, 1 Sep 2019 14:32:12 +0800 Subject: [PATCH 5/7] add wx photo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a47b09c..7831611 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@
- +
或是发邮件吧: From 9a5daa063034878b7fd7c594a48aed0fc8a23606 Mon Sep 17 00:00:00 2001 From: csuldw Date: Wed, 16 Oct 2019 00:32:12 +0800 Subject: [PATCH 6/7] add stacking model draft --- draft/stacking.py | 181 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 181 insertions(+) create mode 100644 draft/stacking.py diff --git a/draft/stacking.py b/draft/stacking.py new file mode 100644 index 0000000..915d778 --- /dev/null +++ b/draft/stacking.py @@ -0,0 +1,181 @@ +from sklearn.model_selection import KFold +from sklearn.model_selection import train_test_split +from sklearn.datasets import load_digits +import numpy as np +from sklearn.svm import SVC +from sklearn import metrics +from sklearn.ensemble import RandomForestClassifier +from sklearn import preprocessing +import pandas as pd +from functools import reduce +from sklearn.metrics import confusion_matrix, classification_report + +class StackingClassifier(object): + + def __init__(self, modellist=[], meta_classifier=None): + self.modellist = modellist + if meta_classifier == None: + from sklearn.linear_model import LogisticRegression + meta_classifier = LogisticRegression() + self.meta_classifier = meta_classifier + + def SelectModel(self, modelname): + + if modelname == "SVM": + from sklearn.svm import SVC + model = SVC(kernel='rbf', C=16, gamma=0.125,probability=True) + + elif modelname == "lr": + from sklearn.linear_model import LogisticRegression + model = LogisticRegression() + + elif modelname == "GBDT": + from sklearn.ensemble import GradientBoostingClassifier + model = GradientBoostingClassifier() + + elif modelname == "RF": + from sklearn.ensemble import RandomForestClassifier + model = RandomForestClassifier() + + elif modelname == "xgboost": + from xgboost import XGBClassifier + model = XGBClassifier( + learning_rate=0.01, + n_estimators=1000, + max_depth=4, + min_child_weight=3, + gamma=0.1, + subsample=0.8, + colsample_bytree=0.8, + reg_alpha=1, + objective='binary:logistic', #multi:softmax + nthread=8, + scale_pos_weight=1, + seed=27, + random_state=27 + ) + elif modelname == "KNN": + from sklearn.neighbors import KNeighborsClassifier as knn + model = knn() + + elif modelname == "MNB": + from sklearn.naive_bayes import MultinomialNB + model = MultinomialNB() + else: + pass + return model + + def get_oof(self, clf, n_folds, X_train, y_train, X_test): + ntrain = X_train.shape[0] + ntest = X_test.shape[0] + print("kfolds: ", ntrain, ntest) + classnum = len(np.unique(y_train)) + kf = KFold(n_splits=n_folds,random_state=1) + oof_train = np.zeros((ntrain,classnum)) + oof_test = np.zeros((ntest,classnum)) + + for i,(train_index, test_index) in enumerate(kf.split(X_train)): + kf_X_train = X_train[train_index] # 数据 + kf_y_train = y_train[train_index] # 标签 + + kf_X_test = X_train[test_index] # k-fold的验证集 + + clf.fit(kf_X_train, kf_y_train) + oof_train[test_index] = clf.predict_proba(kf_X_test) + # print("shape of oof_train:", oof_train[test_index].shape) + + print("fold{i}: oof_train: {a}, oof_test:{b}".format(i=i, a=oof_train.shape, b=oof_test.shape)) + oof_test += clf.predict_proba(X_test) + oof_test = oof_test/float(n_folds) + print("oof_train: {a}, oof_test:{b}".format(a=oof_train.shape, b=oof_test.shape)) + return oof_train, oof_test + + def first_layer(self, X_train, y_train, X_test, modellist=None): + """modellist 需要重新修改 + """ + newfeature_list = [] + newtestdata_list = [] + for modelname in self.modellist: + sub_clf = self.SelectModel(modelname) + oof_train_, oof_test_= self.get_oof(clf=sub_clf, + n_folds=5, + X_train=X_train, + y_train=y_train, + X_test=X_test) + print("oof_train: ", oof_train_.shape) + print("model-{}".format(modelname),len(oof_train_), len(oof_test_)) + newfeature_list.append(oof_train_) + print("newfeature_list: ", len(newfeature_list)) + newtestdata_list.append(oof_test_) + + # 特征组合 + X_train_stacking = reduce(lambda x,y:np.concatenate((x,y),axis=1),newfeature_list) + X_test_stacking = reduce(lambda x,y:np.concatenate((x,y),axis=1),newtestdata_list) + + return X_train_stacking, X_test_stacking + + def fit(self, X_train, y_train, clf=None): + if clf != None: + self.meta_classifier = clf + self.meta_classifier.fit(X_train, y_train) + return self.meta_classifier + + #second_layer + def second_layer(self, X_train, y_train, clf=None): + return self.fit(X_train, y_train, clf) + + def predict(self, X_test, clf=None, type="label"): + if clf == None: + clf = self.meta_classifier + if type == "proba": + return clf.predict_proba(X_test) + elif type == "label": + return clf.predict(X_test) + + def get_accuracy(self, y_true, y_pred): + accuracy = metrics.accuracy_score(y_true, y_pred)*100 + return accuracy + + def performance(self, y_true, y_pred): + accuracy = self.get_accuracy(y_true, y_pred) + confusion = confusion_matrix(y_true, y_pred) + report = classification_report(y_true, y_pred) + print("多模型融合预测accuracy:{}".format(accuracy)) + print("混淆矩阵:\n{}".format(confusion)) + print("预测结果:\n{}".format(report)) + return confusion, report + + +# 使用stacking方法的时候 +# 第一级,重构特征当做第二级的训练集 +if __name__ == "__main__": + # 导入数据集切割训练与测试数据 + data = load_digits() + data_D = preprocessing.StandardScaler().fit_transform(data.data) + data_L = data.target + X_train, X_test, y_train, y_test = train_test_split(data_D,data_L,random_state=100,test_size=0.7) + print(set(y_train)) + + # 单纯使用一个分类器的时候 + clf_meta = RandomForestClassifier() + clf_meta.fit(X_train, y_train) + pred = clf_meta.predict(X_test) + accuracy = metrics.accuracy_score(y_test, pred)*100 + print ("====================", accuracy) + # 91.0969793323 + + #layer 1:多模型融合 + modelist = ['SVM', 'GBDT', 'RF', 'KNN'] + stacking_clf = StackingClassifier(modelist) + X_train_stacking, X_test_stacking = stacking_clf.first_layer(X_train, y_train, X_test) + print("shape of X_train_stacking {}".format(X_train_stacking.shape)) + print("shape of X_test_stacking {}".format(X_test_stacking.shape)) + + #layer 2: 单模型训练 + RF = stacking_clf.SelectModel(modelname="RF") + clf = stacking_clf.second_layer(X_train_stacking, y_train, clf=RF) + pred = stacking_clf.predict(X_test_stacking) + + #模型评估 + stacking_clf.performance(y_test, pred) + # 96.4228934817 From 5f1569ad6a3ca015b8c07f76a19e8b44462bb6cd Mon Sep 17 00:00:00 2001 From: csuldw Date: Sat, 19 Oct 2019 21:52:26 +0800 Subject: [PATCH 7/7] add stacking --- .DS_Store | Bin 6148 -> 6148 bytes stacking/stacking.py | 241 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 241 insertions(+) create mode 100644 stacking/stacking.py diff --git a/.DS_Store b/.DS_Store index bffc98b573f7e579f5881366afafadada05bde95..bc267768983cb470703ff7c8ba16d687b7b1fb40 100644 GIT binary patch delta 51 zcmZoMXfc@J&&a$nU^gQp^W=X_GMhD+7#RgQ7>XH67!nzh8L}BN8S)s?H%l>}W!cQm H@s}R}WbzGn delta 38 ucmZoMXfc@J&&aefU^nAr0}+