|
129 | 129 | "from IPython.display import Image"
|
130 | 130 | ]
|
131 | 131 | },
|
| 132 | + { |
| 133 | + "cell_type": "code", |
| 134 | + "execution_count": null, |
| 135 | + "metadata": { |
| 136 | + "collapsed": true |
| 137 | + }, |
| 138 | + "outputs": [], |
| 139 | + "source": [ |
| 140 | + "%matplotlib inline" |
| 141 | + ] |
| 142 | + }, |
132 | 143 | {
|
133 | 144 | "cell_type": "markdown",
|
134 | 145 | "metadata": {},
|
|
347 | 358 | "source": [
|
348 | 359 | "import pandas as pd\n",
|
349 | 360 | "\n",
|
350 |
| - "df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data', header=None)\n", |
351 |
| - "df.head()" |
| 361 | + "df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases'\n", |
| 362 | + " '/breast-cancer-wisconsin/wdbc.data', header=None)" |
352 | 363 | ]
|
353 | 364 | },
|
354 | 365 | {
|
|
631 | 642 | "from sklearn.cross_validation import train_test_split\n",
|
632 | 643 | "\n",
|
633 | 644 | "X_train, X_test, y_train, y_test = \\\n",
|
634 |
| - " train_test_split(X, y, test_size=0.20, random_state=1)" |
| 645 | + " train_test_split(X, y, test_size=0.20, random_state=1)" |
635 | 646 | ]
|
636 | 647 | },
|
637 | 648 | {
|
|
671 | 682 | "from sklearn.pipeline import Pipeline\n",
|
672 | 683 | "\n",
|
673 | 684 | "pipe_lr = Pipeline([('scl', StandardScaler()),\n",
|
674 |
| - " ('pca', PCA(n_components=2)),\n", |
675 |
| - " ('clf', LogisticRegression(random_state=1))])\n", |
| 685 | + " ('pca', PCA(n_components=2)),\n", |
| 686 | + " ('clf', LogisticRegression(random_state=1))])\n", |
676 | 687 | "\n",
|
677 | 688 | "pipe_lr.fit(X_train, y_train)\n",
|
678 | 689 | "print('Test Accuracy: %.3f' % pipe_lr.score(X_test, y_test))\n",
|
|
843 | 854 | " pipe_lr.fit(X_train[train], y_train[train])\n",
|
844 | 855 | " score = pipe_lr.score(X_train[test], y_train[test])\n",
|
845 | 856 | " scores.append(score)\n",
|
846 |
| - " print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k+1, np.bincount(y_train[train]), score))\n", |
| 857 | + " print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k+1,\n", |
| 858 | + " np.bincount(y_train[train]), score))\n", |
847 | 859 | " \n",
|
848 | 860 | "print('\\nCV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))"
|
849 | 861 | ]
|
|
868 | 880 | "source": [
|
869 | 881 | "from sklearn.cross_validation import cross_val_score\n",
|
870 | 882 | "\n",
|
871 |
| - "scores = cross_val_score(estimator=pipe_lr, \n", |
872 |
| - " X=X_train, \n", |
873 |
| - " y=y_train, \n", |
| 883 | + "scores = cross_val_score(estimator=pipe_lr,\n", |
| 884 | + " X=X_train,\n", |
| 885 | + " y=y_train,\n", |
874 | 886 | " cv=10,\n",
|
875 | 887 | " n_jobs=1)\n",
|
876 | 888 | "print('CV accuracy scores: %s' % scores)\n",
|
|
953 | 965 | }
|
954 | 966 | ],
|
955 | 967 | "source": [
|
956 |
| - "%matplotlib inline\n", |
957 | 968 | "import matplotlib.pyplot as plt\n",
|
958 | 969 | "from sklearn.learning_curve import learning_curve\n",
|
959 | 970 | "\n",
|
960 | 971 | "pipe_lr = Pipeline([('scl', StandardScaler()),\n",
|
961 |
| - " ('clf', LogisticRegression(penalty='l2', random_state=0))])\n", |
| 972 | + " ('clf', LogisticRegression(penalty='l2', random_state=0))])\n", |
962 | 973 | "\n",
|
963 | 974 | "train_sizes, train_scores, test_scores =\\\n",
|
964 |
| - " learning_curve(estimator=pipe_lr, \n", |
965 |
| - " X=X_train, \n", |
966 |
| - " y=y_train, \n", |
967 |
| - " train_sizes=np.linspace(0.1, 1.0, 10), \n", |
968 |
| - " cv=10,\n", |
969 |
| - " n_jobs=1)\n", |
| 975 | + " learning_curve(estimator=pipe_lr,\n", |
| 976 | + " X=X_train,\n", |
| 977 | + " y=y_train,\n", |
| 978 | + " train_sizes=np.linspace(0.1, 1.0, 10),\n", |
| 979 | + " cv=10,\n", |
| 980 | + " n_jobs=1)\n", |
970 | 981 | "\n",
|
971 | 982 | "train_mean = np.mean(train_scores, axis=1)\n",
|
972 | 983 | "train_std = np.std(train_scores, axis=1)\n",
|
973 | 984 | "test_mean = np.mean(test_scores, axis=1)\n",
|
974 | 985 | "test_std = np.std(test_scores, axis=1)\n",
|
975 | 986 | "\n",
|
976 |
| - "plt.plot(train_sizes, train_mean, \n", |
977 |
| - " color='blue', marker='o', \n", |
| 987 | + "plt.plot(train_sizes, train_mean,\n", |
| 988 | + " color='blue', marker='o',\n", |
978 | 989 | " markersize=5, label='training accuracy')\n",
|
979 | 990 | "\n",
|
980 |
| - "plt.fill_between(train_sizes, \n", |
| 991 | + "plt.fill_between(train_sizes,\n", |
981 | 992 | " train_mean + train_std,\n",
|
982 |
| - " train_mean - train_std, \n", |
| 993 | + " train_mean - train_std,\n", |
983 | 994 | " alpha=0.15, color='blue')\n",
|
984 | 995 | "\n",
|
985 |
| - "plt.plot(train_sizes, test_mean, \n", |
986 |
| - " color='green', linestyle='--', \n", |
987 |
| - " marker='s', markersize=5, \n", |
| 996 | + "plt.plot(train_sizes, test_mean,\n", |
| 997 | + " color='green', linestyle='--',\n", |
| 998 | + " marker='s', markersize=5,\n", |
988 | 999 | " label='validation accuracy')\n",
|
989 | 1000 | "\n",
|
990 |
| - "plt.fill_between(train_sizes, \n", |
| 1001 | + "plt.fill_between(train_sizes,\n", |
991 | 1002 | " test_mean + test_std,\n",
|
992 |
| - " test_mean - test_std, \n", |
| 1003 | + " test_mean - test_std,\n", |
993 | 1004 | " alpha=0.15, color='green')\n",
|
994 | 1005 | "\n",
|
995 | 1006 | "plt.grid()\n",
|
|
1231 | 1242 | }
|
1232 | 1243 | ],
|
1233 | 1244 | "source": [
|
1234 |
| - "gs = GridSearchCV(estimator=pipe_svc, \n", |
1235 |
| - " param_grid=param_grid, \n", |
1236 |
| - " scoring='accuracy', \n", |
1237 |
| - " cv=2)\n", |
| 1245 | + "gs = GridSearchCV(estimator=pipe_svc,\n", |
| 1246 | + " param_grid=param_grid,\n", |
| 1247 | + " scoring='accuracy',\n", |
| 1248 | + " cv=2)\n", |
1238 | 1249 | "\n",
|
1239 | 1250 | "# Note: Optionally, you could use cv=2 \n",
|
1240 | 1251 | "# in the GridSearchCV above to produce\n",
|
|
1261 | 1272 | ],
|
1262 | 1273 | "source": [
|
1263 | 1274 | "from sklearn.tree import DecisionTreeClassifier\n",
|
1264 |
| - "gs = GridSearchCV(estimator=DecisionTreeClassifier(random_state=0), \n", |
1265 |
| - " param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, None]}], \n", |
1266 |
| - " scoring='accuracy', \n", |
1267 |
| - " cv=2)\n", |
| 1275 | + "gs = GridSearchCV(estimator=DecisionTreeClassifier(random_state=0),\n", |
| 1276 | + " param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, None]}],\n", |
| 1277 | + " scoring='accuracy',\n", |
| 1278 | + " cv=2)\n", |
1268 | 1279 | "scores = cross_val_score(gs, X_train, y_train, scoring='accuracy', cv=5)\n",
|
1269 | 1280 | "print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))"
|
1270 | 1281 | ]
|
|
1561 | 1572 | }
|
1562 | 1573 | ],
|
1563 | 1574 | "source": [
|
1564 |
| - "from sklearn.metrics import make_scorer, f1_score\n", |
| 1575 | + "from sklearn.metrics import make_scorer\n", |
1565 | 1576 | "\n",
|
1566 | 1577 | "scorer = make_scorer(f1_score, pos_label=0)\n",
|
1567 | 1578 | "\n",
|
1568 | 1579 | "c_gamma_range = [0.01, 0.1, 1.0, 10.0]\n",
|
1569 | 1580 | "\n",
|
1570 |
| - "param_grid = [{'clf__C': c_gamma_range, \n", |
| 1581 | + "param_grid = [{'clf__C': c_gamma_range,\n", |
1571 | 1582 | " 'clf__kernel': ['linear']},\n",
|
1572 |
| - " {'clf__C': c_gamma_range, \n", |
1573 |
| - " 'clf__gamma': c_gamma_range, \n", |
1574 |
| - " 'clf__kernel': ['rbf'],}]\n", |
| 1583 | + " {'clf__C': c_gamma_range,\n", |
| 1584 | + " 'clf__gamma': c_gamma_range,\n", |
| 1585 | + " 'clf__kernel': ['rbf']}]\n", |
1575 | 1586 | "\n",
|
1576 |
| - "gs = GridSearchCV(estimator=pipe_svc, \n", |
1577 |
| - " param_grid=param_grid, \n", |
1578 |
| - " scoring=scorer, \n", |
1579 |
| - " cv=10,\n", |
1580 |
| - " n_jobs=-1)\n", |
| 1587 | + "gs = GridSearchCV(estimator=pipe_svc,\n", |
| 1588 | + " param_grid=param_grid,\n", |
| 1589 | + " scoring=scorer,\n", |
| 1590 | + " cv=10,\n", |
| 1591 | + " n_jobs=-1)\n", |
1581 | 1592 | "gs = gs.fit(X_train, y_train)\n",
|
1582 | 1593 | "print(gs.best_score_)\n",
|
1583 | 1594 | "print(gs.best_params_)"
|
|
1637 | 1648 | "all_tpr = []\n",
|
1638 | 1649 | "\n",
|
1639 | 1650 | "for i, (train, test) in enumerate(cv):\n",
|
1640 |
| - " probas = pipe_lr.fit(X_train2[train], \n", |
| 1651 | + " probas = pipe_lr.fit(X_train2[train],\n", |
1641 | 1652 | " y_train[train]).predict_proba(X_train2[test])\n",
|
1642 |
| - " \n", |
1643 |
| - " fpr, tpr, thresholds = roc_curve(y_train[test], \n", |
1644 |
| - " probas[:, 1], \n", |
| 1653 | + "\n", |
| 1654 | + " fpr, tpr, thresholds = roc_curve(y_train[test],\n", |
| 1655 | + " probas[:, 1],\n", |
1645 | 1656 | " pos_label=1)\n",
|
1646 | 1657 | " mean_tpr += interp(mean_fpr, fpr, tpr)\n",
|
1647 | 1658 | " mean_tpr[0] = 0.0\n",
|
1648 | 1659 | " roc_auc = auc(fpr, tpr)\n",
|
1649 |
| - " plt.plot(fpr, \n", |
1650 |
| - " tpr, \n", |
1651 |
| - " lw=1, \n", |
1652 |
| - " label='ROC fold %d (area = %0.2f)' \n", |
1653 |
| - " % (i+1, roc_auc))\n", |
| 1660 | + " plt.plot(fpr,\n", |
| 1661 | + " tpr,\n", |
| 1662 | + " lw=1,\n", |
| 1663 | + " label='ROC fold %d (area = %0.2f)'\n", |
| 1664 | + " % (i+1, roc_auc))\n", |
1654 | 1665 | "\n",
|
1655 |
| - "plt.plot([0, 1], \n", |
1656 |
| - " [0, 1], \n", |
1657 |
| - " linestyle='--', \n", |
1658 |
| - " color=(0.6, 0.6, 0.6), \n", |
| 1666 | + "plt.plot([0, 1],\n", |
| 1667 | + " [0, 1],\n", |
| 1668 | + " linestyle='--',\n", |
| 1669 | + " color=(0.6, 0.6, 0.6),\n", |
1659 | 1670 | " label='random guessing')\n",
|
1660 | 1671 | "\n",
|
1661 | 1672 | "mean_tpr /= len(cv)\n",
|
1662 | 1673 | "mean_tpr[-1] = 1.0\n",
|
1663 | 1674 | "mean_auc = auc(mean_fpr, mean_tpr)\n",
|
1664 | 1675 | "plt.plot(mean_fpr, mean_tpr, 'k--',\n",
|
1665 | 1676 | " label='mean ROC (area = %0.2f)' % mean_auc, lw=2)\n",
|
1666 |
| - "plt.plot([0, 0, 1], \n", |
1667 |
| - " [0, 1, 1], \n", |
1668 |
| - " lw=2, \n", |
1669 |
| - " linestyle=':', \n", |
1670 |
| - " color='black', \n", |
| 1677 | + "plt.plot([0, 0, 1],\n", |
| 1678 | + " [0, 1, 1],\n", |
| 1679 | + " lw=2,\n", |
| 1680 | + " linestyle=':',\n", |
| 1681 | + " color='black',\n", |
1671 | 1682 | " label='perfect performance')\n",
|
1672 | 1683 | "\n",
|
1673 | 1684 | "plt.xlim([-0.05, 1.05])\n",
|
|
0 commit comments