|
123 | 123 | "from IPython.display import Image"
|
124 | 124 | ]
|
125 | 125 | },
|
| 126 | + { |
| 127 | + "cell_type": "code", |
| 128 | + "execution_count": null, |
| 129 | + "metadata": { |
| 130 | + "collapsed": true |
| 131 | + }, |
| 132 | + "outputs": [], |
| 133 | + "source": [ |
| 134 | + "%matplotlib inline" |
| 135 | + ] |
| 136 | + }, |
126 | 137 | {
|
127 | 138 | "cell_type": "markdown",
|
128 | 139 | "metadata": {},
|
|
167 | 178 | ],
|
168 | 179 | "source": [
|
169 | 180 | "import matplotlib.pyplot as plt\n",
|
170 |
| - "%matplotlib inline\n", |
171 |
| - "plt.scatter(X[:,0], X[:,1], c='white', marker='o', s=50)\n", |
| 181 | + "\n", |
| 182 | + "plt.scatter(X[:, 0], X[:, 1], c='white', marker='o', s=50)\n", |
172 | 183 | "plt.grid()\n",
|
173 | 184 | "plt.tight_layout()\n",
|
174 | 185 | "#plt.savefig('./figures/spheres.png', dpi=300)\n",
|
|
195 | 206 | ],
|
196 | 207 | "source": [
|
197 | 208 | "from sklearn.cluster import KMeans\n",
|
| 209 | + "\n", |
198 | 210 | "km = KMeans(n_clusters=3, \n",
|
199 | 211 | " init='random', \n",
|
200 | 212 | " n_init=10, \n",
|
|
203 | 215 | " random_state=0)\n",
|
204 | 216 | "y_km = km.fit_predict(X)\n",
|
205 | 217 | "\n",
|
206 |
| - "plt.scatter(X[y_km==0,0], \n", |
207 |
| - " X[y_km==0,1], \n", |
208 |
| - " s=50, \n", |
209 |
| - " c='lightgreen', \n", |
210 |
| - " marker='s', \n", |
| 218 | + "plt.scatter(X[y_km == 0, 0],\n", |
| 219 | + " X[y_km == 0, 1],\n", |
| 220 | + " s=50,\n", |
| 221 | + " c='lightgreen',\n", |
| 222 | + " marker='s',\n", |
211 | 223 | " label='cluster 1')\n",
|
212 |
| - "plt.scatter(X[y_km==1,0], \n", |
213 |
| - " X[y_km==1,1], \n", |
214 |
| - " s=50, \n", |
215 |
| - " c='orange', \n", |
216 |
| - " marker='o', \n", |
| 224 | + "plt.scatter(X[y_km == 1, 0],\n", |
| 225 | + " X[y_km == 1, 1],\n", |
| 226 | + " s=50,\n", |
| 227 | + " c='orange',\n", |
| 228 | + " marker='o',\n", |
217 | 229 | " label='cluster 2')\n",
|
218 |
| - "plt.scatter(X[y_km==2,0], \n", |
219 |
| - " X[y_km==2,1], \n", |
220 |
| - " s=50, \n", |
221 |
| - " c='lightblue', \n", |
222 |
| - " marker='v', \n", |
| 230 | + "plt.scatter(X[y_km == 2, 0],\n", |
| 231 | + " X[y_km == 2, 1],\n", |
| 232 | + " s=50,\n", |
| 233 | + " c='lightblue',\n", |
| 234 | + " marker='v',\n", |
223 | 235 | " label='cluster 3')\n",
|
224 |
| - "plt.scatter(km.cluster_centers_[:,0], \n", |
225 |
| - " km.cluster_centers_[:,1], \n", |
226 |
| - " s=250, \n", |
227 |
| - " marker='*', \n", |
228 |
| - " c='red', \n", |
| 236 | + "plt.scatter(km.cluster_centers_[:, 0],\n", |
| 237 | + " km.cluster_centers_[:, 1],\n", |
| 238 | + " s=250,\n", |
| 239 | + " marker='*',\n", |
| 240 | + " c='red',\n", |
229 | 241 | " label='centroids')\n",
|
230 | 242 | "plt.legend()\n",
|
231 | 243 | "plt.grid()\n",
|
|
323 | 335 | " random_state=0)\n",
|
324 | 336 | " km.fit(X)\n",
|
325 | 337 | " distortions.append(km.inertia_)\n",
|
326 |
| - "plt.plot(range(1,11), distortions , marker='o')\n", |
| 338 | + "plt.plot(range(1, 11), distortions, marker='o')\n", |
327 | 339 | "plt.xlabel('Number of clusters')\n",
|
328 | 340 | "plt.ylabel('Distortion')\n",
|
329 | 341 | "plt.tight_layout()\n",
|
|
382 | 394 | "y_ax_lower, y_ax_upper = 0, 0\n",
|
383 | 395 | "yticks = []\n",
|
384 | 396 | "for i, c in enumerate(cluster_labels):\n",
|
385 |
| - " c_silhouette_vals = silhouette_vals[y_km==c]\n", |
| 397 | + " c_silhouette_vals = silhouette_vals[y_km == c]\n", |
386 | 398 | " c_silhouette_vals.sort()\n",
|
387 | 399 | " y_ax_upper += len(c_silhouette_vals)\n",
|
388 | 400 | " color = cm.jet(i / n_clusters)\n",
|
389 | 401 | " plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals, height=1.0, \n",
|
390 |
| - " edgecolor='none', color=color)\n", |
| 402 | + " edgecolor='none', color=color)\n", |
391 | 403 | "\n",
|
392 |
| - " yticks.append((y_ax_lower + y_ax_upper) / 2)\n", |
| 404 | + " yticks.append((y_ax_lower + y_ax_upper) / 2.)\n", |
393 | 405 | " y_ax_lower += len(c_silhouette_vals)\n",
|
394 | 406 | " \n",
|
395 | 407 | "silhouette_avg = np.mean(silhouette_vals)\n",
|
|
430 | 442 | }
|
431 | 443 | ],
|
432 | 444 | "source": [
|
433 |
| - "km = KMeans(n_clusters=2, \n", |
434 |
| - " init='k-means++', \n", |
435 |
| - " n_init=10, \n", |
| 445 | + "km = KMeans(n_clusters=2,\n", |
| 446 | + " init='k-means++',\n", |
| 447 | + " n_init=10,\n", |
436 | 448 | " max_iter=300,\n",
|
437 | 449 | " tol=1e-04,\n",
|
438 | 450 | " random_state=0)\n",
|
439 | 451 | "y_km = km.fit_predict(X)\n",
|
440 | 452 | "\n",
|
441 |
| - "plt.scatter(X[y_km==0,0], \n", |
442 |
| - " X[y_km==0,1], \n", |
443 |
| - " s=50, \n", |
444 |
| - " c='lightgreen', \n", |
445 |
| - " marker='s', \n", |
| 453 | + "plt.scatter(X[y_km == 0, 0],\n", |
| 454 | + " X[y_km == 0, 1],\n", |
| 455 | + " s=50,\n", |
| 456 | + " c='lightgreen',\n", |
| 457 | + " marker='s',\n", |
446 | 458 | " label='cluster 1')\n",
|
447 |
| - "plt.scatter(X[y_km==1,0], \n", |
448 |
| - " X[y_km==1,1], \n", |
449 |
| - " s=50, \n", |
450 |
| - " c='orange', \n", |
451 |
| - " marker='o', \n", |
| 459 | + "plt.scatter(X[y_km == 1, 0],\n", |
| 460 | + " X[y_km == 1, 1],\n", |
| 461 | + " s=50,\n", |
| 462 | + " c='orange',\n", |
| 463 | + " marker='o',\n", |
452 | 464 | " label='cluster 2')\n",
|
453 | 465 | "\n",
|
454 |
| - "plt.scatter(km.cluster_centers_[:,0], km.cluster_centers_[:,1], s=250, marker='*', c='red', label='centroids')\n", |
| 466 | + "plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],\n", |
| 467 | + " s=250, marker='*', c='red', label='centroids')\n", |
455 | 468 | "plt.legend()\n",
|
456 | 469 | "plt.grid()\n",
|
457 | 470 | "plt.tight_layout()\n",
|
|
484 | 497 | "y_ax_lower, y_ax_upper = 0, 0\n",
|
485 | 498 | "yticks = []\n",
|
486 | 499 | "for i, c in enumerate(cluster_labels):\n",
|
487 |
| - " c_silhouette_vals = silhouette_vals[y_km==c]\n", |
| 500 | + " c_silhouette_vals = silhouette_vals[y_km == c]\n", |
488 | 501 | " c_silhouette_vals.sort()\n",
|
489 | 502 | " y_ax_upper += len(c_silhouette_vals)\n",
|
490 | 503 | " color = cm.jet(i / n_clusters)\n",
|
491 | 504 | " plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals, height=1.0, \n",
|
492 |
| - " edgecolor='none', color=color)\n", |
| 505 | + " edgecolor='none', color=color)\n", |
493 | 506 | "\n",
|
494 |
| - " yticks.append((y_ax_lower + y_ax_upper) / 2)\n", |
| 507 | + " yticks.append((y_ax_lower + y_ax_upper) / 2.)\n", |
495 | 508 | " y_ax_lower += len(c_silhouette_vals)\n",
|
496 | 509 | " \n",
|
497 | 510 | "silhouette_avg = np.mean(silhouette_vals)\n",
|
|
624 | 637 | "np.random.seed(123)\n",
|
625 | 638 | "\n",
|
626 | 639 | "variables = ['X', 'Y', 'Z']\n",
|
627 |
| - "labels = ['ID_0','ID_1','ID_2','ID_3','ID_4']\n", |
| 640 | + "labels = ['ID_0', 'ID_1', 'ID_2', 'ID_3', 'ID_4']\n", |
628 | 641 | "\n",
|
629 |
| - "X = np.random.random_sample([5,3])*10\n", |
| 642 | + "X = np.random.random_sample([5, 3])*10\n", |
630 | 643 | "df = pd.DataFrame(X, columns=variables, index=labels)\n",
|
631 | 644 | "df"
|
632 | 645 | ]
|
|
727 | 740 | }
|
728 | 741 | ],
|
729 | 742 | "source": [
|
730 |
| - "from scipy.spatial.distance import pdist,squareform\n", |
| 743 | + "from scipy.spatial.distance import pdist, squareform\n", |
731 | 744 | "\n",
|
732 |
| - "row_dist = pd.DataFrame(squareform(pdist(df, metric='euclidean')), columns=labels, index=labels)\n", |
| 745 | + "row_dist = pd.DataFrame(squareform(pdist(df, metric='euclidean')),\n", |
| 746 | + " columns=labels,\n", |
| 747 | + " index=labels)\n", |
733 | 748 | "row_dist"
|
734 | 749 | ]
|
735 | 750 | },
|
|
813 | 828 | "from scipy.cluster.hierarchy import linkage\n",
|
814 | 829 | "\n",
|
815 | 830 | "row_clusters = linkage(row_dist, method='complete', metric='euclidean')\n",
|
816 |
| - "pd.DataFrame(row_clusters, \n", |
817 |
| - " columns=['row label 1', 'row label 2', 'distance', 'no. of items in clust.'],\n", |
818 |
| - " index=['cluster %d' %(i+1) for i in range(row_clusters.shape[0])])" |
| 831 | + "pd.DataFrame(row_clusters,\n", |
| 832 | + " columns=['row label 1', 'row label 2',\n", |
| 833 | + " 'distance', 'no. of items in clust.'],\n", |
| 834 | + " index=['cluster %d' % (i + 1)\n", |
| 835 | + " for i in range(row_clusters.shape[0])])" |
819 | 836 | ]
|
820 | 837 | },
|
821 | 838 | {
|
|
890 | 907 | "# 2. correct approach: Condensed distance matrix\n",
|
891 | 908 | "\n",
|
892 | 909 | "row_clusters = linkage(pdist(df, metric='euclidean'), method='complete')\n",
|
893 |
| - "pd.DataFrame(row_clusters, \n", |
894 |
| - " columns=['row label 1', 'row label 2', 'distance', 'no. of items in clust.'],\n", |
895 |
| - " index=['cluster %d' %(i+1) for i in range(row_clusters.shape[0])])" |
| 910 | + "pd.DataFrame(row_clusters,\n", |
| 911 | + " columns=['row label 1', 'row label 2',\n", |
| 912 | + " 'distance', 'no. of items in clust.'],\n", |
| 913 | + " index=['cluster %d' % (i + 1)\n", |
| 914 | + " for i in range(row_clusters.shape[0])])" |
896 | 915 | ]
|
897 | 916 | },
|
898 | 917 | {
|
|
965 | 984 | "source": [
|
966 | 985 | "# 3. correct approach: Input sample matrix\n",
|
967 | 986 | "\n",
|
968 |
| - "row_clusters = linkage(df.values, method='complete', metric='euclidean')\n", |
969 |
| - "pd.DataFrame(row_clusters, \n", |
970 |
| - " columns=['row label 1', 'row label 2', 'distance', 'no. of items in clust.'],\n", |
971 |
| - " index=['cluster %d' %(i+1) for i in range(row_clusters.shape[0])])" |
| 987 | + "row_clusters = linkage(df.values, method='complete', metric='euclidean')\n", |
| 988 | + "pd.DataFrame(row_clusters,\n", |
| 989 | + " columns=['row label 1', 'row label 2',\n", |
| 990 | + " 'distance', 'no. of items in clust.'],\n", |
| 991 | + " index=['cluster %d' % (i + 1)\n", |
| 992 | + " for i in range(row_clusters.shape[0])])" |
972 | 992 | ]
|
973 | 993 | },
|
974 | 994 | {
|
|
1042 | 1062 | ],
|
1043 | 1063 | "source": [
|
1044 | 1064 | "# plot row dendrogram\n",
|
1045 |
| - "fig = plt.figure(figsize=(8,8), facecolor='white')\n", |
1046 |
| - "axd = fig.add_axes([0.09,0.1,0.2,0.6])\n", |
| 1065 | + "fig = plt.figure(figsize=(8, 8), facecolor='white')\n", |
| 1066 | + "axd = fig.add_axes([0.09, 0.1, 0.2, 0.6])\n", |
1047 | 1067 | "\n",
|
1048 | 1068 | "# note: for matplotlib < v1.5.1, please use orientation='right'\n",
|
1049 | 1069 | "row_dendr = dendrogram(row_clusters, orientation='left')\n",
|
|
1059 | 1079 | " i.set_visible(False)\n",
|
1060 | 1080 | "\n",
|
1061 | 1081 | "# plot heatmap\n",
|
1062 |
| - "axm = fig.add_axes([0.23,0.1,0.6,0.6]) # x-pos, y-pos, width, height\n", |
| 1082 | + "axm = fig.add_axes([0.23, 0.1, 0.6, 0.6]) # x-pos, y-pos, width, height\n", |
1063 | 1083 | "cax = axm.matshow(df_rowclust, interpolation='nearest', cmap='hot_r')\n",
|
1064 | 1084 | "fig.colorbar(cax)\n",
|
1065 | 1085 | "axm.set_xticklabels([''] + list(df_rowclust.columns))\n",
|
|
1101 | 1121 | "source": [
|
1102 | 1122 | "from sklearn.cluster import AgglomerativeClustering\n",
|
1103 | 1123 | "\n",
|
1104 |
| - "ac = AgglomerativeClustering(n_clusters=2, affinity='euclidean', linkage='complete')\n", |
| 1124 | + "ac = AgglomerativeClustering(n_clusters=2,\n", |
| 1125 | + " affinity='euclidean',\n", |
| 1126 | + " linkage='complete')\n", |
1105 | 1127 | "labels = ac.fit_predict(X)\n",
|
1106 | 1128 | "print('Cluster labels: %s' % labels)"
|
1107 | 1129 | ]
|
|
1170 | 1192 | "from sklearn.datasets import make_moons\n",
|
1171 | 1193 | "\n",
|
1172 | 1194 | "X, y = make_moons(n_samples=200, noise=0.05, random_state=0)\n",
|
1173 |
| - "plt.scatter(X[:,0], X[:,1])\n", |
| 1195 | + "plt.scatter(X[:, 0], X[:, 1])\n", |
1174 | 1196 | "plt.tight_layout()\n",
|
1175 |
| - "#plt.savefig('./figures/moons.png', dpi=300)\n", |
| 1197 | + "# plt.savefig('./figures/moons.png', dpi=300)\n", |
1176 | 1198 | "plt.show()"
|
1177 | 1199 | ]
|
1178 | 1200 | },
|
|
1202 | 1224 | }
|
1203 | 1225 | ],
|
1204 | 1226 | "source": [
|
1205 |
| - "f, (ax1, ax2) = plt.subplots(1, 2, figsize=(8,3))\n", |
| 1227 | + "f, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 3))\n", |
1206 | 1228 | "\n",
|
1207 | 1229 | "km = KMeans(n_clusters=2, random_state=0)\n",
|
1208 | 1230 | "y_km = km.fit_predict(X)\n",
|
1209 |
| - "ax1.scatter(X[y_km==0,0], X[y_km==0,1], c='lightblue', marker='o', s=40, label='cluster 1')\n", |
1210 |
| - "ax1.scatter(X[y_km==1,0], X[y_km==1,1], c='red', marker='s', s=40, label='cluster 2')\n", |
| 1231 | + "ax1.scatter(X[y_km == 0, 0], X[y_km == 0, 1],\n", |
| 1232 | + " c='lightblue', marker='o', s=40, label='cluster 1')\n", |
| 1233 | + "ax1.scatter(X[y_km == 1, 0], X[y_km == 1, 1],\n", |
| 1234 | + " c='red', marker='s', s=40, label='cluster 2')\n", |
1211 | 1235 | "ax1.set_title('K-means clustering')\n",
|
1212 | 1236 | "\n",
|
1213 |
| - "ac = AgglomerativeClustering(n_clusters=2, affinity='euclidean', linkage='complete')\n", |
| 1237 | + "ac = AgglomerativeClustering(n_clusters=2,\n", |
| 1238 | + " affinity='euclidean',\n", |
| 1239 | + " linkage='complete')\n", |
1214 | 1240 | "y_ac = ac.fit_predict(X)\n",
|
1215 |
| - "ax2.scatter(X[y_ac==0,0], X[y_ac==0,1], c='lightblue', marker='o', s=40, label='cluster 1')\n", |
1216 |
| - "ax2.scatter(X[y_ac==1,0], X[y_ac==1,1], c='red', marker='s', s=40, label='cluster 2')\n", |
| 1241 | + "ax2.scatter(X[y_ac == 0, 0], X[y_ac == 0, 1], c='lightblue',\n", |
| 1242 | + " marker='o', s=40, label='cluster 1')\n", |
| 1243 | + "ax2.scatter(X[y_ac == 1, 0], X[y_ac == 1, 1], c='red',\n", |
| 1244 | + " marker='s', s=40, label='cluster 2')\n", |
1217 | 1245 | "ax2.set_title('Agglomerative clustering')\n",
|
1218 | 1246 | "\n",
|
1219 | 1247 | "plt.legend()\n",
|
|
1252 | 1280 | "\n",
|
1253 | 1281 | "db = DBSCAN(eps=0.2, min_samples=5, metric='euclidean')\n",
|
1254 | 1282 | "y_db = db.fit_predict(X)\n",
|
1255 |
| - "plt.scatter(X[y_db==0,0], X[y_db==0,1], c='lightblue', marker='o', s=40, label='cluster 1')\n", |
1256 |
| - "plt.scatter(X[y_db==1,0], X[y_db==1,1], c='red', marker='s', s=40, label='cluster 2')\n", |
| 1283 | + "plt.scatter(X[y_db == 0, 0], X[y_db == 0, 1],\n", |
| 1284 | + " c='lightblue', marker='o', s=40,\n", |
| 1285 | + " label='cluster 1')\n", |
| 1286 | + "plt.scatter(X[y_db == 1, 0], X[y_db == 1, 1],\n", |
| 1287 | + " c='red', marker='s', s=40,\n", |
| 1288 | + " label='cluster 2')\n", |
1257 | 1289 | "plt.legend()\n",
|
1258 | 1290 | "plt.tight_layout()\n",
|
1259 | 1291 | "#plt.savefig('./figures/moons_dbscan.png', dpi=300)\n",
|
|
0 commit comments