Skip to content

Commit b1bbcd8

Browse files
committed
Fix typo on p. 23: "extent" instead of "extend"
1 parent f3d3605 commit b1bbcd8

File tree

5 files changed

+50295
-283
lines changed

5 files changed

+50295
-283
lines changed

code/ch11/ch11.ipynb

+103-71
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,17 @@
123123
"from IPython.display import Image"
124124
]
125125
},
126+
{
127+
"cell_type": "code",
128+
"execution_count": null,
129+
"metadata": {
130+
"collapsed": true
131+
},
132+
"outputs": [],
133+
"source": [
134+
"%matplotlib inline"
135+
]
136+
},
126137
{
127138
"cell_type": "markdown",
128139
"metadata": {},
@@ -167,8 +178,8 @@
167178
],
168179
"source": [
169180
"import matplotlib.pyplot as plt\n",
170-
"%matplotlib inline\n",
171-
"plt.scatter(X[:,0], X[:,1], c='white', marker='o', s=50)\n",
181+
"\n",
182+
"plt.scatter(X[:, 0], X[:, 1], c='white', marker='o', s=50)\n",
172183
"plt.grid()\n",
173184
"plt.tight_layout()\n",
174185
"#plt.savefig('./figures/spheres.png', dpi=300)\n",
@@ -195,6 +206,7 @@
195206
],
196207
"source": [
197208
"from sklearn.cluster import KMeans\n",
209+
"\n",
198210
"km = KMeans(n_clusters=3, \n",
199211
" init='random', \n",
200212
" n_init=10, \n",
@@ -203,29 +215,29 @@
203215
" random_state=0)\n",
204216
"y_km = km.fit_predict(X)\n",
205217
"\n",
206-
"plt.scatter(X[y_km==0,0], \n",
207-
" X[y_km==0,1], \n",
208-
" s=50, \n",
209-
" c='lightgreen', \n",
210-
" marker='s', \n",
218+
"plt.scatter(X[y_km == 0, 0],\n",
219+
" X[y_km == 0, 1],\n",
220+
" s=50,\n",
221+
" c='lightgreen',\n",
222+
" marker='s',\n",
211223
" label='cluster 1')\n",
212-
"plt.scatter(X[y_km==1,0], \n",
213-
" X[y_km==1,1], \n",
214-
" s=50, \n",
215-
" c='orange', \n",
216-
" marker='o', \n",
224+
"plt.scatter(X[y_km == 1, 0],\n",
225+
" X[y_km == 1, 1],\n",
226+
" s=50,\n",
227+
" c='orange',\n",
228+
" marker='o',\n",
217229
" label='cluster 2')\n",
218-
"plt.scatter(X[y_km==2,0], \n",
219-
" X[y_km==2,1], \n",
220-
" s=50, \n",
221-
" c='lightblue', \n",
222-
" marker='v', \n",
230+
"plt.scatter(X[y_km == 2, 0],\n",
231+
" X[y_km == 2, 1],\n",
232+
" s=50,\n",
233+
" c='lightblue',\n",
234+
" marker='v',\n",
223235
" label='cluster 3')\n",
224-
"plt.scatter(km.cluster_centers_[:,0], \n",
225-
" km.cluster_centers_[:,1], \n",
226-
" s=250, \n",
227-
" marker='*', \n",
228-
" c='red', \n",
236+
"plt.scatter(km.cluster_centers_[:, 0],\n",
237+
" km.cluster_centers_[:, 1],\n",
238+
" s=250,\n",
239+
" marker='*',\n",
240+
" c='red',\n",
229241
" label='centroids')\n",
230242
"plt.legend()\n",
231243
"plt.grid()\n",
@@ -323,7 +335,7 @@
323335
" random_state=0)\n",
324336
" km.fit(X)\n",
325337
" distortions.append(km.inertia_)\n",
326-
"plt.plot(range(1,11), distortions , marker='o')\n",
338+
"plt.plot(range(1, 11), distortions, marker='o')\n",
327339
"plt.xlabel('Number of clusters')\n",
328340
"plt.ylabel('Distortion')\n",
329341
"plt.tight_layout()\n",
@@ -382,14 +394,14 @@
382394
"y_ax_lower, y_ax_upper = 0, 0\n",
383395
"yticks = []\n",
384396
"for i, c in enumerate(cluster_labels):\n",
385-
" c_silhouette_vals = silhouette_vals[y_km==c]\n",
397+
" c_silhouette_vals = silhouette_vals[y_km == c]\n",
386398
" c_silhouette_vals.sort()\n",
387399
" y_ax_upper += len(c_silhouette_vals)\n",
388400
" color = cm.jet(i / n_clusters)\n",
389401
" plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals, height=1.0, \n",
390-
" edgecolor='none', color=color)\n",
402+
" edgecolor='none', color=color)\n",
391403
"\n",
392-
" yticks.append((y_ax_lower + y_ax_upper) / 2)\n",
404+
" yticks.append((y_ax_lower + y_ax_upper) / 2.)\n",
393405
" y_ax_lower += len(c_silhouette_vals)\n",
394406
" \n",
395407
"silhouette_avg = np.mean(silhouette_vals)\n",
@@ -430,28 +442,29 @@
430442
}
431443
],
432444
"source": [
433-
"km = KMeans(n_clusters=2, \n",
434-
" init='k-means++', \n",
435-
" n_init=10, \n",
445+
"km = KMeans(n_clusters=2,\n",
446+
" init='k-means++',\n",
447+
" n_init=10,\n",
436448
" max_iter=300,\n",
437449
" tol=1e-04,\n",
438450
" random_state=0)\n",
439451
"y_km = km.fit_predict(X)\n",
440452
"\n",
441-
"plt.scatter(X[y_km==0,0], \n",
442-
" X[y_km==0,1], \n",
443-
" s=50, \n",
444-
" c='lightgreen', \n",
445-
" marker='s', \n",
453+
"plt.scatter(X[y_km == 0, 0],\n",
454+
" X[y_km == 0, 1],\n",
455+
" s=50,\n",
456+
" c='lightgreen',\n",
457+
" marker='s',\n",
446458
" label='cluster 1')\n",
447-
"plt.scatter(X[y_km==1,0], \n",
448-
" X[y_km==1,1], \n",
449-
" s=50, \n",
450-
" c='orange', \n",
451-
" marker='o', \n",
459+
"plt.scatter(X[y_km == 1, 0],\n",
460+
" X[y_km == 1, 1],\n",
461+
" s=50,\n",
462+
" c='orange',\n",
463+
" marker='o',\n",
452464
" label='cluster 2')\n",
453465
"\n",
454-
"plt.scatter(km.cluster_centers_[:,0], km.cluster_centers_[:,1], s=250, marker='*', c='red', label='centroids')\n",
466+
"plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],\n",
467+
" s=250, marker='*', c='red', label='centroids')\n",
455468
"plt.legend()\n",
456469
"plt.grid()\n",
457470
"plt.tight_layout()\n",
@@ -484,14 +497,14 @@
484497
"y_ax_lower, y_ax_upper = 0, 0\n",
485498
"yticks = []\n",
486499
"for i, c in enumerate(cluster_labels):\n",
487-
" c_silhouette_vals = silhouette_vals[y_km==c]\n",
500+
" c_silhouette_vals = silhouette_vals[y_km == c]\n",
488501
" c_silhouette_vals.sort()\n",
489502
" y_ax_upper += len(c_silhouette_vals)\n",
490503
" color = cm.jet(i / n_clusters)\n",
491504
" plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals, height=1.0, \n",
492-
" edgecolor='none', color=color)\n",
505+
" edgecolor='none', color=color)\n",
493506
"\n",
494-
" yticks.append((y_ax_lower + y_ax_upper) / 2)\n",
507+
" yticks.append((y_ax_lower + y_ax_upper) / 2.)\n",
495508
" y_ax_lower += len(c_silhouette_vals)\n",
496509
" \n",
497510
"silhouette_avg = np.mean(silhouette_vals)\n",
@@ -624,9 +637,9 @@
624637
"np.random.seed(123)\n",
625638
"\n",
626639
"variables = ['X', 'Y', 'Z']\n",
627-
"labels = ['ID_0','ID_1','ID_2','ID_3','ID_4']\n",
640+
"labels = ['ID_0', 'ID_1', 'ID_2', 'ID_3', 'ID_4']\n",
628641
"\n",
629-
"X = np.random.random_sample([5,3])*10\n",
642+
"X = np.random.random_sample([5, 3])*10\n",
630643
"df = pd.DataFrame(X, columns=variables, index=labels)\n",
631644
"df"
632645
]
@@ -727,9 +740,11 @@
727740
}
728741
],
729742
"source": [
730-
"from scipy.spatial.distance import pdist,squareform\n",
743+
"from scipy.spatial.distance import pdist, squareform\n",
731744
"\n",
732-
"row_dist = pd.DataFrame(squareform(pdist(df, metric='euclidean')), columns=labels, index=labels)\n",
745+
"row_dist = pd.DataFrame(squareform(pdist(df, metric='euclidean')),\n",
746+
" columns=labels,\n",
747+
" index=labels)\n",
733748
"row_dist"
734749
]
735750
},
@@ -813,9 +828,11 @@
813828
"from scipy.cluster.hierarchy import linkage\n",
814829
"\n",
815830
"row_clusters = linkage(row_dist, method='complete', metric='euclidean')\n",
816-
"pd.DataFrame(row_clusters, \n",
817-
" columns=['row label 1', 'row label 2', 'distance', 'no. of items in clust.'],\n",
818-
" index=['cluster %d' %(i+1) for i in range(row_clusters.shape[0])])"
831+
"pd.DataFrame(row_clusters,\n",
832+
" columns=['row label 1', 'row label 2',\n",
833+
" 'distance', 'no. of items in clust.'],\n",
834+
" index=['cluster %d' % (i + 1)\n",
835+
" for i in range(row_clusters.shape[0])])"
819836
]
820837
},
821838
{
@@ -890,9 +907,11 @@
890907
"# 2. correct approach: Condensed distance matrix\n",
891908
"\n",
892909
"row_clusters = linkage(pdist(df, metric='euclidean'), method='complete')\n",
893-
"pd.DataFrame(row_clusters, \n",
894-
" columns=['row label 1', 'row label 2', 'distance', 'no. of items in clust.'],\n",
895-
" index=['cluster %d' %(i+1) for i in range(row_clusters.shape[0])])"
910+
"pd.DataFrame(row_clusters,\n",
911+
" columns=['row label 1', 'row label 2',\n",
912+
" 'distance', 'no. of items in clust.'],\n",
913+
" index=['cluster %d' % (i + 1) \n",
914+
" for i in range(row_clusters.shape[0])])"
896915
]
897916
},
898917
{
@@ -965,10 +984,11 @@
965984
"source": [
966985
"# 3. correct approach: Input sample matrix\n",
967986
"\n",
968-
"row_clusters = linkage(df.values, method='complete', metric='euclidean')\n",
969-
"pd.DataFrame(row_clusters, \n",
970-
" columns=['row label 1', 'row label 2', 'distance', 'no. of items in clust.'],\n",
971-
" index=['cluster %d' %(i+1) for i in range(row_clusters.shape[0])])"
987+
"pd.DataFrame(row_clusters,\n",
988+
" columns=['row label 1', 'row label 2',\n",
989+
" 'distance', 'no. of items in clust.'],\n",
990+
" index=['cluster %d' % (i + 1)\n",
991+
" for i in range(row_clusters.shape[0])])"
972992
]
973993
},
974994
{
@@ -1042,8 +1062,8 @@
10421062
],
10431063
"source": [
10441064
"# plot row dendrogram\n",
1045-
"fig = plt.figure(figsize=(8,8), facecolor='white')\n",
1046-
"axd = fig.add_axes([0.09,0.1,0.2,0.6])\n",
1065+
"fig = plt.figure(figsize=(8, 8), facecolor='white')\n",
1066+
"axd = fig.add_axes([0.09, 0.1, 0.2, 0.6])\n",
10471067
"\n",
10481068
"# note: for matplotlib < v1.5.1, please use orientation='right'\n",
10491069
"row_dendr = dendrogram(row_clusters, orientation='left')\n",
@@ -1059,7 +1079,7 @@
10591079
" i.set_visible(False)\n",
10601080
"\n",
10611081
"# plot heatmap\n",
1062-
"axm = fig.add_axes([0.23,0.1,0.6,0.6]) # x-pos, y-pos, width, height\n",
1082+
"axm = fig.add_axes([0.23, 0.1, 0.6, 0.6]) # x-pos, y-pos, width, height\n",
10631083
"cax = axm.matshow(df_rowclust, interpolation='nearest', cmap='hot_r')\n",
10641084
"fig.colorbar(cax)\n",
10651085
"axm.set_xticklabels([''] + list(df_rowclust.columns))\n",
@@ -1101,7 +1121,9 @@
11011121
"source": [
11021122
"from sklearn.cluster import AgglomerativeClustering\n",
11031123
"\n",
1104-
"ac = AgglomerativeClustering(n_clusters=2, affinity='euclidean', linkage='complete')\n",
1124+
"ac = AgglomerativeClustering(n_clusters=2, \n",
1125+
" affinity='euclidean', \n",
1126+
" linkage='complete')\n",
11051127
"labels = ac.fit_predict(X)\n",
11061128
"print('Cluster labels: %s' % labels)"
11071129
]
@@ -1170,9 +1192,9 @@
11701192
"from sklearn.datasets import make_moons\n",
11711193
"\n",
11721194
"X, y = make_moons(n_samples=200, noise=0.05, random_state=0)\n",
1173-
"plt.scatter(X[:,0], X[:,1])\n",
1195+
"plt.scatter(X[:, 0], X[:, 1])\n",
11741196
"plt.tight_layout()\n",
1175-
"#plt.savefig('./figures/moons.png', dpi=300)\n",
1197+
"# plt.savefig('./figures/moons.png', dpi=300)\n",
11761198
"plt.show()"
11771199
]
11781200
},
@@ -1202,18 +1224,24 @@
12021224
}
12031225
],
12041226
"source": [
1205-
"f, (ax1, ax2) = plt.subplots(1, 2, figsize=(8,3))\n",
1227+
"f, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 3))\n",
12061228
"\n",
12071229
"km = KMeans(n_clusters=2, random_state=0)\n",
12081230
"y_km = km.fit_predict(X)\n",
1209-
"ax1.scatter(X[y_km==0,0], X[y_km==0,1], c='lightblue', marker='o', s=40, label='cluster 1')\n",
1210-
"ax1.scatter(X[y_km==1,0], X[y_km==1,1], c='red', marker='s', s=40, label='cluster 2')\n",
1231+
"ax1.scatter(X[y_km == 0, 0], X[y_km == 0, 1],\n",
1232+
" c='lightblue', marker='o', s=40, label='cluster 1')\n",
1233+
"ax1.scatter(X[y_km == 1, 0], X[y_km == 1, 1],\n",
1234+
" c='red', marker='s', s=40, label='cluster 2')\n",
12111235
"ax1.set_title('K-means clustering')\n",
12121236
"\n",
1213-
"ac = AgglomerativeClustering(n_clusters=2, affinity='euclidean', linkage='complete')\n",
1237+
"ac = AgglomerativeClustering(n_clusters=2,\n",
1238+
" affinity='euclidean',\n",
1239+
" linkage='complete')\n",
12141240
"y_ac = ac.fit_predict(X)\n",
1215-
"ax2.scatter(X[y_ac==0,0], X[y_ac==0,1], c='lightblue', marker='o', s=40, label='cluster 1')\n",
1216-
"ax2.scatter(X[y_ac==1,0], X[y_ac==1,1], c='red', marker='s', s=40, label='cluster 2')\n",
1241+
"ax2.scatter(X[y_ac == 0, 0], X[y_ac == 0, 1], c='lightblue',\n",
1242+
" marker='o', s=40, label='cluster 1')\n",
1243+
"ax2.scatter(X[y_ac == 1, 0], X[y_ac == 1, 1], c='red',\n",
1244+
" marker='s', s=40, label='cluster 2')\n",
12171245
"ax2.set_title('Agglomerative clustering')\n",
12181246
"\n",
12191247
"plt.legend()\n",
@@ -1252,8 +1280,12 @@
12521280
"\n",
12531281
"db = DBSCAN(eps=0.2, min_samples=5, metric='euclidean')\n",
12541282
"y_db = db.fit_predict(X)\n",
1255-
"plt.scatter(X[y_db==0,0], X[y_db==0,1], c='lightblue', marker='o', s=40, label='cluster 1')\n",
1256-
"plt.scatter(X[y_db==1,0], X[y_db==1,1], c='red', marker='s', s=40, label='cluster 2')\n",
1283+
"plt.scatter(X[y_db == 0, 0], X[y_db == 0, 1],\n",
1284+
" c='lightblue', marker='o', s=40,\n",
1285+
" label='cluster 1')\n",
1286+
"plt.scatter(X[y_db == 1, 0], X[y_db == 1, 1],\n",
1287+
" c='red', marker='s', s=40,\n",
1288+
" label='cluster 2')\n",
12571289
"plt.legend()\n",
12581290
"plt.tight_layout()\n",
12591291
"#plt.savefig('./figures/moons_dbscan.png', dpi=300)\n",

0 commit comments

Comments
 (0)