# plot_agglomerative.py
# Forked from amueller/introduction_to_ml_with_python.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import KernelDensity
def plot_agglomerative_algorithm():
    """Show successive agglomerative-clustering steps on a grid of panels.

    Twelve two-dimensional blob points are clustered once per panel, each
    panel using one cluster fewer than the previous one.  On every panel the
    points are drawn in grey and each cluster containing more than one point
    is outlined with a single kernel-density contour.

    Takes no arguments and returns nothing; it creates a new figure.
    """
    # Synthetic 2-D sample; the generated labels are not used.
    X, _ = make_blobs(random_state=0, n_samples=12)
    n_points = X.shape[0]

    agg = AgglomerativeClustering(n_clusters=n_points, compute_full_tree=True).fit(X)

    fig, axes = plt.subplots(n_points // 5, 5, figsize=(20, 8),
                             subplot_kw={'xticks': (), 'yticks': ()})

    # Pad the plotted area by half the overall standard deviation.
    margin = X.std() / 2
    x_min, y_min = X.min(axis=0) - margin
    x_max, y_max = X.max(axis=0) + margin

    # Regular 100 x 100 grid on which the cluster densities are evaluated.
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))
    gridpoints = np.c_[xx.ravel(), yy.ravel()]

    for step, ax in enumerate(axes.flat):
        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_min, y_max)

        # Refit with one cluster fewer than on the previous panel.
        agg.n_clusters = n_points - step
        agg.fit(X)

        ax.set_title("Step %d" % step)
        ax.scatter(X[:, 0], X[:, 1], s=60, c='grey')

        labels = agg.labels_
        sizes = np.bincount(labels)
        for cluster in range(agg.n_clusters):
            if sizes[cluster] <= 1:
                # Single-point clusters are left without an outline.
                continue

            in_cluster = labels == cluster
            members = X[in_cluster]
            outsiders = X[~in_cluster]

            kde = KernelDensity(bandwidth=.5).fit(members)
            scores = kde.score_samples(gridpoints)
            score_inside = np.min(kde.score_samples(members))
            score_outside = np.max(kde.score_samples(outsiders))
            # Contour level sits between the lowest score of a member and
            # the highest score of a non-member, weighted toward the former.
            level = .8 * score_inside + .2 * score_outside
            ax.contour(xx, yy, scores.reshape(xx.shape), levels=[level],
                       colors='k', linestyles='solid', linewidths=2)

    # The first panel has every point in its own cluster.
    axes[0, 0].set_title("Initialization")
def plot_agglomerative():
    """Overlay the cluster outlines of every merge level on one plot.

    Generates twelve two-dimensional blob points, draws them in grey on the
    current matplotlib axes and labels each with its row index.  The
    clustering is then refitted once for every cluster count from the number
    of points down to 2, and each cluster with more than one member is
    outlined by a single kernel-density contour.  Outlines from all levels
    are drawn on the same axes.

    Takes no arguments and returns nothing; it only draws on ``plt.gca()``.
    """
    grid_size = 100  # points per axis of the density-evaluation grid

    # Synthetic 2-D sample; the generated labels are not used.
    X, _ = make_blobs(random_state=0, n_samples=12)
    n_points = X.shape[0]

    # n_clusters is reassigned before every fit below, so this initial value
    # only describes the starting state (each point is its own cluster).
    agg = AgglomerativeClustering(n_clusters=n_points)

    # Pad the plotted area by half the overall standard deviation.
    margin = X.std() / 2.
    x_min, x_max = X[:, 0].min() - margin, X[:, 0].max() + margin
    y_min, y_max = X[:, 1].min() - margin, X[:, 1].max() + margin
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, grid_size),
                         np.linspace(y_min, y_max, grid_size))
    gridpoints = np.c_[xx.ravel(), yy.ravel()]

    ax = plt.gca()
    for idx, point in enumerate(X):
        # Index label placed slightly to the right of the marker.
        ax.text(point[0] + .1, point[1], "%d" % idx,
                horizontalalignment='left', verticalalignment='center')

    ax.scatter(X[:, 0], X[:, 1], s=60, c='grey')
    ax.set_xticks(())
    ax.set_yticks(())

    # Cluster counts run from n_points down to 2.  Stopping at 2 keeps the
    # set of "other" points non-empty, which np.max below relies on.
    for step in range(n_points - 1):
        agg.n_clusters = n_points - step
        agg.fit(X)

        labels = agg.labels_
        sizes = np.bincount(labels)
        for cluster in range(agg.n_clusters):
            if sizes[cluster] <= 1:
                # Single-point clusters are left without an outline.
                continue

            in_cluster = labels == cluster
            points = X[in_cluster]
            other_points = X[~in_cluster]

            kde = KernelDensity(bandwidth=.5).fit(points)
            scores = kde.score_samples(gridpoints)
            score_inside = np.min(kde.score_samples(points))
            score_outside = np.max(kde.score_samples(other_points))
            # Contour level sits between the lowest score of a member and
            # the highest score of a non-member, weighted toward the former.
            level = .8 * score_inside + .2 * score_outside
            ax.contour(xx, yy, scores.reshape(xx.shape), levels=[level],
                       colors='k', linestyles='solid', linewidths=1)

    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)