import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import KernelDensity

# Number of points along each axis of the grid used to evaluate densities.
_GRID_RESOLUTION = 100
# Bandwidth of the kernel density estimate used to outline each cluster.
_KDE_BANDWIDTH = .5


def _make_grid(X, resolution=_GRID_RESOLUTION):
    """Build padded axis limits and an evaluation grid around the data.

    Parameters
    ----------
    X : array indexed as ``X[:, 0]`` (x coordinate) and ``X[:, 1]`` (y).
    resolution : number of grid points along each axis.

    Returns
    -------
    x_lim, y_lim : ``(min, max)`` tuples, padded by half the overall
        standard deviation of ``X`` on each side.
    xx, yy : meshgrid coordinate arrays of shape (resolution, resolution).
    gridpoints : array of shape (resolution ** 2, 2) holding the flattened
        grid coordinates, one point per row.
    """
    eps = X.std() / 2
    x_lim = (X[:, 0].min() - eps, X[:, 0].max() + eps)
    y_lim = (X[:, 1].min() - eps, X[:, 1].max() + eps)
    xx, yy = np.meshgrid(np.linspace(x_lim[0], x_lim[1], resolution),
                         np.linspace(y_lim[0], y_lim[1], resolution))
    gridpoints = np.c_[xx.ravel(), yy.ravel()]
    return x_lim, y_lim, xx, yy, gridpoints


def _outline_clusters(ax, X, labels, xx, yy, gridpoints, linewidth):
    """Draw one contour around every cluster that has more than one point.

    A kernel density estimate is fitted to the members of each cluster and a
    single contour of that density is drawn on ``ax``.  The contour level is
    a weighted blend (0.8 / 0.2) of the lowest score among the cluster's own
    points and the highest score among all other points.

    Every cluster that is outlined needs at least one point outside it, i.e.
    ``labels`` must describe at least two clusters; ``np.max`` on an empty
    array would raise otherwise.
    """
    cluster_sizes = np.bincount(labels)
    for cluster, size in enumerate(cluster_sizes):
        if size <= 1:
            # Singletons are still unmerged points; nothing to outline.
            continue
        members = labels == cluster
        points = X[members]
        other_points = X[~members]

        kde = KernelDensity(bandwidth=_KDE_BANDWIDTH).fit(points)
        scores = kde.score_samples(gridpoints)
        score_inside = np.min(kde.score_samples(points))
        score_outside = np.max(kde.score_samples(other_points))
        level = .8 * score_inside + .2 * score_outside
        ax.contour(xx, yy, scores.reshape(xx.shape), levels=[level],
                   colors='k', linestyles='solid', linewidths=linewidth)


def plot_agglomerative_algorithm():
    """Plot successive steps of agglomerative clustering, one panel per step.

    Creates a new figure with ``n_samples // 5`` rows of five panels on a
    synthetic 12-point blob dataset.  Panel ``i`` shows the clustering with
    ``n_samples - i`` clusters: all points in grey, with every cluster of
    more than one point outlined.  Panels are titled "Step i", except the
    first one, which is titled "Initialization".
    """
    # generate synthetic two-dimensional data
    X, _ = make_blobs(random_state=0, n_samples=12)
    n_samples = X.shape[0]

    # No initial fit is needed here: the loop below refits for every panel,
    # starting with n_clusters == n_samples.
    agg = AgglomerativeClustering(n_clusters=n_samples,
                                  compute_full_tree=True)

    _, axes = plt.subplots(n_samples // 5, 5,
                           subplot_kw={'xticks': (), 'yticks': ()},
                           figsize=(20, 8))

    x_lim, y_lim, xx, yy, gridpoints = _make_grid(X)

    for step, ax in enumerate(axes.ravel()):
        ax.set_xlim(*x_lim)
        ax.set_ylim(*y_lim)
        # Each step has one cluster fewer than the previous one.
        agg.n_clusters = n_samples - step
        agg.fit(X)
        ax.set_title("Step %d" % step)
        ax.scatter(X[:, 0], X[:, 1], s=60, c='grey')
        _outline_clusters(ax, X, agg.labels_, xx, yy, gridpoints,
                          linewidth=2)

    axes[0, 0].set_title("Initialization")


def plot_agglomerative():
    """Plot all levels of an agglomerative clustering on the current axes.

    Uses the same synthetic 12-point blob dataset.  Each point is drawn in
    grey and labelled with its index; the cluster outlines for every
    clustering from ``n_samples`` clusters down to two clusters are then
    overlaid on the same axes, so the outlines show how clusters nest.
    """
    X, _ = make_blobs(random_state=0, n_samples=12)
    n_samples = X.shape[0]
    # n_clusters is overwritten before every fit in the loop below.
    agg = AgglomerativeClustering(n_clusters=3)

    x_lim, y_lim, xx, yy, gridpoints = _make_grid(X)

    ax = plt.gca()
    for index, point in enumerate(X):
        ax.text(point[0] + .1, point[1], "%d" % index,
                horizontalalignment='left', verticalalignment='center')

    ax.scatter(X[:, 0], X[:, 1], s=60, c='grey')
    ax.set_xticks(())
    ax.set_yticks(())

    # n_samples - 1 steps: from n_samples singleton clusters down to two.
    for step in range(n_samples - 1):
        agg.n_clusters = n_samples - step
        agg.fit(X)
        _outline_clusters(ax, X, agg.labels_, xx, yy, gridpoints,
                          linewidth=1)

    ax.set_xlim(*x_lim)
    ax.set_ylim(*y_lim)