
Commit f188c47

update mglearn
1 parent 0fc51ea commit f188c47

8 files changed: +103 -50 lines changed

mglearn/plot_2d_separator.py

+12 -7
@@ -3,7 +3,8 @@
 from .plot_helpers import cm2, cm3, discrete_scatter
 
 
-def plot_2d_classification(classifier, X, fill=False, ax=None, eps=None, alpha=1, cm=cm3):
+def plot_2d_classification(classifier, X, fill=False, ax=None, eps=None,
+                           alpha=1, cm=cm3):
     # multiclass
     if eps is None:
         eps = X.std() / 2.
@@ -28,7 +29,8 @@ def plot_2d_classification(classifier, X, fill=False, ax=None, eps=None, alpha=1
     ax.set_yticks(())
 
 
-def plot_2d_scores(classifier, X, ax=None, eps=None, alpha=1, cm="viridis", function=None):
+def plot_2d_scores(classifier, X, ax=None, eps=None, alpha=1, cm="viridis",
+                   function=None):
     # binary with fill
     if eps is None:
         eps = X.std() / 2.
@@ -44,7 +46,8 @@ def plot_2d_scores(classifier, X, ax=None, eps=None, alpha=1, cm="viridis", func
     X1, X2 = np.meshgrid(xx, yy)
     X_grid = np.c_[X1.ravel(), X2.ravel()]
     if function is None:
-        function = getattr(classifier, "decision_function", getattr(classifier, "predict_proba"))
+        function = getattr(classifier, "decision_function",
+                           getattr(classifier, "predict_proba"))
     else:
         function = getattr(classifier, function)
     decision_values = function(X_grid)
@@ -63,7 +66,8 @@ def plot_2d_scores(classifier, X, ax=None, eps=None, alpha=1, cm="viridis", func
 
 
 def plot_2d_separator(classifier, X, fill=False, ax=None, eps=None, alpha=1,
-                      cm=cm2, linewidth=None, threshold=None, linestyle="solid"):
+                      cm=cm2, linewidth=None, threshold=None,
+                      linestyle="solid"):
     # binary?
     if eps is None:
         eps = X.std() / 2.
@@ -73,15 +77,16 @@ def plot_2d_separator(classifier, X, fill=False, ax=None, eps=None, alpha=1,
 
     x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps
     y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps
-    xx = np.linspace(x_min, x_max, 100)
-    yy = np.linspace(y_min, y_max, 100)
+    xx = np.linspace(x_min, x_max, 1000)
+    yy = np.linspace(y_min, y_max, 1000)
 
     X1, X2 = np.meshgrid(xx, yy)
     X_grid = np.c_[X1.ravel(), X2.ravel()]
     try:
         decision_values = classifier.decision_function(X_grid)
         levels = [0] if threshold is None else [threshold]
-        fill_levels = [decision_values.min()] + levels + [decision_values.max()]
+        fill_levels = [decision_values.min()] + levels + [
+            decision_values.max()]
     except AttributeError:
         # no decision_function
         decision_values = classifier.predict_proba(X_grid)[:, 1]
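For context, a minimal driver for the reshaped plot_2d_separator — not part of this commit; the classifier and data are illustrative:

```python
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.svm import SVC

from mglearn.plot_2d_separator import plot_2d_separator

# toy 2D problem and a fitted classifier (illustrative choices)
X, y = make_blobs(centers=2, random_state=42)
clf = SVC().fit(X, y)

# the finer 1000-point grid from this commit gives a smoother boundary contour
plot_2d_separator(clf, X, fill=True, alpha=.4)
plt.scatter(X[:, 0], X[:, 1], c=y, s=20)
plt.show()
```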

mglearn/plot_cross_validation.py

+1 -1
@@ -7,7 +7,7 @@ def plot_group_kfold():
     groups = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]
 
     plt.figure(figsize=(10, 2))
-    plt.title("LabelKFold")
+    plt.title("GroupKFold")
 
     axes = plt.gca()
     axes.set_frame_on(False)
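The title fix tracks scikit-learn 0.18's rename of LabelKFold to GroupKFold. A minimal sketch of the renamed splitter — not part of this commit; estimator and data are arbitrary:

```python
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupKFold, cross_val_score

# 12 samples in 4 groups, mirroring the groups list in the plot
X, y = make_blobs(n_samples=12, random_state=0)
groups = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]
scores = cross_val_score(LogisticRegression(), X, y, groups=groups,
                         cv=GroupKFold(n_splits=3))
print(scores)  # one score per group-wise split
```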

mglearn/plot_dbscan.py

+10 -6
@@ -3,7 +3,7 @@
 from sklearn.cluster import DBSCAN
 from sklearn.datasets import make_blobs
 
-from .plot_helpers import discrete_scatter
+from .plot_helpers import discrete_scatter, cm3
 
 
 def plot_dbscan():
@@ -13,9 +13,10 @@ def plot_dbscan():
     clusters = dbscan.fit_predict(X)
     clusters
 
-    fig, axes = plt.subplots(3, 4, figsize=(11, 8), subplot_kw={'xticks': (), 'yticks': ()})
+    fig, axes = plt.subplots(3, 4, figsize=(11, 8),
+                             subplot_kw={'xticks': (), 'yticks': ()})
     # Plot clusters as red, green and blue, and outliers (-1) as white
-    colors = ['r', 'g', 'b']
+    colors = [cm3(1), cm3(0), cm3(2)]
     markers = ['o', '^', 'v']
 
     # iterate over settings of min_samples and eps
@@ -25,19 +26,22 @@ def plot_dbscan():
             dbscan = DBSCAN(min_samples=min_samples, eps=eps)
             # get cluster assignments
             clusters = dbscan.fit_predict(X)
-            print("min_samples: %d eps: %f cluster: %s" % (min_samples, eps, clusters))
+            print("min_samples: %d eps: %f cluster: %s"
+                  % (min_samples, eps, clusters))
             if np.any(clusters == -1):
                 c = ['w'] + colors
                 m = ['o'] + markers
             else:
                 c = colors
                 m = markers
-            discrete_scatter(X[:, 0], X[:, 1], clusters, ax=axes[i, j], c=c, s=8, markers=m)
+            discrete_scatter(X[:, 0], X[:, 1], clusters, ax=axes[i, j], c=c,
+                             s=8, markers=m)
             inds = dbscan.core_sample_indices_
             # vizualize core samples and clusters.
             if len(inds):
                 discrete_scatter(X[inds, 0], X[inds, 1], clusters[inds],
                                  ax=axes[i, j], s=15, c=colors,
                                  markers=markers)
-            axes[i, j].set_title("min_samples: %d eps: %.1f" % (min_samples, eps))
+            axes[i, j].set_title("min_samples: %d eps: %.1f"
+                                 % (min_samples, eps))
     fig.tight_layout()
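For reference, the points this plot draws in white are DBSCAN's noise samples, which fit_predict labels -1. A quick sketch — not part of this commit; parameters are illustrative:

```python
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=12, random_state=0)
clusters = DBSCAN(min_samples=3, eps=1.5).fit_predict(X)
print(np.unique(clusters))  # noise points, if any, show up as -1
```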

mglearn/plot_grid_search.py

+32 -20
@@ -16,26 +16,31 @@ def plot_cross_val_selection():
                   'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
     grid_search = GridSearchCV(SVC(), param_grid, cv=5)
     grid_search.fit(X_trainval, y_trainval)
-    results = pd.DataFrame(grid_search.results_)[15:]
+    results = pd.DataFrame(grid_search.cv_results_)[15:]
 
-    best = np.argmax(results.test_mean_score.values)
+    best = np.argmax(results.mean_test_score.values)
     plt.figure(figsize=(10, 3))
     plt.xlim(-1, len(results))
     plt.ylim(0, 1.1)
     for i, (_, row) in enumerate(results.iterrows()):
         scores = row[['test_split%d_score' % i for i in range(5)]]
-        marker_cv, = plt.plot([i] * 5, scores, '^', c='gray', markersize=5, alpha=.5)
-        marker_mean, = plt.plot(i, row.test_mean_score, 'v', c='none', alpha=1, markersize=10)
+        marker_cv, = plt.plot([i] * 5, scores, '^', c='gray', markersize=5,
+                              alpha=.5)
+        marker_mean, = plt.plot(i, row.mean_test_score, 'v', c='none', alpha=1,
+                                markersize=10)
         if i == best:
-            marker_best, = plt.plot(i, row.test_mean_score, 'o', c='red', fillstyle="none",
-                                    alpha=1, markersize=20, markeredgewidth=3)
+            marker_best, = plt.plot(i, row.mean_test_score, 'o', c='red',
+                                    fillstyle="none", alpha=1, markersize=20,
+                                    markeredgewidth=3)
 
-    plt.xticks(range(len(results)), [str(x).strip("{}").replace("'", "")
-                                     for x in grid_search.results_['params']], rotation=90)
+    plt.xticks(range(len(results)), [str(x).strip("{}").replace("'", "") for x
+                                     in grid_search.cv_results_['params']],
+               rotation=90)
     plt.ylabel("Validation accuracy")
     plt.xlabel("Parameter settings")
     plt.legend([marker_cv, marker_mean, marker_best],
-               ["cv accuracy", "mean accuracy", "best parameter setting"], loc=(1.05, .4))
+               ["cv accuracy", "mean accuracy", "best parameter setting"],
+               loc=(1.05, .4))
 
 
 def plot_grid_search_overview():
@@ -54,9 +59,10 @@ def draw(ax, text, start, target=None):
         patchB = None
         annotation = ax.annotate(text, end, start, xycoords='axes pixels',
                                  textcoords='axes pixels', size=20,
-                                 arrowprops=dict(arrowstyle="-|>", fc="w",
-                                                 ec="k", patchB=patchB,
-                                                 connectionstyle="arc3,rad=0.0"),
+                                 arrowprops=dict(
+                                     arrowstyle="-|>", fc="w", ec="k",
+                                     patchB=patchB,
+                                     connectionstyle="arc3,rad=0.0"),
                                  bbox=dict(boxstyle="round", fc="w"),
                                  horizontalalignment="center",
                                  verticalalignment="center")
@@ -66,15 +72,21 @@ def draw(ax, text, start, target=None):
     step = 100
     grr = 400
 
-    final_evaluation = draw(axes, "final evaluation", (5 * step, grr - 3 * step))
-    retrained_model = draw(axes, "retrained model", (3 * step, grr - 3 * step), final_evaluation)
-    best_parameters = draw(axes, "best parameters", (.5 * step, grr - 3 * step), retrained_model)
-    cross_validation = draw(axes, "cross-validation", (.5 * step, grr - 2 * step), best_parameters)
-    parameters = draw(axes, "parameter grid", (0.0, grr - 0), cross_validation)
-    training_data = draw(axes, "training data", (2 * step, grr - step), cross_validation)
+    final_evaluation = draw(axes, "final evaluation", (5 * step, grr - 3 *
+                                                       step))
+    retrained_model = draw(axes, "retrained model", (3 * step, grr - 3 * step),
+                           final_evaluation)
+    best_parameters = draw(axes, "best parameters", (.5 * step, grr - 3 *
+                                                     step), retrained_model)
+    cross_validation = draw(axes, "cross-validation", (.5 * step, grr - 2 *
+                                                       step), best_parameters)
+    draw(axes, "parameter grid", (0.0, grr - 0), cross_validation)
+    training_data = draw(axes, "training data", (2 * step, grr - step),
+                         cross_validation)
     draw(axes, "training data", (2 * step, grr - step), retrained_model)
-    test_data = draw(axes, "test data", (5 * step, grr - step), final_evaluation)
+    test_data = draw(axes, "test data", (5 * step, grr - step),
+                     final_evaluation)
     draw(axes, "data set", (3.5 * step, grr - 0.0), training_data)
-    data_set = draw(axes, "data set", (3.5 * step, grr - 0.0), test_data)
+    draw(axes, "data set", (3.5 * step, grr - 0.0), test_data)
     plt.ylim(0, 1)
     plt.xlim(0, 1.5)
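The results_ → cv_results_ renames track the attribute name that shipped in scikit-learn 0.18, along with its mean_test_score column. A sketch of reading it back — not part of this commit; estimator and grid are illustrative:

```python
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

iris = load_iris()
grid = GridSearchCV(SVC(), {'C': [0.1, 1, 10]}, cv=5)
grid.fit(iris.data, iris.target)

# cv_results_ is a dict of arrays; pandas makes it easy to inspect
results = pd.DataFrame(grid.cv_results_)
print(results[['params', 'mean_test_score', 'rank_test_score']])
```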

mglearn/plot_helpers.py

+1 -1
@@ -4,7 +4,7 @@
 from matplotlib.colors import ListedColormap, colorConverter, LinearSegmentedColormap
 
 
-cm_cycle = ListedColormap(['#0000aa', '#ff2020', '#50ff50', 'c', '#fff000'])
+cm_cycle = ListedColormap(['#0000aa', '#ff5050', '#50ff50', '#9040a0', '#fff000'])
 cm3 = ListedColormap(['#0000aa', '#ff2020', '#50ff50'])
 cm2 = ListedColormap(['#0000aa', '#ff2020'])
 
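Elsewhere in this commit these colormaps are indexed directly (cm3(0), cm3(1), ...): calling a ListedColormap with an integer returns that entry as an RGBA tuple, which matplotlib accepts anywhere a color is expected. A quick check, not part of this commit:

```python
from matplotlib.colors import ListedColormap

cm3 = ListedColormap(['#0000aa', '#ff2020', '#50ff50'])
print(cm3(0))  # RGBA for '#0000aa', roughly (0.0, 0.0, 0.667, 1.0)
```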

mglearn/plot_knn_regression.py

+8 -5
@@ -4,7 +4,8 @@
 from sklearn.neighbors import KNeighborsRegressor
 from sklearn.metrics import euclidean_distances
 
-from mglearn.datasets import make_wave
+from .datasets import make_wave
+from .plot_helpers import cm3
 
 
 def plot_knn_regression(n_neighbors=1):
@@ -24,12 +25,14 @@ def plot_knn_regression(n_neighbors=1):
         plt.arrow(x[0], y_, X[neighbor, 0] - x[0], y[neighbor] - y_,
                   head_width=0, fc='k', ec='k')
 
-    train, = plt.plot(X, y, 'o')
-    test, = plt.plot(X_test, -3 * np.ones(len(X_test)), '*', c='g', markersize=20)
-    pred, = plt.plot(X_test, y_pred, '*', c='b', markersize=20)
+    train, = plt.plot(X, y, 'o', c=cm3(0))
+    test, = plt.plot(X_test, -3 * np.ones(len(X_test)), '*', c=cm3(2),
+                     markersize=20)
+    pred, = plt.plot(X_test, y_pred, '*', c=cm3(0), markersize=20)
     plt.vlines(X_test, -3.1, 3.1, linestyle="--")
     plt.legend([train, test, pred],
-               ["training data/target", "test data", "test prediction"], ncol=3, loc=(.1, 1.025))
+               ["training data/target", "test data", "test prediction"],
+               ncol=3, loc=(.1, 1.025))
     plt.ylim(-3.1, 3.1)
     plt.xlabel("Feature")
     plt.ylabel("Target")

mglearn/plot_scaling.py

+4 -2
@@ -1,7 +1,8 @@
 import matplotlib.pyplot as plt
 import numpy as np
 from sklearn.datasets import make_blobs
-from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer, RobustScaler
+from sklearn.preprocessing import (StandardScaler, MinMaxScaler, Normalizer,
+                                   RobustScaler)
 from .plot_helpers import cm2
 
 
@@ -19,7 +20,8 @@ def plot_scaling():
     main_ax.set_xlim(-maxx + 1, maxx + 1)
     main_ax.set_ylim(-maxy + 1, maxy + 1)
     main_ax.set_title("Original Data")
-    other_axes = [plt.subplot2grid((2, 4), (i, j)) for j in range(2, 4) for i in range(2)]
+    other_axes = [plt.subplot2grid((2, 4), (i, j))
+                  for j in range(2, 4) for i in range(2)]
 
     for ax, scaler in zip(other_axes, [StandardScaler(), RobustScaler(),
                                        MinMaxScaler(), Normalizer(norm='l2')]):
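All four scalers in this figure share the same fit/transform interface; a minimal sketch, not part of this commit, with arbitrary data:

```python
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

X, _ = make_blobs(random_state=4)
X_scaled = StandardScaler().fit_transform(X)
print(X_scaled.mean(axis=0), X_scaled.std(axis=0))  # ~0 and ~1 per feature
```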

mglearn/tools.py

+35 -8
@@ -2,24 +2,48 @@
 from sklearn.datasets import make_blobs
 from sklearn.tree import export_graphviz
 import matplotlib.pyplot as plt
-from .plot_2d_separator import plot_2d_separator, plot_2d_classification, plot_2d_scores
+from .plot_2d_separator import (plot_2d_separator, plot_2d_classification,
+                                plot_2d_scores)
 from .plot_helpers import cm2 as cm, discrete_scatter
 
 
 def visualize_coefficients(coefficients, feature_names, n_top_features=25):
+    """Visualize coefficients of a linear model.
+
+    Parameters
+    ----------
+    coefficients : nd-array, shape (n_features,)
+        Model coefficients.
+
+    feature_names : list or nd-array of strings, shape (n_features,)
+        Feature names for labeling the coefficients.
+
+    n_top_features : int, default=25
+        How many features to show. The function will show the largest (most
+        positive) and smallest (most negative) n_top_features coefficients,
+        for a total of 2 * n_top_features coefficients.
+    """
+    if len(coefficients) != len(feature_names):
+        raise ValueError("Number of coefficients {} doesn't match number of"
+                         "feature names {}.".format(len(coefficients),
+                                                    len(feature_names)))
     # get coefficients with large absolute values
     coef = coefficients.ravel()
     positive_coefficients = np.argsort(coef)[-n_top_features:]
     negative_coefficients = np.argsort(coef)[:n_top_features]
-    interesting_coefficients = np.hstack([negative_coefficients, positive_coefficients])
+    interesting_coefficients = np.hstack([negative_coefficients,
+                                          positive_coefficients])
     # plot them
     plt.figure(figsize=(15, 5))
-    colors = [cm(1) if c < 0 else cm(0) for c in coef[interesting_coefficients]]
-    plt.bar(np.arange(2 * n_top_features), coef[interesting_coefficients], color=colors)
+    colors = [cm(1) if c < 0 else cm(0)
+              for c in coef[interesting_coefficients]]
+    plt.bar(np.arange(2 * n_top_features), coef[interesting_coefficients],
+            color=colors)
     feature_names = np.array(feature_names)
     plt.subplots_adjust(bottom=0.3)
     plt.xticks(np.arange(1, 1 + 2 * n_top_features),
-               feature_names[interesting_coefficients], rotation=60, ha="right")
+               feature_names[interesting_coefficients], rotation=60,
+               ha="right")
     plt.ylabel("Coefficient magnitude")
     plt.xlabel("Feature")
 
@@ -39,7 +63,8 @@ def heatmap(values, xlabel, ylabel, xticklabels, yticklabels, cmap=None,
     ax.set_yticklabels(yticklabels)
     ax.set_aspect(1)
 
-    for p, color, value in zip(img.get_paths(), img.get_facecolors(), img.get_array()):
+    for p, color, value in zip(img.get_paths(), img.get_facecolors(),
+                               img.get_array()):
         x, y = p.vertices[:-2, :].mean(0)
         if np.mean(color[:3]) > 0.5:
             c = 'k'
@@ -59,7 +84,8 @@ def make_handcrafted_dataset():
     return X, y
 
 
-def print_topics(topics, feature_names, sorting, topics_per_chunk=6, n_words=20):
+def print_topics(topics, feature_names, sorting, topics_per_chunk=6,
+                 n_words=20):
     for i in range(0, len(topics), topics_per_chunk):
         # for each chunk:
         these_topics = topics[i: i + topics_per_chunk]
@@ -71,7 +97,8 @@ def print_topics(topics, feature_names, sorting, topics_per_chunk=6, n_words=20)
         # print top n_words frequent words
         for i in range(n_words):
             try:
-                print(("{:<14}" * len_this_chunk).format(*feature_names[sorting[these_topics, i]]))
+                print(("{:<14}" * len_this_chunk).format(
+                    *feature_names[sorting[these_topics, i]]))
             except:
                 pass
             print("\n")
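With the new docstring and length check in place, a usage sketch for visualize_coefficients — not part of this commit; the coefficients and names are synthetic:

```python
import numpy as np
from mglearn.tools import visualize_coefficients

rng = np.random.RandomState(0)
coefficients = rng.normal(size=50)                      # stand-in for model.coef_
feature_names = ["feature_%d" % i for i in range(50)]   # must match in length
visualize_coefficients(coefficients, feature_names, n_top_features=10)
```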
