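Fix cross-validation plots for matplotlib v2 (explicit edge colors on bars and markers), plus other minor fixes (cv_results_ key rename in the grid-search plot, PEP 8 line wrapping, notebook kernelspec metadata)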

amueller · amueller · commit f8a5bf9b4f87 · 2017-02-24T15:02:12.000-05:00
diff --git a/05-model-evaluation-and-improvement.ipynb b/05-model-evaluation-and-improvement.ipynb
@@ -2832,9 +2832,9 @@
  "metadata": {
   "anaconda-cloud": {},
   "kernelspec": {
-   "display_name": "Python [Root]",
+   "display_name": "Python [conda root]",
    "language": "python",
-   "name": "Python [Root]"
+   "name": "conda-root-py"
   },
   "language_info": {
    "codemirror_mode": {
diff --git a/07-working-with-text-data.ipynb b/07-working-with-text-data.ipynb
@@ -1416,9 +1416,9 @@
  "metadata": {
   "anaconda-cloud": {},
   "kernelspec": {
-   "display_name": "Python [Root]",
+   "display_name": "Python [conda root]",
    "language": "python",
-   "name": "Python [Root]"
+   "name": "conda-root-py"
   },
   "language_info": {
    "codemirror_mode": {
diff --git a/mglearn/plot_cross_validation.py b/mglearn/plot_cross_validation.py
@@ -23,39 +23,43 @@ def plot_group_kfold():
         mask[i, train] = 1
         mask[i, test] = 2
 
-
     for i in range(n_folds):
         # test is grey
-        colors = ["grey"  if x == 2 else "white" for x in mask[:, i]]
+        colors = ["grey" if x == 2 else "white" for x in mask[:, i]]
         # not selected has no hatch
-        
-        boxes = axes.barh(bottom=range(n_iter), width=[1 - 0.1] * n_iter, left=i * n_samples_per_fold, height=.6, color=colors, hatch="//")
+
+        boxes = axes.barh(bottom=range(n_iter), width=[1 - 0.1] * n_iter,
+                          left=i * n_samples_per_fold, height=.6, color=colors,
+                          hatch="//", edgecolor="k")
         for j in np.where(mask[:, i] == 0)[0]:
             boxes[j].set_hatch("")
-            
-    axes.barh(bottom=[n_iter] * n_folds, width=[1 - 0.1] * n_folds, left=np.arange(n_folds) * n_samples_per_fold, height=.6, color="w")
+
+    axes.barh(bottom=[n_iter] * n_folds, width=[1 - 0.1] * n_folds,
+              left=np.arange(n_folds) * n_samples_per_fold, height=.6,
+              color="w", edgecolor='k')
 
     for i in range(12):
-        axes.text((i + .5) * n_samples_per_fold, 3.5, "%d" % groups[i], horizontalalignment="center")
-    #ax.set_ylim(4, -0.1)
-            
+        axes.text((i + .5) * n_samples_per_fold, 3.5, "%d" %
+                  groups[i], horizontalalignment="center")
+
     axes.invert_yaxis()
     axes.set_xlim(0, n_samples + 1)
     axes.set_ylabel("CV iterations")
     axes.set_xlabel("Data points")
     axes.set_xticks(np.arange(n_samples) + .5)
     axes.set_xticklabels(np.arange(1, n_samples + 1))
     axes.set_yticks(np.arange(n_iter + 1) + .3)
-    axes.set_yticklabels(["Split %d" % x for x in range(1, n_iter + 1)] + ["Group"]);
-    plt.legend([boxes[0], boxes[1]], ["Training set", "Test set"], loc=(1, .3));
+    axes.set_yticklabels(
+        ["Split %d" % x for x in range(1, n_iter + 1)] + ["Group"])
+    plt.legend([boxes[0], boxes[1]], ["Training set", "Test set"], loc=(1, .3))
     plt.tight_layout()
 
 
-
 def plot_shuffle_split():
     from sklearn.model_selection import ShuffleSplit
     plt.figure(figsize=(10, 2))
-    plt.title("ShuffleSplit with 10 points, train_size=5, test_size=2, n_splits=4")
+    plt.title("ShuffleSplit with 10 points"
+              ", train_size=5, test_size=2, n_splits=4")
 
     axes = plt.gca()
     axes.set_frame_on(False)
@@ -71,13 +75,14 @@ def plot_shuffle_split():
         mask[i, train] = 1
         mask[i, test] = 2
 
-
     for i in range(n_folds):
         # test is grey
-        colors = ["grey"  if x == 2 else "white" for x in mask[:, i]]
+        colors = ["grey" if x == 2 else "white" for x in mask[:, i]]
         # not selected has no hatch
-        
-        boxes = axes.barh(bottom=range(n_iter), width=[1 - 0.1] * n_iter, left=i * n_samples_per_fold, height=.6, color=colors, hatch="//")
+
+        boxes = axes.barh(bottom=range(n_iter), width=[1 - 0.1] * n_iter,
+                          left=i * n_samples_per_fold, height=.6, color=colors,
+                          hatch="//", edgecolor='k')
         for j in np.where(mask[:, i] == 0)[0]:
             boxes[j].set_hatch("")
 
@@ -88,17 +93,16 @@ def plot_shuffle_split():
     axes.set_xticks(np.arange(n_samples) + .5)
     axes.set_xticklabels(np.arange(1, n_samples + 1))
     axes.set_yticks(np.arange(n_iter) + .3)
-    axes.set_yticklabels(["Split %d" % x for x in range(1, n_iter + 1)]);
+    axes.set_yticklabels(["Split %d" % x for x in range(1, n_iter + 1)])
     # legend hacked for this random state
-    plt.legend([boxes[1], boxes[0], boxes[2]], ["Training set", "Test set", "Not selected"], loc=(1, .3));
+    plt.legend([boxes[1], boxes[0], boxes[2]], [
+               "Training set", "Test set", "Not selected"], loc=(1, .3))
     plt.tight_layout()
-    plt.savefig("images/06_shuffle_split.png")
-    plt.close()
 
 
 def plot_stratified_cross_validation():
     fig, both_axes = plt.subplots(2, 1, figsize=(12, 5))
-    #plt.title("cross_validation_not_stratified")
+    # plt.title("cross_validation_not_stratified")
     axes = both_axes[0]
     axes.set_title("Standard cross-validation with sorted class labels")
 
@@ -109,25 +113,30 @@ def plot_stratified_cross_validation():
 
     n_samples_per_fold = n_samples / float(n_folds)
 
-
     for i in range(n_folds):
         colors = ["w"] * n_folds
         colors[i] = "grey"
-        axes.barh(bottom=range(n_folds), width=[n_samples_per_fold - 1] * n_folds, left=i * n_samples_per_fold, height=.6, color=colors, hatch="//")
-        
-    axes.barh(bottom=[n_folds] * n_folds, width=[n_samples_per_fold - 1] * n_folds, left=np.arange(3) * n_samples_per_fold, height=.6, color="w")
+        axes.barh(bottom=range(n_folds), width=[n_samples_per_fold - 1] *
+                  n_folds, left=i * n_samples_per_fold, height=.6,
+                  color=colors, hatch="//", edgecolor='k')
+
+    axes.barh(bottom=[n_folds] * n_folds, width=[n_samples_per_fold - 1] *
+              n_folds, left=np.arange(3) * n_samples_per_fold, height=.6,
+              color="w", edgecolor='k')
 
     axes.invert_yaxis()
     axes.set_xlim(0, n_samples + 1)
     axes.set_ylabel("CV iterations")
     axes.set_xlabel("Data points")
-    axes.set_xticks(np.arange(n_samples_per_fold / 2., n_samples, n_samples_per_fold))
+    axes.set_xticks(np.arange(n_samples_per_fold / 2.,
+                              n_samples, n_samples_per_fold))
     axes.set_xticklabels(["Fold %d" % x for x in range(1, n_folds + 1)])
     axes.set_yticks(np.arange(n_folds + 1) + .3)
-    axes.set_yticklabels(["Split %d" % x for x in range(1, n_folds + 1)] + ["Class label"])
+    axes.set_yticklabels(
+        ["Split %d" % x for x in range(1, n_folds + 1)] + ["Class label"])
     for i in range(3):
-        axes.text((i + .5) * n_samples_per_fold, 3.5, "Class %d" % i, horizontalalignment="center")
-
+        axes.text((i + .5) * n_samples_per_fold, 3.5, "Class %d" %
+                  i, horizontalalignment="center")
 
     ax = both_axes[1]
     ax.set_title("Stratified Cross-validation")
@@ -138,24 +147,38 @@ def plot_stratified_cross_validation():
     ax.set_xlabel("Data points")
 
     ax.set_yticks(np.arange(n_folds + 1) + .3)
-    ax.set_yticklabels(["Split %d" % x for x in range(1, n_folds + 1)] + ["Class label"]);
+    ax.set_yticklabels(
+        ["Split %d" % x for x in range(1, n_folds + 1)] + ["Class label"])
 
     n_subsplit = n_samples_per_fold / 3.
     for i in range(n_folds):
-        test_bars = ax.barh(bottom=[i] * n_folds, width=[n_subsplit - 1] * n_folds, left=np.arange(n_folds) * n_samples_per_fold + i * n_subsplit, height=.6, color="grey", hatch="//")
+        test_bars = ax.barh(
+            bottom=[i] * n_folds, width=[n_subsplit - 1] * n_folds,
+            left=np.arange(n_folds) * n_samples_per_fold + i * n_subsplit,
+            height=.6, color="grey", hatch="//", edgecolor='k')
 
     w = 2 * n_subsplit - 1
-    ax.barh(bottom=[0] * n_folds, width=[w] * n_folds, left=np.arange(n_folds) * n_samples_per_fold + (0 + 1) * n_subsplit, height=.6, color="w", hatch="//")
-    ax.barh(bottom=[1] * (n_folds + 1), width=[w / 2., w, w, w / 2.], left=np.maximum(0, np.arange(n_folds + 1) * n_samples_per_fold - n_subsplit), height=.6, color="w", hatch="//")
-    training_bars = ax.barh(bottom=[2] * n_folds, width=[w] * n_folds, left=np.arange(n_folds) * n_samples_per_fold , height=.6, color="w", hatch="//")
-
-        
-    ax.barh(bottom=[n_folds] * n_folds, width=[n_samples_per_fold - 1] * n_folds, left=np.arange(n_folds) * n_samples_per_fold, height=.6, color="w")
+    ax.barh(bottom=[0] * n_folds, width=[w] * n_folds, left=np.arange(n_folds)
+            * n_samples_per_fold + (0 + 1) * n_subsplit, height=.6, color="w",
+            hatch="//", edgecolor='k')
+    ax.barh(bottom=[1] * (n_folds + 1), width=[w / 2., w, w, w / 2.],
+            left=np.maximum(0, np.arange(n_folds + 1) * n_samples_per_fold -
+                            n_subsplit), height=.6, color="w", hatch="//",
+            edgecolor='k')
+    training_bars = ax.barh(bottom=[2] * n_folds, width=[w] * n_folds,
+                            left=np.arange(n_folds) * n_samples_per_fold,
+                            height=.6, color="w", hatch="//", edgecolor='k')
+
+    ax.barh(bottom=[n_folds] * n_folds, width=[n_samples_per_fold - 1] *
+            n_folds, left=np.arange(n_folds) * n_samples_per_fold, height=.6,
+            color="w", edgecolor='k')
 
     for i in range(3):
-        ax.text((i + .5) * n_samples_per_fold, 3.5, "Class %d" % i, horizontalalignment="center")
+        ax.text((i + .5) * n_samples_per_fold, 3.5, "Class %d" %
+                i, horizontalalignment="center")
     ax.set_ylim(4, -0.1)
-    plt.legend([training_bars[0], test_bars[0]], ['Training data', 'Test data'], loc=(1.05, 1), frameon=False);
+    plt.legend([training_bars[0], test_bars[0]], [
+               'Training data', 'Test data'], loc=(1.05, 1), frameon=False)
 
     fig.tight_layout()
 
@@ -171,33 +194,43 @@ def plot_cross_validation():
 
     n_samples_per_fold = n_samples / float(n_folds)
 
-
     for i in range(n_folds):
         colors = ["w"] * n_folds
         colors[i] = "grey"
-        bars = plt.barh(bottom=range(n_folds), width=[n_samples_per_fold - 0.1] * n_folds,
-                 left=i * n_samples_per_fold, height=.6, color=colors, hatch="//")
+        bars = plt.barh(
+            bottom=range(n_folds), width=[n_samples_per_fold - 0.1] * n_folds,
+            left=i * n_samples_per_fold, height=.6, color=colors, hatch="//",
+            edgecolor='k')
     axes.invert_yaxis()
     axes.set_xlim(0, n_samples + 1)
     plt.ylabel("CV iterations")
     plt.xlabel("Data points")
-    plt.xticks(np.arange(n_samples_per_fold / 2., n_samples, n_samples_per_fold), ["Fold %d" % x for x in range(1, n_folds + 1)])
-    plt.yticks(np.arange(n_folds) + .3, ["Split %d" % x for x in range(1, n_folds + 1)])
-    plt.legend([bars[0], bars[4]], ['Training data', 'Test data'], loc=(1.05, 0.4), frameon=False);
+    plt.xticks(np.arange(n_samples_per_fold / 2., n_samples,
+                         n_samples_per_fold),
+               ["Fold %d" % x for x in range(1, n_folds + 1)])
+    plt.yticks(np.arange(n_folds) + .3,
+               ["Split %d" % x for x in range(1, n_folds + 1)])
+    plt.legend([bars[0], bars[4]], ['Training data', 'Test data'],
+               loc=(1.05, 0.4), frameon=False)
 
 
 def plot_threefold_split():
     plt.figure(figsize=(15, 1))
     axis = plt.gca()
-    bars = axis.barh([0, 0, 0], [11.9, 2.9, 4.9], left=[0, 12, 15], color=['white', 'grey', 'grey'], hatch="//")
+    bars = axis.barh([0, 0, 0], [11.9, 2.9, 4.9], left=[0, 12, 15], color=[
+                     'white', 'grey', 'grey'], hatch="//", edgecolor='k')
     bars[2].set_hatch(r"")
     axis.set_yticks(())
     axis.set_frame_on(False)
     axis.set_ylim(-.1, .8)
     axis.set_xlim(-0.1, 20.1)
     axis.set_xticks([6, 13.3, 17.5])
-    axis.set_xticklabels(["training set", "validation set", "test set"], fontdict={'fontsize': 20});
+    axis.set_xticklabels(["training set", "validation set",
+                          "test set"], fontdict={'fontsize': 20})
     axis.tick_params(length=0, labeltop=True, labelbottom=False)
-    axis.text(6, -.3, "Model fitting", fontdict={'fontsize': 13}, horizontalalignment="center")
-    axis.text(13.3, -.3, "Parameter selection", fontdict={'fontsize': 13}, horizontalalignment="center")
-    axis.text(17.5, -.3, "Evaluation", fontdict={'fontsize': 13}, horizontalalignment="center")
+    axis.text(6, -.3, "Model fitting",
+              fontdict={'fontsize': 13}, horizontalalignment="center")
+    axis.text(13.3, -.3, "Parameter selection",
+              fontdict={'fontsize': 13}, horizontalalignment="center")
+    axis.text(17.5, -.3, "Evaluation",
+              fontdict={'fontsize': 13}, horizontalalignment="center")
diff --git a/mglearn/plot_grid_search.py b/mglearn/plot_grid_search.py
@@ -23,11 +23,11 @@ def plot_cross_val_selection():
     plt.xlim(-1, len(results))
     plt.ylim(0, 1.1)
     for i, (_, row) in enumerate(results.iterrows()):
-        scores = row[['test_split%d_score' % i for i in range(5)]]
+        scores = row[['test_split%d_test_score' % i for i in range(5)]]
         marker_cv, = plt.plot([i] * 5, scores, '^', c='gray', markersize=5,
                               alpha=.5)
         marker_mean, = plt.plot(i, row.mean_test_score, 'v', c='none', alpha=1,
-                                markersize=10)
+                                markersize=10, markeredgecolor='k')
         if i == best:
             marker_best, = plt.plot(i, row.mean_test_score, 'o', c='red',
                                     fillstyle="none", alpha=1, markersize=20,
@@ -44,7 +44,7 @@ def plot_cross_val_selection():
 
 
 def plot_grid_search_overview():
-    plt.figure(figsize=(10, 3))
+    plt.figure(figsize=(10, 3), dpi=70)
     axes = plt.gca()
     axes.yaxis.set_visible(False)
     axes.xaxis.set_visible(False)