diff --git a/mglearn/__init__.py b/mglearn/__init__.py
index 8b2cb09..38be245 100644
--- a/mglearn/__init__.py
+++ b/mglearn/__init__.py
@@ -4,4 +4,6 @@
 from .tools import discrete_scatter
 from .plot_helpers import ReBl
 
+__version__ = "0.2.0"
+
 __all__ = ['tools', 'plots', 'cm3', 'cm2', 'discrete_scatter', 'ReBl']
diff --git a/mglearn/datasets.py b/mglearn/datasets.py
index 961a799..6d29925 100644
--- a/mglearn/datasets.py
+++ b/mglearn/datasets.py
@@ -2,18 +2,18 @@
 import pandas as pd
 import os
 from scipy import signal
-from sklearn.datasets import load_boston
 from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
 from sklearn.datasets import make_blobs
+from sklearn.utils import Bunch
 
-DATA_PATH = os.path.join(os.path.dirname(__file__), "..", "data")
+DATA_PATH = os.path.join(os.path.dirname(__file__), "data")
 
 
 def make_forge():
     # a carefully hand-designed dataset lol
     X, y = make_blobs(centers=2, random_state=4, n_samples=30)
     y[np.array([7, 27])] = 0
-    mask = np.ones(len(X), dtype=np.bool)
+    mask = np.ones(len(X), dtype=bool)
     mask[np.array([0, 1, 5, 26])] = 0
     X, y = X[mask], y[mask]
     return X, y
@@ -27,6 +27,22 @@ def make_wave(n_samples=100):
     return x.reshape(-1, 1), y
 
 
+def load_boston():
+    try:
+        from sklearn.datasets import load_boston
+        return load_boston()
+    except ImportError:
+        pass
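+    # scikit-learn 1.2 removed load_boston; rebuild the dataset from the
+    # original CMU source, where each record spans two rows: 11 features
+    # on the first, two more features plus the MEDV target on the second.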
+    data_url = "http://lib.stat.cmu.edu/datasets/boston"
+    raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
+    data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
+    target = raw_df.values[1::2, 2]
+    return Bunch(data=data, target=target)
+
+
 def load_extended_boston():
     boston = load_boston()
     X = boston.data
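Review note: scikit-learn 1.2 removed load_boston, and the shim above falls
back to re-parsing the original CMU source when the import fails. A minimal
smoke test, assuming network access to lib.stat.cmu.edu on newer
scikit-learn installs:

    from mglearn.datasets import load_boston

    boston = load_boston()
    assert boston.data.shape == (506, 13)
    assert boston.target.shape == (506,)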
diff --git a/mglearn/plot_2d_separator.py b/mglearn/plot_2d_separator.py
index 25b549a..943d216 100644
--- a/mglearn/plot_2d_separator.py
+++ b/mglearn/plot_2d_separator.py
@@ -2,34 +2,6 @@
 import matplotlib.pyplot as plt
 from .plot_helpers import cm2, cm3, discrete_scatter
 
-def _call_classifier_chunked(classifier_pred_or_decide, X):
-    # The chunk_size is used to chunk the large arrays to work with x86
-    # memory models that are restricted to < 2 GB in memory allocation. The
-    # chunk_size value used here is based on a measurement with the
-    # MLPClassifier using the following parameters:
-    # MLPClassifier(solver='lbfgs', random_state=0,
-    #               hidden_layer_sizes=[1000,1000,1000])
-    # by reducing the value it is possible to trade in time for memory.
-    # It is possible to chunk the array as the calculations are independent of
-    # each other.
-    # Note: an intermittent version made a distinction between
-    # 32- and 64 bit architectures avoiding the chunking. Testing revealed
-    # that even on 64 bit architectures the chunking increases the
-    # performance by a factor of 3-5, largely due to the avoidance of memory
-    # swapping.
-    chunk_size = 10000
-
-    # We use a list to collect all result chunks
-    Y_result_chunks = []
-
-    # Call the classifier in chunks.
-    for x_chunk in np.array_split(X, np.arange(chunk_size, X.shape[0],
-                                               chunk_size, dtype=np.int32),
-                                  axis=0):
-        Y_result_chunks.append(classifier_pred_or_decide(x_chunk))
-
-    return np.concatenate(Y_result_chunks)
-
 
 def plot_2d_classification(classifier, X, fill=False, ax=None, eps=None,
                            alpha=1, cm=cm3):
@@ -110,16 +82,14 @@ def plot_2d_separator(classifier, X, fill=False, ax=None, eps=None, alpha=1,
 
     X1, X2 = np.meshgrid(xx, yy)
     X_grid = np.c_[X1.ravel(), X2.ravel()]
-    if hasattr(classifier, "decision_function"):
-        decision_values = _call_classifier_chunked(classifier.decision_function,
-                                                   X_grid)
+    try:
+        decision_values = classifier.decision_function(X_grid)
         levels = [0] if threshold is None else [threshold]
         fill_levels = [decision_values.min()] + levels + [
             decision_values.max()]
-    else:
+    except AttributeError:
         # no decision_function
-        decision_values = _call_classifier_chunked(classifier.predict_proba,
-                                                   X_grid)[:, 1]
+        decision_values = classifier.predict_proba(X_grid)[:, 1]
         levels = [.5] if threshold is None else [threshold]
         fill_levels = [0] + levels + [1]
     if fill:
@@ -133,14 +103,4 @@ def plot_2d_separator(classifier, X, fill=False, ax=None, eps=None, alpha=1,
     ax.set_xlim(x_min, x_max)
     ax.set_ylim(y_min, y_max)
     ax.set_xticks(())
     ax.set_yticks(())
-
-
-if __name__ == '__main__':
-    from sklearn.datasets import make_blobs
-    from sklearn.linear_model import LogisticRegression
-    X, y = make_blobs(centers=2, random_state=42)
-    clf = LogisticRegression(solver='lbfgs').fit(X, y)
-    plot_2d_separator(clf, X, fill=True)
-    discrete_scatter(X[:, 0], X[:, 1], y)
-    plt.show()
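Review note: replacing the hasattr() check with try/except is idiomatic EAFP,
but catching AttributeError around the whole call will also swallow an
AttributeError raised inside a buggy decision_function. A narrower sketch
(a hypothetical refactor, not part of this patch) guards only the lookup:

    # guard the attribute lookup, not the call itself
    decision_function = getattr(classifier, "decision_function", None)
    if decision_function is not None:
        decision_values = decision_function(X_grid)
    else:
        decision_values = classifier.predict_proba(X_grid)[:, 1]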
diff --git a/mglearn/plot_grid_search.py b/mglearn/plot_grid_search.py
index fe805d5..884bae7 100644
--- a/mglearn/plot_grid_search.py
+++ b/mglearn/plot_grid_search.py
@@ -33,9 +33,8 @@ def plot_cross_val_selection():
             marker_best, = plt.plot(i, row.mean_test_score, 'o', c='red',
                                     fillstyle="none", alpha=1, markersize=20,
                                     markeredgewidth=3)
-
     plt.xticks(range(len(results)), [str(x).strip("{}").replace("'", "") for x
-                                     in grid_search.cv_results_['params']],
+                                     in results['params']],
                rotation=90)
     plt.ylabel("Validation accuracy")
     plt.xlabel("Parameter settings")
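Review note: reading results['params'] instead of
grid_search.cv_results_['params'] keeps the function on the local DataFrame
it already iterates over. Assuming results was built as
pd.DataFrame(grid_search.cv_results_), the two hold the same dicts; a
self-contained check:

    import pandas as pd
    from sklearn.datasets import make_blobs
    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC

    X, y = make_blobs(random_state=0)
    grid_search = GridSearchCV(SVC(), {'C': [0.1, 1, 10]}, cv=3).fit(X, y)
    results = pd.DataFrame(grid_search.cv_results_)
    # same labels either way
    labels = [str(p).strip("{}").replace("'", "") for p in results['params']]
    print(labels)  # ['C: 0.1', 'C: 1', 'C: 10']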
diff --git a/mglearn/plot_kneighbors_regularization.py b/mglearn/plot_kneighbors_regularization.py
index 825dacf..71b1e94 100644
--- a/mglearn/plot_kneighbors_regularization.py
+++ b/mglearn/plot_kneighbors_regularization.py
@@ -22,9 +22,4 @@ def plot_kneighbors_regularization():
         ax.plot(x_test, kneighbor_regression.predict(x_test[:, np.newaxis]),
                 label="prediction")
         ax.legend()
         ax.set_title("n_neighbors = %d" % n_neighbors)
-
-
-if __name__ == "__main__":
-    plot_kneighbors_regularization()
-    plt.show()
diff --git a/mglearn/plot_linear_svc_regularization.py b/mglearn/plot_linear_svc_regularization.py
index 5f8e0dc..3ee92b9 100644
--- a/mglearn/plot_linear_svc_regularization.py
+++ b/mglearn/plot_linear_svc_regularization.py
@@ -31,7 +31,3 @@ def plot_linear_svc_regularization():
         ax.set_yticks(())
         ax.set_title("C = %f" % C)
     axes[0].legend(loc="best")
-
-if __name__ == "__main__":
-    plot_linear_svc_regularization()
-    plt.show()
diff --git a/mglearn/plot_nmf.py b/mglearn/plot_nmf.py
index dbea642..2651f95 100644
--- a/mglearn/plot_nmf.py
+++ b/mglearn/plot_nmf.py
@@ -4,7 +4,11 @@
 
 from joblib import Memory
 
-memory = Memory(location="cache")
+try:
+    memory = Memory(cachedir="cache")
+except TypeError:
+    # joblib.Memory changed its API in 0.12
+    memory = Memory(location="cache", verbose=0)
 
 
 def plot_nmf_illustration():
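Review note: the try/except covers both joblib APIs (cachedir before 0.12,
location from 0.12 on); the same shim appears in plot_pca.py below. An
explicit alternative, sketched with signature inspection rather than
exception handling:

    import inspect
    from joblib import Memory

    # pick the keyword this joblib version actually supports
    if "location" in inspect.signature(Memory.__init__).parameters:
        memory = Memory(location="cache", verbose=0)
    else:
        memory = Memory(cachedir="cache")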
diff --git a/mglearn/plot_pca.py b/mglearn/plot_pca.py
index 25ce634..65a9ded 100644
--- a/mglearn/plot_pca.py
+++ b/mglearn/plot_pca.py
@@ -4,8 +4,12 @@
 
 from joblib import Memory
 
-memory = Memory(location="cache")
-
+try:
+    memory = Memory(cachedir="cache")
+except TypeError:
+    # joblib.Memory changed its API in 0.12
+    memory = Memory(location="cache", verbose=0)
+
 
 def plot_pca_illustration():
     rnd = np.random.RandomState(5)
diff --git a/mglearn/plots.py b/mglearn/plots.py
index 4dcb507..b275a3f 100644
--- a/mglearn/plots.py
+++ b/mglearn/plots.py
@@ -28,6 +28,7 @@
                            plot_decision_threshold)
 from .plot_dbscan import plot_dbscan
 from .plot_ridge import plot_ridge_n_samples
+from .plot_kneighbors_regularization import plot_kneighbors_regularization
 
 __all__ = ['plot_linear_svc_regularization',
            "plot_animal_tree", "plot_tree_progressive",
@@ -65,5 +66,6 @@
            'plot_binary_confusion_matrix',
            'plot_decision_threshold',
            'plot_dbscan',
-           'plot_ridge_n_samples'
+           'plot_ridge_n_samples',
+           'plot_kneighbors_regularization'
            ]
diff --git a/mglearn/tools.py b/mglearn/tools.py
index b67686e..f6b7dd1 100644
--- a/mglearn/tools.py
+++ b/mglearn/tools.py
@@ -85,7 +85,7 @@ def make_handcrafted_dataset():
     # a carefully hand-designed dataset lol
     X, y = make_blobs(centers=2, random_state=4, n_samples=30)
     y[np.array([7, 27])] = 0
-    mask = np.ones(len(X), dtype=np.bool)
+    mask = np.ones(len(X), dtype=bool)
     mask[np.array([0, 1, 5, 26])] = 0
     X, y = X[mask], y[mask]
     return X, y
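Review note: np.bool was deprecated in NumPy 1.20 and removed in 1.24; the
builtin bool gives the same boolean dtype on every NumPy version:

    import numpy as np

    mask = np.ones(5, dtype=bool)
    assert mask.dtype == np.dtype(bool)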