From ea60cf6cf791553b6cca7cf31802c68cb3798ebb Mon Sep 17 00:00:00 2001
From: Andreas C Mueller
Date: Wed, 13 Mar 2024 19:45:57 -0700
Subject: [PATCH] get updates from mglearn repo

---
 mglearn/__init__.py                       |  2 +
 mglearn/datasets.py                       | 19 +++++++--
 mglearn/plot_2d_separator.py              | 50 +++--------------------
 mglearn/plot_grid_search.py               |  3 +-
 mglearn/plot_kneighbors_regularization.py |  7 +---
 mglearn/plot_linear_svc_regularization.py |  4 --
 mglearn/plot_nmf.py                       |  6 ++-
 mglearn/plot_pca.py                       |  7 +++-
 mglearn/plots.py                          |  4 +-
 mglearn/tools.py                          |  2 +-
 10 files changed, 39 insertions(+), 65 deletions(-)

diff --git a/mglearn/__init__.py b/mglearn/__init__.py
index 8b2cb09..38be245 100644
--- a/mglearn/__init__.py
+++ b/mglearn/__init__.py
@@ -4,4 +4,6 @@
 from .tools import discrete_scatter
 from .plot_helpers import ReBl
 
+__version__ = "0.2.0"
+
 __all__ = ['tools', 'plots', 'cm3', 'cm2', 'discrete_scatter', 'ReBl']
diff --git a/mglearn/datasets.py b/mglearn/datasets.py
index 961a799..6d29925 100644
--- a/mglearn/datasets.py
+++ b/mglearn/datasets.py
@@ -2,18 +2,18 @@
 import pandas as pd
 import os
 from scipy import signal
-from sklearn.datasets import load_boston
 from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
 from sklearn.datasets import make_blobs
+from sklearn.utils import Bunch
 
-DATA_PATH = os.path.join(os.path.dirname(__file__), "..", "data")
+DATA_PATH = os.path.join(os.path.dirname(__file__), "data")
 
 
 def make_forge():
     # a carefully hand-designed dataset lol
     X, y = make_blobs(centers=2, random_state=4, n_samples=30)
     y[np.array([7, 27])] = 0
-    mask = np.ones(len(X), dtype=np.bool)
+    mask = np.ones(len(X), dtype=bool)
     mask[np.array([0, 1, 5, 26])] = 0
     X, y = X[mask], y[mask]
     return X, y
@@ -27,6 +27,19 @@ def make_wave(n_samples=100):
     return x.reshape(-1, 1), y
 
 
+def load_boston():
+    try:
+        from sklearn.datasets import load_boston
+        return load_boston()
+    except ImportError:
+        pass
+    data_url = "http://lib.stat.cmu.edu/datasets/boston"
+    raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
+    data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
+    target = raw_df.values[1::2, 2]
+    return Bunch(data=data, target=target)
+
+
 def load_extended_boston():
     boston = load_boston()
     X = boston.data
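Note: the load_boston shim added above is needed because scikit-learn deprecated load_boston in 1.0 and removed it in 1.2. When the import fails, the dataset is rebuilt from the original CMU StatLib source, where each record spans two physical rows (hence the [::2]/[1::2] interleaving). A minimal sketch of how callers see the shim, assuming network access when scikit-learn no longer ships the dataset:

    from mglearn.datasets import load_boston, load_extended_boston

    boston = load_boston()
    print(boston.data.shape, boston.target.shape)  # (506, 13) (506,)

    # load_extended_boston rescales the 13 features and adds degree-2
    # polynomial interactions, for 104 derived features in total
    X, y = load_extended_boston()
    print(X.shape)  # (506, 104)
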
diff --git a/mglearn/plot_2d_separator.py b/mglearn/plot_2d_separator.py
index 25b549a..943d216 100644
--- a/mglearn/plot_2d_separator.py
+++ b/mglearn/plot_2d_separator.py
@@ -2,34 +2,6 @@
 import matplotlib.pyplot as plt
 from .plot_helpers import cm2, cm3, discrete_scatter
 
-def _call_classifier_chunked(classifier_pred_or_decide, X):
-    # The chunk_size is used to chunk the large arrays to work with x86
-    # memory models that are restricted to < 2 GB in memory allocation. The
-    # chunk_size value used here is based on a measurement with the
-    # MLPClassifier using the following parameters:
-    #     MLPClassifier(solver='lbfgs', random_state=0,
-    #                   hidden_layer_sizes=[1000,1000,1000])
-    # by reducing the value it is possible to trade in time for memory.
-    # It is possible to chunk the array as the calculations are independent of
-    # each other.
-    # Note: an intermittent version made a distinction between
-    # 32- and 64 bit architectures avoiding the chunking. Testing revealed
-    # that even on 64 bit architectures the chunking increases the
-    # performance by a factor of 3-5, largely due to the avoidance of memory
-    # swapping.
-    chunk_size = 10000
-
-    # We use a list to collect all result chunks
-    Y_result_chunks = []
-
-    # Call the classifier in chunks.
-    for x_chunk in np.array_split(X, np.arange(chunk_size, X.shape[0],
-                                               chunk_size, dtype=np.int32),
-                                  axis=0):
-        Y_result_chunks.append(classifier_pred_or_decide(x_chunk))
-
-    return np.concatenate(Y_result_chunks)
-
 
 def plot_2d_classification(classifier, X, fill=False, ax=None, eps=None,
                            alpha=1, cm=cm3):
@@ -110,16 +82,14 @@ def plot_2d_separator(classifier, X, fill=False, ax=None, eps=None, alpha=1,
 
     X1, X2 = np.meshgrid(xx, yy)
     X_grid = np.c_[X1.ravel(), X2.ravel()]
-    if hasattr(classifier, "decision_function"):
-        decision_values = _call_classifier_chunked(classifier.decision_function,
-                                                   X_grid)
+    try:
+        decision_values = classifier.decision_function(X_grid)
         levels = [0] if threshold is None else [threshold]
         fill_levels = [decision_values.min()] + levels + [
             decision_values.max()]
-    else:
+    except AttributeError:
         # no decision_function
-        decision_values = _call_classifier_chunked(classifier.predict_proba,
-                                                   X_grid)[:, 1]
+        decision_values = classifier.predict_proba(X_grid)[:, 1]
         levels = [.5] if threshold is None else [threshold]
         fill_levels = [0] + levels + [1]
     if fill:
@@ -133,14 +103,4 @@ def plot_2d_separator(classifier, X, fill=False, ax=None, eps=None, alpha=1,
     ax.set_xlim(x_min, x_max)
     ax.set_ylim(y_min, y_max)
     ax.set_xticks(())
-    ax.set_yticks(())
-
-
-if __name__ == '__main__':
-    from sklearn.datasets import make_blobs
-    from sklearn.linear_model import LogisticRegression
-    X, y = make_blobs(centers=2, random_state=42)
-    clf = LogisticRegression(solver='lbfgs').fit(X, y)
-    plot_2d_separator(clf, X, fill=True)
-    discrete_scatter(X[:, 0], X[:, 1], y)
-    plt.show()
+    ax.set_yticks(())
\ No newline at end of file
diff --git a/mglearn/plot_grid_search.py b/mglearn/plot_grid_search.py
index fe805d5..884bae7 100644
--- a/mglearn/plot_grid_search.py
+++ b/mglearn/plot_grid_search.py
@@ -33,9 +33,8 @@ def plot_cross_val_selection():
         marker_best, = plt.plot(i, row.mean_test_score, 'o', c='red',
                                 fillstyle="none", alpha=1, markersize=20,
                                 markeredgewidth=3)
-
     plt.xticks(range(len(results)), [str(x).strip("{}").replace("'", "") for x
-                                     in grid_search.cv_results_['params']],
+                                     in results['params']],
               rotation=90)
     plt.ylabel("Validation accuracy")
     plt.xlabel("Parameter settings")
diff --git a/mglearn/plot_kneighbors_regularization.py b/mglearn/plot_kneighbors_regularization.py
index 825dacf..71b1e94 100644
--- a/mglearn/plot_kneighbors_regularization.py
+++ b/mglearn/plot_kneighbors_regularization.py
@@ -22,9 +22,4 @@ def plot_kneighbors_regularization():
         ax.plot(x_test, kneighbor_regression.predict(x_test[:, np.newaxis]),
                 label="prediction")
         ax.legend()
-        ax.set_title("n_neighbors = %d" % n_neighbors)
-
-
-if __name__ == "__main__":
-    plot_kneighbors_regularization()
-    plt.show()
+        ax.set_title("n_neighbors = %d" % n_neighbors)
\ No newline at end of file
diff --git a/mglearn/plot_linear_svc_regularization.py b/mglearn/plot_linear_svc_regularization.py
index 5f8e0dc..3ee92b9 100644
--- a/mglearn/plot_linear_svc_regularization.py
+++ b/mglearn/plot_linear_svc_regularization.py
@@ -31,7 +31,3 @@ def plot_linear_svc_regularization():
         ax.set_yticks(())
         ax.set_title("C = %f" % C)
     axes[0].legend(loc="best")
-
-if __name__ == "__main__":
-    plot_linear_svc_regularization()
-    plt.show()
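Note: with _call_classifier_chunked gone, plot_2d_separator (first file diff above) uses a plain EAFP dispatch: try decision_function first and fall back to predict_proba on AttributeError, rather than probing with hasattr. The same pattern in isolation, with illustrative estimators (any pair with and without decision_function would do):

    from sklearn.datasets import make_blobs
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import SVC

    X, y = make_blobs(centers=2, random_state=42)
    for clf in (SVC().fit(X, y), RandomForestClassifier(random_state=0).fit(X, y)):
        try:
            # SVC exposes decision_function, so it takes this branch
            values = clf.decision_function(X)
        except AttributeError:
            # random forests only provide predict_proba
            values = clf.predict_proba(X)[:, 1]
        print(type(clf).__name__, values.shape)
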
diff --git a/mglearn/plot_nmf.py b/mglearn/plot_nmf.py
index dbea642..2651f95 100644
--- a/mglearn/plot_nmf.py
+++ b/mglearn/plot_nmf.py
@@ -4,7 +4,11 @@
 
 from joblib import Memory
 
-memory = Memory(location="cache")
+try:
+    memory = Memory(cachedir="cache")
+except TypeError:
+    # joblib.Memory changed its API in 0.12
+    memory = Memory(location="cache", verbose=0)
 
 
 def plot_nmf_illustration():
diff --git a/mglearn/plot_pca.py b/mglearn/plot_pca.py
index 25ce634..65a9ded 100644
--- a/mglearn/plot_pca.py
+++ b/mglearn/plot_pca.py
@@ -4,8 +4,11 @@
 
 from joblib import Memory
 
-memory = Memory(location="cache")
-
+try:
+    memory = Memory(cachedir="cache")
+except TypeError:
+    # joblib.Memory changed its API in 0.12
+    memory = Memory(location="cache", verbose=0)
 
 def plot_pca_illustration():
     rnd = np.random.RandomState(5)
diff --git a/mglearn/plots.py b/mglearn/plots.py
index 4dcb507..b275a3f 100644
--- a/mglearn/plots.py
+++ b/mglearn/plots.py
@@ -28,6 +28,7 @@
                            plot_decision_threshold)
 from .plot_dbscan import plot_dbscan
 from .plot_ridge import plot_ridge_n_samples
+from .plot_kneighbors_regularization import plot_kneighbors_regularization
 
 __all__ = ['plot_linear_svc_regularization', "plot_animal_tree",
            "plot_tree_progressive",
@@ -65,5 +66,6 @@
            'plot_binary_confusion_matrix',
            'plot_decision_threshold',
            'plot_dbscan',
-           'plot_ridge_n_samples'
+           'plot_ridge_n_samples',
+           'plot_kneighbors_regularization'
            ]
diff --git a/mglearn/tools.py b/mglearn/tools.py
index b67686e..f6b7dd1 100644
--- a/mglearn/tools.py
+++ b/mglearn/tools.py
@@ -85,7 +85,7 @@ def make_handcrafted_dataset():
     # a carefully hand-designed dataset lol
     X, y = make_blobs(centers=2, random_state=4, n_samples=30)
     y[np.array([7, 27])] = 0
-    mask = np.ones(len(X), dtype=np.bool)
+    mask = np.ones(len(X), dtype=bool)
     mask[np.array([0, 1, 5, 26])] = 0
     X, y = X[mask], y[mask]
     return X, y
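Note: the try/except around joblib.Memory in plot_nmf.py and plot_pca.py keeps both joblib APIs working: releases before 0.12 only accept cachedir, while 0.12 renamed it to location and later releases reject cachedir with a TypeError. (Likewise, the dtype=np.bool replacements in datasets.py and tools.py track NumPy's removal of the deprecated np.bool alias in NumPy 1.24.) A sketch of the same compatibility pattern; the cached function here is illustrative:

    from joblib import Memory

    try:
        memory = Memory(cachedir="cache")              # joblib < 0.12
    except TypeError:
        memory = Memory(location="cache", verbose=0)   # joblib >= 0.12

    @memory.cache
    def expensive(n):
        # recomputed only for argument values not seen before
        return sum(i * i for i in range(n))

    print(expensive(10))  # 285; repeated calls are read back from "cache"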