From c194b8901bf151eb5b329629df25d93098896d3a Mon Sep 17 00:00:00 2001 From: behreth <behreth@gmail.com> Date: Tue, 11 Dec 2018 22:16:12 +0100 Subject: [PATCH 1/2] This closes issue amueller/introduction_to_ml_with_python#67 memory error on 32bit Python Main change: - Created chunking logic to call the classifier with a maximum number of tests (detailed description as code comment). In addition the following change was made: - Replaced the try/catch with an explicit check for the available function either decision_function or predict_proba. --- mglearn/plot_2d_separator.py | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/mglearn/plot_2d_separator.py b/mglearn/plot_2d_separator.py index 9b32028..e9ed0b9 100644 --- a/mglearn/plot_2d_separator.py +++ b/mglearn/plot_2d_separator.py @@ -2,6 +2,34 @@ import matplotlib.pyplot as plt from .plot_helpers import cm2, cm3, discrete_scatter +def _call_classifier_chunked(classifier_pred_or_decide, X): + + + # The chunk_size is used to chunk the large arrays to work with x86 memory + # models that are restricted to < 2 GB in memory allocation. + # The chunk_size value used here is based on a measurement with the MLPClassifier + # using the following parameters: + # MLPClassifier(solver='lbfgs', random_state=0, hidden_layer_sizes=[1000,1000,1000]) + # by reducing the value it is possible to trade in time for memory. + # It is possible to chunk the array as the calculations are independent of each other. + # Note: an intermittent version made a distinction between 32- and 64 bit architectures + # avoiding the chunking. Testing revealed that even on 64 bit architectures the chunking + # increases the performance by a factor of 3-5, largely due to the avoidance of memory + # swapping. + chunk_size = 10000 + X_axis0_size = X.shape[0] + + # We use a list to collect all result chunks + Y_result_chunks = [] + + # Call the classifier in chunks. 
+ y_chunk_pos = 0 + for x_chunk in np.array_split(X, np.arange(chunk_size,X_axis0_size,chunk_size,dtype=np.int32), axis=0): + Y_result_chunks.append(classifier_pred_or_decide(x_chunk)) + y_chunk_pos += x_chunk.shape[0] + + return np.concatenate(Y_result_chunks) + def plot_2d_classification(classifier, X, fill=False, ax=None, eps=None, alpha=1, cm=cm3): @@ -82,14 +110,14 @@ def plot_2d_separator(classifier, X, fill=False, ax=None, eps=None, alpha=1, X1, X2 = np.meshgrid(xx, yy) X_grid = np.c_[X1.ravel(), X2.ravel()] - try: - decision_values = classifier.decision_function(X_grid) + if hasattr(classifier, "decision_function"): + decision_values = _call_classifier_chunked(classifier.decision_function, X_grid) levels = [0] if threshold is None else [threshold] fill_levels = [decision_values.min()] + levels + [ decision_values.max()] - except AttributeError: + else: # no decision_function - decision_values = classifier.predict_proba(X_grid)[:, 1] + decision_values = _call_classifier_chunked(classifier.predict_proba, X_grid)[:, 1] levels = [.5] if threshold is None else [threshold] fill_levels = [0] + levels + [1] if fill: From bb34a1f7f3008f3ec8e3a2da79fd53189a17cb63 Mon Sep 17 00:00:00 2001 From: behreth <behreth@gmail.com> Date: Thu, 13 Dec 2018 14:26:59 +0100 Subject: [PATCH 2/2] This closes issue amueller/introduction_to_ml_with_python#67 memory error on 32bit Python Minor refinements due to PR feedback - Removed unnecessary variables and inlined one-time used variable. 
- Re-introduced the originally intended solver solver='lbfgs' - Adhered to PEP8, breaking lines and comments accordingly --- mglearn/plot_2d_separator.py | 38 +++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/mglearn/plot_2d_separator.py b/mglearn/plot_2d_separator.py index e9ed0b9..25b549a 100644 --- a/mglearn/plot_2d_separator.py +++ b/mglearn/plot_2d_separator.py @@ -3,30 +3,30 @@ from .plot_helpers import cm2, cm3, discrete_scatter def _call_classifier_chunked(classifier_pred_or_decide, X): - - - # The chunk_size is used to chunk the large arrays to work with x86 memory - # models that are restricted to < 2 GB in memory allocation. - # The chunk_size value used here is based on a measurement with the MLPClassifier - # using the following parameters: - # MLPClassifier(solver='lbfgs', random_state=0, hidden_layer_sizes=[1000,1000,1000]) + # The chunk_size is used to chunk the large arrays to work with x86 + # memory models that are restricted to < 2 GB in memory allocation. The + # chunk_size value used here is based on a measurement with the + # MLPClassifier using the following parameters: + # MLPClassifier(solver='lbfgs', random_state=0, + # hidden_layer_sizes=[1000,1000,1000]) # by reducing the value it is possible to trade in time for memory. - # It is possible to chunk the array as the calculations are independent of each other. - # Note: an intermittent version made a distinction between 32- and 64 bit architectures - # avoiding the chunking. Testing revealed that even on 64 bit architectures the chunking - # increases the performance by a factor of 3-5, largely due to the avoidance of memory + # It is possible to chunk the array as the calculations are independent of + # each other. + # Note: an intermediate version made a distinction between + # 32- and 64 bit architectures avoiding the chunking. 
Testing revealed + # that even on 64 bit architectures the chunking increases the + # performance by a factor of 3-5, largely due to the avoidance of memory # swapping. chunk_size = 10000 - X_axis0_size = X.shape[0] # We use a list to collect all result chunks Y_result_chunks = [] # Call the classifier in chunks. - y_chunk_pos = 0 - for x_chunk in np.array_split(X, np.arange(chunk_size,X_axis0_size,chunk_size,dtype=np.int32), axis=0): + for x_chunk in np.array_split(X, np.arange(chunk_size, X.shape[0], + chunk_size, dtype=np.int32), + axis=0): Y_result_chunks.append(classifier_pred_or_decide(x_chunk)) - y_chunk_pos += x_chunk.shape[0] return np.concatenate(Y_result_chunks) @@ -111,13 +111,15 @@ def plot_2d_separator(classifier, X, fill=False, ax=None, eps=None, alpha=1, X1, X2 = np.meshgrid(xx, yy) X_grid = np.c_[X1.ravel(), X2.ravel()] if hasattr(classifier, "decision_function"): - decision_values = _call_classifier_chunked(classifier.decision_function, X_grid) + decision_values = _call_classifier_chunked(classifier.decision_function, + X_grid) levels = [0] if threshold is None else [threshold] fill_levels = [decision_values.min()] + levels + [ decision_values.max()] else: # no decision_function - decision_values = _call_classifier_chunked(classifier.predict_proba, X_grid)[:, 1] + decision_values = _call_classifier_chunked(classifier.predict_proba, + X_grid)[:, 1] levels = [.5] if threshold is None else [threshold] fill_levels = [0] + levels + [1] if fill: @@ -138,7 +140,7 @@ def plot_2d_separator(classifier, X, fill=False, ax=None, eps=None, alpha=1, from sklearn.datasets import make_blobs from sklearn.linear_model import LogisticRegression X, y = make_blobs(centers=2, random_state=42) - clf = LogisticRegression().fit(X, y) + clf = LogisticRegression(solver='lbfgs').fit(X, y) plot_2d_separator(clf, X, fill=True) discrete_scatter(X[:, 0], X[:, 1], y) plt.show()