Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revised chunking implemented for Issue #67 for improved memory management #106

Merged
merged 2 commits into from
Dec 13, 2018
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 35 additions & 5 deletions mglearn/plot_2d_separator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,34 @@
import matplotlib.pyplot as plt
from .plot_helpers import cm2, cm3, discrete_scatter

def _call_classifier_chunked(classifier_pred_or_decide, X):
# The chunk_size is used to chunk the large arrays to work with x86
# memory models that are restricted to < 2 GB in memory allocation. The
# chunk_size value used here is based on a measurement with the
# MLPClassifier using the following parameters:
# MLPClassifier(solver='lbfgs', random_state=0,
# hidden_layer_sizes=[1000,1000,1000])
# by reducing the value it is possible to trade in time for memory.
# It is possible to chunk the array as the calculations are independent of
# each other.
# Note: an intermittent version made a distinction between
# 32- and 64 bit architectures avoiding the chunking. Testing revealed
# that even on 64 bit architectures the chunking increases the
# performance by a factor of 3-5, largely due to the avoidance of memory
# swapping.
chunk_size = 10000

# We use a list to collect all result chunks
Y_result_chunks = []

# Call the classifier in chunks.
for x_chunk in np.array_split(X, np.arange(chunk_size, X.shape[0],
chunk_size, dtype=np.int32),
axis=0):
Y_result_chunks.append(classifier_pred_or_decide(x_chunk))

return np.concatenate(Y_result_chunks)


def plot_2d_classification(classifier, X, fill=False, ax=None, eps=None,
alpha=1, cm=cm3):
Expand Down Expand Up @@ -82,14 +110,16 @@ def plot_2d_separator(classifier, X, fill=False, ax=None, eps=None, alpha=1,

X1, X2 = np.meshgrid(xx, yy)
X_grid = np.c_[X1.ravel(), X2.ravel()]
try:
decision_values = classifier.decision_function(X_grid)
if hasattr(classifier, "decision_function"):
decision_values = _call_classifier_chunked(classifier.decision_function,
X_grid)
levels = [0] if threshold is None else [threshold]
fill_levels = [decision_values.min()] + levels + [
decision_values.max()]
except AttributeError:
else:
# no decision_function
decision_values = classifier.predict_proba(X_grid)[:, 1]
decision_values = _call_classifier_chunked(classifier.predict_proba,
X_grid)[:, 1]
levels = [.5] if threshold is None else [threshold]
fill_levels = [0] + levels + [1]
if fill:
Expand All @@ -110,7 +140,7 @@ def plot_2d_separator(classifier, X, fill=False, ax=None, eps=None, alpha=1,
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
X, y = make_blobs(centers=2, random_state=42)
clf = LogisticRegression().fit(X, y)
clf = LogisticRegression(solver='lbfgs').fit(X, y)
plot_2d_separator(clf, X, fill=True)
discrete_scatter(X[:, 0], X[:, 1], y)
plt.show()