Skip to content

improper plots #122

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions src/rasbt_ml_book_code/rasbt_ch03.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@
- more importantly, they can degrade the predictive performance of many machine learning algorithms. Unscaled data can also slow down or even prevent the convergence of many gradient-based estimators.
- more importantly that all features vary on comparable scales.
-
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#plot-all-scaling-standard-scaler-section
- https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
- https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#plot-all-scaling-standard-scaler-section

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler.transform
- https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler.transform
```
transform(X[, copy])
Perform standardization by centering and scaling.
```
"""
#
"""
Expand Down
341 changes: 213 additions & 128 deletions src/rasbt_ml_book_code/scikitLearn_std_scalar_1.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,139 +30,224 @@
# Debug output. NOTE(review): the label says "keys()" but the value printed is
# the dataset object's `frame` attribute (the author's trailing note records
# that it printed None).
print("[INFO]--df_calif_h.keys()--",df_calif_h.frame) # None

# Full feature matrix and target taken from the loaded dataset object.
X_full, y_full = df_calif_h.data, df_calif_h.target

feature_names = df_calif_h.feature_names
print("[INFO]--df_calif---feature_names--",feature_names) # - ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']

# Commented-out copy of the mapping that is defined (live) further below.
# feature_mapping = {
# "MedInc": "Median income in block",
# "HouseAge": "Median house age in block",
# "AveRooms": "Average number of rooms",
# "AveBedrms": "Average number of bedrooms",
# "Population": "Block population",
# "AveOccup": "Average house occupancy",
# "Latitude": "House block latitude",
# "Longitude": "House block longitude",
# }
# NOTE(review): this repeats the feature_names print a few lines above, so the
# same line is emitted twice.
print("[INFO]--df_calif---feature_names--",feature_names)
# - ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']

# Human-readable descriptions keyed by feature name; make_plot() looks up the
# two selected features here to label the scatter-plot axes.
feature_mapping = {
    "MedInc": "Median income in block",
    "HouseAge": "Median house age in block",
    "AveRooms": "Average number of rooms",
    "AveBedrms": "Average number of bedrooms",
    "Population": "Block population",
    "AveOccup": "Average house occupancy",
    "Latitude": "House block latitude",
    "Longitude": "House block longitude",
}

# # Take only 2 features to make visualization easier
# # Feature MedInc has a long tail distribution.
# # Feature AveOccup has a few but very large outliers.

# features = ["MedInc", "AveOccup"]
# features_idx = [feature_names.index(feature) for feature in features]
# X = X_full[:, features_idx]
# distributions = [
# ("Unscaled data", X),
# ("Data after standard scaling", StandardScaler().fit_transform(X)),
# ("Data after min-max scaling", MinMaxScaler().fit_transform(X)),
# ("Data after max-abs scaling", MaxAbsScaler().fit_transform(X)),
# (
# "Data after robust scaling",
# RobustScaler(quantile_range=(25, 75)).fit_transform(X),
# ),
# (
# "Data after power transformation (Yeo-Johnson)",
# PowerTransformer(method="yeo-johnson").fit_transform(X),
# ),
# (
# "Data after power transformation (Box-Cox)",
# PowerTransformer(method="box-cox").fit_transform(X),
# ),
# (
# "Data after quantile transformation (uniform pdf)",
# QuantileTransformer(
# output_distribution="uniform", random_state=42
# ).fit_transform(X),
# ),
# (
# "Data after quantile transformation (gaussian pdf)",
# QuantileTransformer(
# output_distribution="normal", random_state=42
# ).fit_transform(X),
# ),
# ("Data after sample-wise L2 normalizing", Normalizer().fit_transform(X)),
# ]

# # scale the output between 0 and 1 for the colorbar
# y = minmax_scale(y_full)
# Keep only two columns so that every distribution can be shown as a 2-D
# scatter plot.
features = ["MedInc", "AveOccup"]
features_idx = list(map(feature_names.index, features))
print("--features_idx--",features_idx)
print("--features_idx----X_full.shape-",X_full.shape)
X = X_full[:, features_idx]

print("--features_idx---type(X)-",type(X))
print("--features_idx---X.shape-",X.shape) #---X.shape- (20640, 2)


# (title, data) pairs, indexed by position in make_plot(). The first entry is
# the untouched data; every following entry is the same two columns after
# fit_transform() with the transformer listed next to its title.
distributions = [
    ("Unscaled data", X),
    *(
        (label, transformer.fit_transform(X))
        for label, transformer in (
            ("Data after standard scaling", StandardScaler()),
            ("Data after min-max scaling", MinMaxScaler()),
            ("Data after max-abs scaling", MaxAbsScaler()),
            (
                "Data after robust scaling",
                RobustScaler(quantile_range=(25, 75)),
            ),
            (
                "Data after power transformation (Yeo-Johnson)",
                PowerTransformer(method="yeo-johnson"),
            ),
            (
                "Data after power transformation (Box-Cox)",
                PowerTransformer(method="box-cox"),
            ),
            (
                "Data after quantile transformation (uniform pdf)",
                QuantileTransformer(output_distribution="uniform", random_state=42),
            ),
            (
                "Data after quantile transformation (gaussian pdf)",
                QuantileTransformer(output_distribution="normal", random_state=42),
            ),
            ("Data after sample-wise L2 normalizing", Normalizer()),
        )
    ),
]

# Target rescaled to [0, 1] so it can be fed straight into the colormap.
y = minmax_scale(y_full)

# # plasma does not exist in matplotlib < 1.5
# cmap = getattr(cm, "plasma_r", cm.hot_r)


# def create_axes(title, figsize=(16, 6)):
# fig = plt.figure(figsize=figsize)
# fig.suptitle(title)

# # define the axis for the first plot
# left, width = 0.1, 0.22
# bottom, height = 0.1, 0.7
# bottom_h = height + 0.15
# left_h = left + width + 0.02

# rect_scatter = [left, bottom, width, height]
# rect_histx = [left, bottom_h, width, 0.1]
# rect_histy = [left_h, bottom, 0.05, height]

# ax_scatter = plt.axes(rect_scatter)
# ax_histx = plt.axes(rect_histx)
# ax_histy = plt.axes(rect_histy)

# # define the axis for the zoomed-in plot
# left = width + left + 0.2
# left_h = left + width + 0.02

# rect_scatter = [left, bottom, width, height]
# rect_histx = [left, bottom_h, width, 0.1]
# rect_histy = [left_h, bottom, 0.05, height]

# ax_scatter_zoom = plt.axes(rect_scatter)
# ax_histx_zoom = plt.axes(rect_histx)
# ax_histy_zoom = plt.axes(rect_histy)

# # define the axis for the colorbar
# left, width = width + left + 0.13, 0.01

# rect_colorbar = [left, bottom, width, height]
# ax_colorbar = plt.axes(rect_colorbar)

# return (
# (ax_scatter, ax_histy, ax_histx),
# (ax_scatter_zoom, ax_histy_zoom, ax_histx_zoom),
# ax_colorbar,
# )


# def plot_distribution(axes, X, y, hist_nbins=50, title="", x0_label="", x1_label=""):
# ax, hist_X1, hist_X0 = axes

# ax.set_title(title)
# ax.set_xlabel(x0_label)
# ax.set_ylabel(x1_label)

# # The scatter plot
# colors = cmap(y)
# ax.scatter(X[:, 0], X[:, 1], alpha=0.5, marker="o", s=5, lw=0, c=colors)

# # Removing the top and the right spine for aesthetics
# # make nice axis layout
# ax.spines["top"].set_visible(False)
# ax.spines["right"].set_visible(False)
# ax.get_xaxis().tick_bottom()
# ax.get_yaxis().tick_left()
# ax.spines["left"].set_position(("outward", 10))
# ax.spines["bottom"].set_position(("outward", 10))

# # Histogram for axis X1 (feature 5)
# hist_X1.set_ylim(ax.get_ylim())
# hist_X1.hist(
# X[:, 1], bins=hist_nbins, orientation="horizontal", color="grey", ec="grey"
# )
# hist_X1.axis("off")

# # Histogram for axis X0 (feature 0)
# hist_X0.set_xlim(ax.get_xlim())
# hist_X0.hist(
# X[:, 0], bins=hist_nbins, orientation="vertical", color="grey", ec="grey"
# )
# hist_X0.axis("off")
# Colormap applied to the rescaled target when coloring scatter points and
# when drawing the colorbar. Uses `cm.plasma_r` when that attribute exists and
# falls back to `cm.hot_r` otherwise (the previous revision's comment notes
# that plasma does not exist in matplotlib < 1.5).
# NOTE(review): `cm` is presumably matplotlib.cm — the import is outside this view.
cmap = getattr(cm, "plasma_r", cm.hot_r)


def _add_scatter_with_marginals(left, bottom, width, height):
    """Add one scatter axes with its two marginal histogram axes.

    Rectangles are ``[left, bottom, width, height]`` as accepted by
    ``plt.axes``. The X-histogram strip sits above the scatter axes and the
    Y-histogram strip sits to its right.

    Returns
    -------
    tuple
        ``(ax_scatter, ax_histy, ax_histx)``.
    """
    bottom_h = height + 0.15
    left_h = left + width + 0.02

    # Creation order (scatter, histx, histy) matches the original code.
    ax_scatter = plt.axes([left, bottom, width, height])
    ax_histx = plt.axes([left, bottom_h, width, 0.1])
    ax_histy = plt.axes([left_h, bottom, 0.05, height])
    return ax_scatter, ax_histy, ax_histx


def create_axes(title, figsize=(16, 6)):
    """Create the figure and axes used to display one distribution.

    From left to right the figure holds: a scatter axes with marginal
    histograms for the full data, a second such group for the zoomed-in view,
    and a thin vertical axes for the colorbar.

    The previous version had the rectangle assignments for the zoomed-in
    group commented out, so the zoom axes were created with the first group's
    rectangles and were drawn on top of the full-data axes. Each group now
    gets rectangles computed from its own ``left`` offset.

    Parameters
    ----------
    title : str
        Figure-level title, passed to ``fig.suptitle``.
    figsize : tuple, default=(16, 6)
        Forwarded to ``plt.figure``.

    Returns
    -------
    tuple
        ``((ax_scatter, ax_histy, ax_histx),
        (ax_scatter_zoom, ax_histy_zoom, ax_histx_zoom),
        ax_colorbar)``.
    """
    fig = plt.figure(figsize=figsize)
    fig.suptitle(title)

    # Geometry shared by both scatter groups.
    left, width = 0.1, 0.22
    bottom, height = 0.1, 0.7

    # Group for the first (full data) plot.
    full_axes = _add_scatter_with_marginals(left, bottom, width, height)

    # Group for the zoomed-in plot, shifted to the right of the first group.
    left = width + left + 0.2
    zoom_axes = _add_scatter_with_marginals(left, bottom, width, height)

    # Narrow axes for the colorbar, to the right of the zoomed-in group.
    left, width = width + left + 0.13, 0.01
    ax_colorbar = plt.axes([left, bottom, width, height])

    return full_axes, zoom_axes, ax_colorbar


def plot_distribution(axes, X, y, hist_nbins=50, title="", x0_label="", x1_label=""):
    """Draw a colored scatter plot of two columns plus marginal histograms.

    Parameters
    ----------
    axes : sequence of three axes
        Unpacked as (scatter axes, axes for the ``X[:, 1]`` histogram,
        axes for the ``X[:, 0]`` histogram).
    X : array
        Two-column data; column 0 is plotted on x, column 1 on y.
    y : array
        Values passed to the module-level ``cmap`` to obtain point colors.
    hist_nbins : int, default=50
        Number of bins for both histograms.
    title, x0_label, x1_label : str
        Title and axis labels for the scatter axes.
    """
    print("[INFO]---plot_distribution-axes---",axes)
    print("[INFO]---plot_distribution-y---",y)

    scatter_ax, col1_hist_ax, col0_hist_ax = axes

    scatter_ax.set_title(title)
    print("[INFO]---plot_distribution----title--",title)
    scatter_ax.set_xlabel(x0_label)
    scatter_ax.set_ylabel(x1_label)

    # Scatter plot, one color per sample.
    point_colors = cmap(y)
    scatter_ax.scatter(
        X[:, 0],
        X[:, 1],
        alpha=0.5,
        marker="o",
        s=5,
        lw=0,
        c=point_colors,
    )

    # Cosmetic layout: hide the top/right spines, keep ticks on the
    # bottom/left only, and push the remaining spines outward.
    for hidden_side in ("top", "right"):
        scatter_ax.spines[hidden_side].set_visible(False)
    scatter_ax.get_xaxis().tick_bottom()
    scatter_ax.get_yaxis().tick_left()
    for shifted_side in ("left", "bottom"):
        scatter_ax.spines[shifted_side].set_position(("outward", 10))

    # Horizontal histogram of column 1, sharing the scatter plot's y-range.
    col1_hist_ax.set_ylim(scatter_ax.get_ylim())
    col1_hist_ax.hist(
        X[:, 1],
        bins=hist_nbins,
        orientation="horizontal",
        color="grey",
        ec="grey",
    )
    col1_hist_ax.axis("off")

    print("---plot_distribution---type(hist_X1---",type(col1_hist_ax))

    # Vertical histogram of column 0, sharing the scatter plot's x-range.
    col0_hist_ax.set_xlim(scatter_ax.get_xlim())
    col0_hist_ax.hist(
        X[:, 0],
        bins=hist_nbins,
        orientation="vertical",
        color="grey",
        ec="grey",
    )
    col0_hist_ax.axis("off")
    print("---plot_distribution---type(hist_X0---",type(col0_hist_ax))


def make_plot(item_idx):
    """Render one entry of ``distributions`` as a full view and a zoomed view.

    Parameters
    ----------
    item_idx : int
        Index into the module-level ``distributions`` list of
        ``(title, data)`` pairs.

    The figure gets three parts: the full data, the samples strictly inside
    the 0th-99th percentile range of both columns, and a colorbar spanning
    the range of ``y_full``. Nothing is returned; the caller is expected to
    call ``plt.show()``.
    """
    print("--make_plot-making---")
    title, X = distributions[item_idx]
    print("--make_plot-----title-",title)

    full_axes, zoom_axes, ax_colorbar = create_axes(title)
    axis_labels = {
        "x0_label": feature_mapping[features[0]],
        "x1_label": feature_mapping[features[1]],
    }

    plot_distribution(
        full_axes, X, y, hist_nbins=200, title="Full data", **axis_labels
    )

    # Zoom-in: per-column percentile bounds, then keep the rows that lie
    # strictly between the bounds in both columns.
    zoom_in_percentile_range = (0, 99)
    lower_0, upper_0 = np.percentile(X[:, 0], zoom_in_percentile_range)
    lower_1, upper_1 = np.percentile(X[:, 1], zoom_in_percentile_range)

    above_lower = np.all(X > [lower_0, lower_1], axis=1)
    below_upper = np.all(X < [upper_0, upper_1], axis=1)
    non_outliers_mask = above_lower & below_upper
    print("--make_plot-----non_outliers_mask-",type(non_outliers_mask))
    print("--make_plot-----non_outliers_mask.shape---",non_outliers_mask.shape)

    plot_distribution(
        zoom_axes,
        X[non_outliers_mask],
        y[non_outliers_mask],
        hist_nbins=50,
        title="Zoom-in",
        **axis_labels,
    )

    # Colorbar labelled with the original (unscaled) target range.
    mpl.colorbar.ColorbarBase(
        ax_colorbar,
        cmap=cmap,
        norm=mpl.colors.Normalize(y_full.min(), y_full.max()),
        orientation="vertical",
        label="Color mapping for values of y",
    )


# Script entry point: render a single distribution and display it.
if __name__ == "__main__":
    # Only index 1 of `distributions` ("Data after standard scaling") is
    # plotted; the commented-out calls are other indices, left disabled with
    # the author's "OK" markers.
    #make_plot(0)
    #plt.show() #OK_1
    make_plot(1)
    plt.show() # OK_2
    # make_plot(2)
    # plt.show() # OK_2