Skip to content

Commit 546a6c5

Browse files
authored
Merge pull request #122 from RohitDhankar/dev_torn
improper plots
2 parents d51be15 + 841523d commit 546a6c5

File tree

2 files changed

+218
-131
lines changed

2 files changed

+218
-131
lines changed

src/rasbt_ml_book_code/rasbt_ch03.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,14 @@
55
- more importantly, they can degrade the predictive performance of many machine learning algorithms. Unscaled data can also slow down or even prevent the convergence of many gradient-based estimators.
66
- more importantly that all features vary on comparable scales.
77
-
8-
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
9-
https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#plot-all-scaling-standard-scaler-section
8+
- https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
9+
- https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#plot-all-scaling-standard-scaler-section
1010
11-
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler.transform
11+
- https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler.transform
12+
```
1213
transform(X[, copy])
1314
Perform standardization by centering and scaling.
15+
```
1416
"""
1517
#
1618
"""

src/rasbt_ml_book_code/scikitLearn_std_scalar_1.py

+213-128
Original file line numberDiff line numberDiff line change
@@ -30,139 +30,224 @@
3030
print("[INFO]--df_calif_h.keys()--",df_calif_h.frame) # None
3131

3232
X_full, y_full = df_calif_h.data, df_calif_h.target
33+
3334
feature_names = df_calif_h.feature_names
34-
print("[INFO]--df_calif---feature_names--",feature_names) # - ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
35-
36-
# feature_mapping = {
37-
# "MedInc": "Median income in block",
38-
# "HouseAge": "Median house age in block",
39-
# "AveRooms": "Average number of rooms",
40-
# "AveBedrms": "Average number of bedrooms",
41-
# "Population": "Block population",
42-
# "AveOccup": "Average house occupancy",
43-
# "Latitude": "House block latitude",
44-
# "Longitude": "House block longitude",
45-
# }
35+
# Log the dataset's column names; the run recorded in the original comment
# showed:
# ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms',
#  'Population', 'AveOccup', 'Latitude', 'Longitude']
print("[INFO]--df_calif---feature_names--", feature_names)

# Human-readable axis labels, keyed by the dataset's short column names.
feature_mapping = {
    "MedInc": "Median income in block",
    "HouseAge": "Median house age in block",
    "AveRooms": "Average number of rooms",
    "AveBedrms": "Average number of bedrooms",
    "Population": "Block population",
    "AveOccup": "Average house occupancy",
    "Latitude": "House block latitude",
    "Longitude": "House block longitude",
}
4648

4749
# Take only 2 features to make visualization easier.
4850
# Feature MedInc has a long-tailed distribution.
4951
# Feature AveOccup has a few but very large outliers.
5052

51-
# features = ["MedInc", "AveOccup"]
52-
# features_idx = [feature_names.index(feature) for feature in features]
53-
# X = X_full[:, features_idx]
54-
# distributions = [
55-
# ("Unscaled data", X),
56-
# ("Data after standard scaling", StandardScaler().fit_transform(X)),
57-
# ("Data after min-max scaling", MinMaxScaler().fit_transform(X)),
58-
# ("Data after max-abs scaling", MaxAbsScaler().fit_transform(X)),
59-
# (
60-
# "Data after robust scaling",
61-
# RobustScaler(quantile_range=(25, 75)).fit_transform(X),
62-
# ),
63-
# (
64-
# "Data after power transformation (Yeo-Johnson)",
65-
# PowerTransformer(method="yeo-johnson").fit_transform(X),
66-
# ),
67-
# (
68-
# "Data after power transformation (Box-Cox)",
69-
# PowerTransformer(method="box-cox").fit_transform(X),
70-
# ),
71-
# (
72-
# "Data after quantile transformation (uniform pdf)",
73-
# QuantileTransformer(
74-
# output_distribution="uniform", random_state=42
75-
# ).fit_transform(X),
76-
# ),
77-
# (
78-
# "Data after quantile transformation (gaussian pdf)",
79-
# QuantileTransformer(
80-
# output_distribution="normal", random_state=42
81-
# ).fit_transform(X),
82-
# ),
83-
# ("Data after sample-wise L2 normalizing", Normalizer().fit_transform(X)),
84-
# ]
85-
86-
# # scale the output between 0 and 1 for the colorbar
87-
# y = minmax_scale(y_full)
53+
# Restrict the data to the two columns that are plotted against each other.
features = ["MedInc", "AveOccup"]
features_idx = list(map(feature_names.index, features))
print("--features_idx--", features_idx)
print("--features_idx----X_full.shape-", X_full.shape)
X = X_full[:, features_idx]

print("--features_idx---type(X)-", type(X))
print("--features_idx---X.shape-", X.shape)  # a recorded run showed (20640, 2)

# (title, transformer) pairs; each transformer is fitted on X and applied to
# it, in this order, to build the entries that follow the unscaled data.
_transformers = (
    ("Data after standard scaling", StandardScaler()),
    ("Data after min-max scaling", MinMaxScaler()),
    ("Data after max-abs scaling", MaxAbsScaler()),
    ("Data after robust scaling", RobustScaler(quantile_range=(25, 75))),
    (
        "Data after power transformation (Yeo-Johnson)",
        PowerTransformer(method="yeo-johnson"),
    ),
    (
        "Data after power transformation (Box-Cox)",
        PowerTransformer(method="box-cox"),
    ),
    (
        "Data after quantile transformation (uniform pdf)",
        QuantileTransformer(output_distribution="uniform", random_state=42),
    ),
    (
        "Data after quantile transformation (gaussian pdf)",
        QuantileTransformer(output_distribution="normal", random_state=42),
    ),
    ("Data after sample-wise L2 normalizing", Normalizer()),
)

# Index 0 is the raw data; indices 1..9 follow the order of _transformers.
distributions = [("Unscaled data", X)]
distributions.extend(
    (label, transformer.fit_transform(X)) for label, transformer in _transformers
)

# Rescale the target so it can be fed to the colormap.
y = minmax_scale(y_full)
8897

8998
# plasma does not exist in matplotlib < 1.5
90-
# cmap = getattr(cm, "plasma_r", cm.hot_r)
91-
92-
93-
# def create_axes(title, figsize=(16, 6)):
94-
# fig = plt.figure(figsize=figsize)
95-
# fig.suptitle(title)
96-
97-
# # define the axis for the first plot
98-
# left, width = 0.1, 0.22
99-
# bottom, height = 0.1, 0.7
100-
# bottom_h = height + 0.15
101-
# left_h = left + width + 0.02
102-
103-
# rect_scatter = [left, bottom, width, height]
104-
# rect_histx = [left, bottom_h, width, 0.1]
105-
# rect_histy = [left_h, bottom, 0.05, height]
106-
107-
# ax_scatter = plt.axes(rect_scatter)
108-
# ax_histx = plt.axes(rect_histx)
109-
# ax_histy = plt.axes(rect_histy)
110-
111-
# # define the axis for the zoomed-in plot
112-
# left = width + left + 0.2
113-
# left_h = left + width + 0.02
114-
115-
# rect_scatter = [left, bottom, width, height]
116-
# rect_histx = [left, bottom_h, width, 0.1]
117-
# rect_histy = [left_h, bottom, 0.05, height]
118-
119-
# ax_scatter_zoom = plt.axes(rect_scatter)
120-
# ax_histx_zoom = plt.axes(rect_histx)
121-
# ax_histy_zoom = plt.axes(rect_histy)
122-
123-
# # define the axis for the colorbar
124-
# left, width = width + left + 0.13, 0.01
125-
126-
# rect_colorbar = [left, bottom, width, height]
127-
# ax_colorbar = plt.axes(rect_colorbar)
128-
129-
# return (
130-
# (ax_scatter, ax_histy, ax_histx),
131-
# (ax_scatter_zoom, ax_histy_zoom, ax_histx_zoom),
132-
# ax_colorbar,
133-
# )
134-
135-
136-
# def plot_distribution(axes, X, y, hist_nbins=50, title="", x0_label="", x1_label=""):
137-
# ax, hist_X1, hist_X0 = axes
138-
139-
# ax.set_title(title)
140-
# ax.set_xlabel(x0_label)
141-
# ax.set_ylabel(x1_label)
142-
143-
# # The scatter plot
144-
# colors = cmap(y)
145-
# ax.scatter(X[:, 0], X[:, 1], alpha=0.5, marker="o", s=5, lw=0, c=colors)
146-
147-
# # Removing the top and the right spine for aesthetics
148-
# # make nice axis layout
149-
# ax.spines["top"].set_visible(False)
150-
# ax.spines["right"].set_visible(False)
151-
# ax.get_xaxis().tick_bottom()
152-
# ax.get_yaxis().tick_left()
153-
# ax.spines["left"].set_position(("outward", 10))
154-
# ax.spines["bottom"].set_position(("outward", 10))
155-
156-
# # Histogram for axis X1 (feature 5)
157-
# hist_X1.set_ylim(ax.get_ylim())
158-
# hist_X1.hist(
159-
# X[:, 1], bins=hist_nbins, orientation="horizontal", color="grey", ec="grey"
160-
# )
161-
# hist_X1.axis("off")
162-
163-
# # Histogram for axis X0 (feature 0)
164-
# hist_X0.set_xlim(ax.get_xlim())
165-
# hist_X0.hist(
166-
# X[:, 0], bins=hist_nbins, orientation="vertical", color="grey", ec="grey"
167-
# )
168-
# hist_X0.axis("off")
99+
# Use the reversed "plasma" colormap when this matplotlib exposes it,
# otherwise fall back to the reversed "hot" colormap.
cmap = cm.plasma_r if hasattr(cm, "plasma_r") else cm.hot_r
100+
101+
102+
def create_axes(title, figsize=(16, 6)):
    """Create the figure and all axes needed to display one distribution.

    The figure contains two side-by-side panels and a colorbar slot:

    * a left panel for the full data,
    * a right panel for the zoomed-in view,
    * a thin axes on the far right for the colorbar.

    Each panel is made of a scatter axes plus two marginal histogram axes
    (one above the scatter, one to its right).

    Parameters
    ----------
    title : str
        Text used as the figure's super-title.
    figsize : tuple of float, default=(16, 6)
        Figure size forwarded to ``plt.figure``.

    Returns
    -------
    tuple
        ``((ax_scatter, ax_histy, ax_histx),
        (ax_scatter_zoom, ax_histy_zoom, ax_histx_zoom),
        ax_colorbar)``
    """
    fig = plt.figure(figsize=figsize)
    fig.suptitle(title)

    # Geometry shared by both panels, in figure-fraction coordinates.
    left, width = 0.1, 0.22
    bottom, height = 0.1, 0.7
    bottom_h = height + 0.15
    left_h = left + width + 0.02

    # Axes for the first (full data) panel.
    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.1]
    rect_histy = [left_h, bottom, 0.05, height]

    ax_scatter = plt.axes(rect_scatter)
    ax_histx = plt.axes(rect_histx)
    ax_histy = plt.axes(rect_histy)

    # Axes for the zoomed-in panel, shifted to the right of the first one.
    left = width + left + 0.2
    left_h = left + width + 0.02

    # The rectangles must be recomputed from the shifted offsets; reusing the
    # first panel's rectangles would stack the zoom axes on top of it.
    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.1]
    rect_histy = [left_h, bottom, 0.05, height]

    ax_scatter_zoom = plt.axes(rect_scatter)
    ax_histx_zoom = plt.axes(rect_histx)
    ax_histy_zoom = plt.axes(rect_histy)

    # Thin axes for the colorbar, to the right of the zoomed-in panel.
    left, width = width + left + 0.13, 0.01

    rect_colorbar = [left, bottom, width, height]
    ax_colorbar = plt.axes(rect_colorbar)

    return (
        (ax_scatter, ax_histy, ax_histx),
        (ax_scatter_zoom, ax_histy_zoom, ax_histx_zoom),
        ax_colorbar,
    )
147+
148+
149+
def plot_distribution(axes, X, y, hist_nbins=50, title="", x0_label="", x1_label=""):
    """Draw a colour-coded scatter plot with marginal histograms on one panel.

    Parameters
    ----------
    axes : tuple
        Three axes, unpacked in this order: the scatter axes, the axes for
        the histogram of ``X[:, 1]``, and the axes for the histogram of
        ``X[:, 0]``.
    X : array
        Data indexed as ``X[:, 0]`` (horizontal) and ``X[:, 1]`` (vertical).
    y : array
        Values passed to the module-level ``cmap`` to colour each point.
    hist_nbins : int, default=50
        Number of bins used by both marginal histograms.
    title : str, default=""
        Title of the scatter axes.
    x0_label, x1_label : str, default=""
        Labels of the scatter plot's x and y axes.
    """
    print("[INFO]---plot_distribution-axes---", axes)
    print("[INFO]---plot_distribution-y---", y)

    scatter_ax, y_hist_ax, x_hist_ax = axes

    scatter_ax.set_title(title)
    print("[INFO]---plot_distribution----title--", title)
    scatter_ax.set_xlabel(x0_label)
    scatter_ax.set_ylabel(x1_label)

    # Scatter plot, one colour per sample.
    point_colors = cmap(y)
    scatter_ax.scatter(
        X[:, 0], X[:, 1], alpha=0.5, marker="o", s=5, lw=0, c=point_colors
    )

    # Hide the top/right spines and push the remaining ones outward so the
    # frame does not touch the data.
    for side in ("top", "right"):
        scatter_ax.spines[side].set_visible(False)
    scatter_ax.get_xaxis().tick_bottom()
    scatter_ax.get_yaxis().tick_left()
    for side in ("left", "bottom"):
        scatter_ax.spines[side].set_position(("outward", 10))

    # Appearance shared by both marginal histograms.
    bar_style = {"bins": hist_nbins, "color": "grey", "ec": "grey"}

    # Histogram of the second column, aligned with the scatter's y range.
    y_hist_ax.set_ylim(scatter_ax.get_ylim())
    y_hist_ax.hist(X[:, 1], orientation="horizontal", **bar_style)
    y_hist_ax.axis("off")

    print("---plot_distribution---type(hist_X1---", type(y_hist_ax))

    # Histogram of the first column, aligned with the scatter's x range.
    x_hist_ax.set_xlim(scatter_ax.get_xlim())
    x_hist_ax.hist(X[:, 0], orientation="vertical", **bar_style)
    x_hist_ax.axis("off")
    print("---plot_distribution---type(hist_X0---", type(x_hist_ax))
195+
196+
197+
def make_plot(item_idx):
    """Plot one entry of ``distributions`` as a full view and a zoomed-in view.

    Parameters
    ----------
    item_idx : int
        Index into the module-level ``distributions`` list of
        ``(title, data)`` pairs.

    Notes
    -----
    The zoomed-in view keeps only the rows whose two features both lie
    strictly between that feature's 0th and 99th percentiles. The function
    builds the figure but does not display it; the caller decides when to
    call ``plt.show()``.
    """
    print("--make_plot-making---")
    title, X = distributions[item_idx]
    print("--make_plot-----title-", title)

    full_axes, zoom_axes, ax_colorbar = create_axes(title)

    # Both panels share the same axis labels.
    axis_labels = {
        "x0_label": feature_mapping[features[0]],
        "x1_label": feature_mapping[features[1]],
    }

    plot_distribution(
        full_axes, X, y, hist_nbins=200, title="Full data", **axis_labels
    )

    # Zoom-in: per-feature percentile bounds.
    zoom_in_percentile_range = (0, 99)
    low_x0, high_x0 = np.percentile(X[:, 0], zoom_in_percentile_range)
    low_x1, high_x1 = np.percentile(X[:, 1], zoom_in_percentile_range)

    above_lower = np.all(X > [low_x0, low_x1], axis=1)
    below_upper = np.all(X < [high_x0, high_x1], axis=1)
    non_outliers_mask = above_lower & below_upper
    print("--make_plot-----non_outliers_mask-", type(non_outliers_mask))
    print("--make_plot-----non_outliers_mask.shape---", non_outliers_mask.shape)

    plot_distribution(
        zoom_axes,
        X[non_outliers_mask],
        y[non_outliers_mask],
        hist_nbins=50,
        title="Zoom-in",
        **axis_labels,
    )

    # Colorbar labelled with the range of the unscaled target.
    norm = mpl.colors.Normalize(y_full.min(), y_full.max())
    mpl.colorbar.ColorbarBase(
        ax_colorbar,
        cmap=cmap,
        norm=norm,
        orientation="vertical",
        label="Color mapping for values of y",
    )
245+
246+
247+
if __name__ == "__main__":
    # Build and display the figure for distributions[1]
    # ("Data after standard scaling"). Pass another index to make_plot()
    # to view a different entry.
    make_plot(1)
    plt.show()

0 commit comments

Comments
 (0)