
Commit 6a65d9d

Author: Xing
Merge pull request #2275 from xhlulu/ml-docs
Move the PR to plotly
2 parents 46cd47f + 77a3d82

File tree: 5 files changed, 1097 additions, 0 deletions

doc/python/ml-knn.md

251 additions, 0 deletions

@@ -0,0 +1,251 @@
---
jupyter:
  jupytext:
    notebook_metadata_filter: all
    text_representation:
      extension: .md
      format_name: markdown
      format_version: '1.1'
      jupytext_version: 1.1.1
  kernelspec:
    display_name: Python 3
    language: python
    name: python3
  language_info:
    codemirror_mode:
      name: ipython
      version: 3
    file_extension: .py
    mimetype: text/x-python
    name: python
    nbconvert_exporter: python
    pygments_lexer: ipython3
    version: 3.7.6
  plotly:
    description: Visualize scikit-learn's k-Nearest Neighbors (kNN) classification
      with Plotly
    display_as: ai_ml
    language: python
    layout: base
    name: kNN Classification
    order: 1
    page_type: example_index
    permalink: python/knn-classification/
    thumbnail: thumbnail/knn-classification.png
---

## Basic Binary Classification with `plotly.express`

```python
import plotly.express as px
from sklearn.datasets import make_moons
from sklearn.neighbors import KNeighborsClassifier

X, y = make_moons(noise=0.3, random_state=0)
X_test, _ = make_moons(noise=0.3, random_state=1)

clf = KNeighborsClassifier(15)
clf.fit(X, y.astype(str))  # Fit on training set; str labels give a discrete color legend
y_pred = clf.predict(X_test)  # Predict on new data

fig = px.scatter(x=X_test[:, 0], y=X_test[:, 1], color=y_pred, labels={'color': 'predicted'})
fig.update_traces(marker_size=10)
fig.show()
```
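
As a quick sanity check on the fit (our addition, not part of the original example), you can keep the labels that `make_moons` generates for the second draw and score the classifier against them; `y_test` and the use of `accuracy_score` below are assumptions of this sketch:

```python
from sklearn.datasets import make_moons
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

X, y = make_moons(noise=0.3, random_state=0)
X_test, y_test = make_moons(noise=0.3, random_state=1)  # keep the true labels this time

clf = KNeighborsClassifier(15)
clf.fit(X, y.astype(str))
y_pred = clf.predict(X_test)

# Labels were cast to str for fitting, so compare against str labels too
print(accuracy_score(y_test.astype(str), y_pred))
```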

## Visualize Binary Prediction Scores

```python
import plotly.express as px
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier

X, y = make_classification(n_features=2, n_redundant=0, random_state=0)
X_test, _ = make_classification(n_features=2, n_redundant=0, random_state=1)

clf = KNeighborsClassifier(15)
clf.fit(X, y)  # Fit on training set
y_score = clf.predict_proba(X_test)[:, 1]  # Probability of the positive class on new data

fig = px.scatter(x=X_test[:, 0], y=X_test[:, 1], color=y_score, labels={'color': 'score'})
fig.update_traces(marker_size=10)
fig.show()
```
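
If you also need hard class labels rather than scores, thresholding works; the 0.5 cutoff below is an assumption (and `clf.predict(X_test)` gives you the same thing directly). Continuing from the cell above:

```python
# Hard 0/1 predictions from the scores, assuming a 0.5 decision threshold
y_pred = (y_score > 0.5).astype(int)
```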

## Probability Estimates with `go.Contour`

```python
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import make_moons
from sklearn.neighbors import KNeighborsClassifier

mesh_size = .02
margin = 1

X, y = make_moons(noise=0.3, random_state=0)

# Create a mesh grid on which we will run our model
x_min, x_max = X[:, 0].min() - margin, X[:, 0].max() + margin
y_min, y_max = X[:, 1].min() - margin, X[:, 1].max() + margin
xrange = np.arange(x_min, x_max, mesh_size)
yrange = np.arange(y_min, y_max, mesh_size)
xx, yy = np.meshgrid(xrange, yrange)

# Create classifier, run predictions on grid
clf = KNeighborsClassifier(15, weights='uniform')
clf.fit(X, y)
Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
Z = Z.reshape(xx.shape)

fig = px.scatter(X, x=0, y=1, color=y.astype(str), labels={'0': '', '1': ''})
fig.update_traces(marker_size=10, marker_line_width=1)
fig.add_trace(
    go.Contour(
        x=xrange,
        y=yrange,
        z=Z,
        showscale=False,
        colorscale=['Blue', 'Red'],
        opacity=0.4,
        name='Confidence'
    )
)
fig.show()
```
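
As a variant (our addition, not in the original), swapping `predict_proba` for `predict` on the same grid shades the hard decision regions instead of the probability surface; pass `Z_hard` to `go.Contour` in place of `Z`:

```python
# Continuing from the cell above: hard 0/1 class labels on the same mesh grid
Z_hard = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
```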

## Multi-class Prediction Confidence with `go.Heatmap`

```python
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.neighbors import KNeighborsClassifier

mesh_size = .02
margin = 1

# We will use the iris data, which is included in px
df = px.data.iris()
X = df[['sepal_length', 'sepal_width']]
y = df.species_id

# Create a mesh grid on which we will run our model
l_min, l_max = df.sepal_length.min() - margin, df.sepal_length.max() + margin
w_min, w_max = df.sepal_width.min() - margin, df.sepal_width.max() + margin
lrange = np.arange(l_min, l_max, mesh_size)
wrange = np.arange(w_min, w_max, mesh_size)
ll, ww = np.meshgrid(lrange, wrange)

# Create classifier, run predictions on grid
clf = KNeighborsClassifier(15, weights='distance')
clf.fit(X, y)
Z = clf.predict(np.c_[ll.ravel(), ww.ravel()])
Z = Z.reshape(ll.shape)
proba = clf.predict_proba(np.c_[ll.ravel(), ww.ravel()])
proba = proba.reshape(ll.shape + (3,))

fig = px.scatter(df, x='sepal_length', y='sepal_width', color='species', width=1000, height=1000)
fig.update_traces(marker_size=10, marker_line_width=1)
fig.add_trace(
    go.Heatmap(
        x=lrange,
        y=wrange,
        z=Z,
        showscale=False,
        colorscale=[[0.0, 'blue'], [0.5, 'red'], [1.0, 'green']],
        opacity=0.25,
        customdata=proba,
        hovertemplate=(
            'sepal length: %{x} <br>'
            'sepal width: %{y} <br>'
            'p(setosa): %{customdata[0]:.3f}<br>'
            'p(versicolor): %{customdata[1]:.3f}<br>'
            'p(virginica): %{customdata[2]:.3f}<extra></extra>'
        )
    )
)
fig.show()
```

## 3D Classification with `px.scatter_3d`

```python
import numpy as np
import plotly.express as px
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

df = px.data.iris()
features = ["sepal_width", "sepal_length", "petal_width"]

X = df[features]
y = df.species
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Create classifier, fit on the training set, predict on the test split
clf = KNeighborsClassifier(15, weights='distance')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_score = clf.predict_proba(X_test)
y_score = np.around(y_score.max(axis=1), 4)  # Confidence = probability of the predicted class

fig = px.scatter_3d(
    X_test,
    x='sepal_length',
    y='sepal_width',
    z='petal_width',
    symbol=y_pred,
    color=y_score,
    labels={'symbol': 'prediction', 'color': 'score'}
)
fig.update_layout(legend=dict(x=0, y=0))
fig.show()
```

## High Dimension Visualization with `px.scatter_matrix`

If you need to visualize classifications in more than three dimensions, you can use the [scatter plot matrix](https://plot.ly/python/splom/).

```python
import plotly.express as px
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

df = px.data.iris()
features = ["sepal_width", "sepal_length", "petal_width", "petal_length"]

X = df[features]
y = df.species
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Create classifier, fit on the training set, predict on the test split
clf = KNeighborsClassifier(15, weights='distance')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

fig = px.scatter_matrix(X_test, dimensions=features, color=y_pred, labels={'color': 'prediction'})
fig.show()
```

### Reference

Learn more about `px`, `go.Contour`, and `go.Heatmap` here:

* https://plot.ly/python/plotly-express/
* https://plot.ly/python/heatmaps/
* https://plot.ly/python/contour-plots/
* https://plot.ly/python/3d-scatter-plots/
* https://plot.ly/python/splom/

This tutorial was inspired by amazing examples from the official scikit-learn docs:

* https://scikit-learn.org/stable/auto_examples/neighbors/plot_classification.html
* https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
* https://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html

doc/python/ml-pca.md

135 additions, 0 deletions

@@ -0,0 +1,135 @@
---
jupyter:
  jupytext:
    notebook_metadata_filter: all
    text_representation:
      extension: .md
      format_name: markdown
      format_version: '1.1'
      jupytext_version: 1.1.1
  kernelspec:
    display_name: Python 3
    language: python
    name: python3
  language_info:
    codemirror_mode:
      name: ipython
      version: 3
    file_extension: .py
    mimetype: text/x-python
    name: python
    nbconvert_exporter: python
    pygments_lexer: ipython3
    version: 3.7.6
  plotly:
    description: Visualize Principal Component Analysis (PCA) of your high-dimensional
      data with Plotly in Python.
    display_as: ai_ml
    language: python
    layout: base
    name: PCA Visualization
    order: 4
    page_type: example_index
    permalink: python/pca-visualization/
    thumbnail: thumbnail/ml-pca.png
---

## Basic PCA Scatter Plot

This example shows how to visualize the first two principal components of a PCA, reducing a 4-dimensional dataset to 2D. It uses scikit-learn's `PCA`.

```python
import plotly.express as px
from sklearn.decomposition import PCA

df = px.data.iris()
X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]

pca = PCA(n_components=2)
components = pca.fit_transform(X)

fig = px.scatter(x=components[:, 0], y=components[:, 1], color=df['species'])
fig.show()
```
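
To see how much of the variance those two components actually capture (a small addition to the example), the fitted estimator exposes the ratios directly:

```python
# Continuing from the cell above: fraction of total variance per component
print(pca.explained_variance_ratio_)  # for iris, roughly [0.92, 0.05]
```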

## Visualize PCA with `px.scatter_3d`

Just like the basic PCA plot, this lets you visualize the first three principal components. The title additionally reports the total variance explained by those components.

```python
import plotly.express as px
from sklearn.decomposition import PCA

df = px.data.iris()
X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]

pca = PCA(n_components=3)
components = pca.fit_transform(X)

total_var = pca.explained_variance_ratio_.sum() * 100

fig = px.scatter_3d(
    x=components[:, 0], y=components[:, 1], z=components[:, 2],
    color=df['species'],
    title=f'Total Explained Variance: {total_var:.2f}%',
    labels={'x': 'PC 1', 'y': 'PC 2', 'z': 'PC 3'},
)
fig.show()
```
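
One detail worth knowing (our addition): the fitted `PCA` can project new samples into the same component space via `transform`, e.g. to overlay held-out points later. The sample values below are hypothetical:

```python
import numpy as np

# Continuing from the cell above: map a hypothetical iris-like measurement
# (sepal_length, sepal_width, petal_length, petal_width) into PC space
new_point = np.array([[5.1, 3.5, 1.4, 0.2]])
print(pca.transform(new_point))  # coordinates along (PC 1, PC 2, PC 3)
```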

## Plot high-dimensional components with `px.scatter_matrix`

If you need to visualize more than 3 dimensions, you can use scatter plot matrices.

```python
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.datasets import load_boston

boston = load_boston()
df = pd.DataFrame(boston.data, columns=boston.feature_names)

pca = PCA(n_components=5)
components = pca.fit_transform(df)

total_var = pca.explained_variance_ratio_.sum() * 100

labels = {str(i): f"PC {i+1}" for i in range(5)}
labels['color'] = 'Median Price'

fig = px.scatter_matrix(
    components,
    color=boston.target,
    dimensions=range(5),
    labels=labels,
    title=f'Total Explained Variance: {total_var:.2f}%',
)
fig.update_traces(diagonal_visible=False)
fig.show()
```

## Plotting explained variance

Often, you might be interested in seeing how much variance the PCA is able to explain as you increase the number of components, in order to decide how many dimensions to ultimately keep or analyze. This example shows you how to quickly plot the cumulative sum of explained variance for a high-dimensional dataset like [Diabetes](https://scikit-learn.org/stable/datasets/index.html#diabetes-dataset).

```python
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.datasets import load_diabetes

diabetes = load_diabetes()
df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)

pca = PCA()
pca.fit(df)
exp_var_cumul = np.cumsum(pca.explained_variance_ratio_)

fig = px.area(
    x=range(1, exp_var_cumul.shape[0] + 1),
    y=exp_var_cumul,
    labels={"x": "# Components", "y": "Explained Variance"}
)
fig.show()
```
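
Rather than reading the number of components off the curve, scikit-learn can also pick it for you: passing a float in (0, 1) as `n_components` keeps the smallest number of components whose cumulative explained variance reaches that fraction. A minimal sketch, continuing from the cell above:

```python
# Keep enough components to explain at least 95% of the variance
pca_95 = PCA(n_components=0.95)
pca_95.fit(df)
print(pca_95.n_components_)  # number of components actually retained
```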

## Visualize loadings
