This example shows how to use plotly.express to train a simple Ordinary Least Squares (OLS) regression that can predict the tips servers will receive based on the value of the total bill.
import plotly.express as px
df = px.data.tips()
fig = px.scatter(
    df, x='total_bill', y='tip', opacity=0.65,
    trendline='ols', trendline_color_override='red'
)
fig.show()
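If you need the fitted parameters, plotly.express can hand back the underlying statsmodels results for any OLS trendline. A minimal sketch using px.get_trendline_results:
import plotly.express as px

df = px.data.tips()
fig = px.scatter(df, x='total_bill', y='tip', trendline='ols')

# Each row of the returned dataframe holds one statsmodels results object
results = px.get_trendline_results(fig)
print(results.px_fit_results.iloc[0].summary())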
You can also perform the same prediction using scikit-learn's LinearRegression.
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
df = px.data.tips()
X = df.total_bill.values.reshape(-1, 1)
model = LinearRegression()
model.fit(X, df.tip)
x_range = np.linspace(X.min(), X.max(), 100)
y_range = model.predict(x_range.reshape(-1, 1))
fig = px.scatter(df, x='total_bill', y='tip', opacity=0.65)
fig.add_traces(go.Scatter(x=x_range, y=y_range, name='Regression Fit'))
fig.show()
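If you want to confirm that the two fits agree, you can inspect scikit-learn's fitted parameters directly; a short sanity-check sketch (not part of the original example):
import plotly.express as px
from sklearn.linear_model import LinearRegression

df = px.data.tips()
X = df.total_bill.values.reshape(-1, 1)

model = LinearRegression()
model.fit(X, df.tip)

# The fitted line is tip = coef_[0] * total_bill + intercept_
print(f"slope: {model.coef_[0]:.4f}, intercept: {model.intercept_:.4f}")
print(f"R^2 on the full data: {model.score(X, df.tip):.4f}")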
Easily color your plot based on a predefined data split.
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
df = px.data.tips()
X = df.total_bill.values.reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X, df.tip, random_state=0)
model = LinearRegression()
model.fit(X_train, y_train)
x_range = np.linspace(X.min(), X.max(), 100)
y_range = model.predict(x_range.reshape(-1, 1))
fig = go.Figure([
    go.Scatter(x=X_train.squeeze(), y=y_train, name='train', mode='markers'),
    go.Scatter(x=X_test.squeeze(), y=y_test, name='test', mode='markers'),
    go.Scatter(x=x_range, y=y_range, name='prediction')
])
fig.show()
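Since the model was fit on the training split only, it's natural to score both splits and compare; a short sketch reusing the variables from the block above:
# A large gap between train and test R^2 would suggest overfitting
print(f"train R^2: {model.score(X_train, y_train):.3f}")
print(f"test R^2: {model.score(X_test, y_test):.3f}")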
Compare the performance of two different models on the same dataset. This can be easily combined with discrete color legends from px.
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.neighbors import KNeighborsRegressor
df = px.data.tips()
X = df.total_bill.values.reshape(-1, 1)
x_range = np.linspace(X.min(), X.max(), 100)
# Model #1
knn_dist = KNeighborsRegressor(10, weights='distance')
knn_dist.fit(X, df.tip)
y_dist = knn_dist.predict(x_range.reshape(-1, 1))
# Model #2
knn_uni = KNeighborsRegressor(10, weights='uniform')
knn_uni.fit(X, df.tip)
y_uni = knn_uni.predict(x_range.reshape(-1, 1))
fig = px.scatter(df, x='total_bill', y='tip', color='sex', opacity=0.65)
fig.add_traces(go.Scatter(x=x_range, y=y_uni, name='Weights: Uniform'))
fig.add_traces(go.Scatter(x=x_range, y=y_dist, name='Weights: Distance'))
fig.show()
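The same pattern extends to any number of models. For instance, here is a sketch (not from the original tutorial) sweeping the number of neighbors to see how it controls smoothness:
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.neighbors import KNeighborsRegressor

df = px.data.tips()
X = df.total_bill.values.reshape(-1, 1)
x_range = np.linspace(X.min(), X.max(), 100)

fig = px.scatter(df, x='total_bill', y='tip', opacity=0.65)
# Small k hugs the data; large k averages over more neighbors
for k in [2, 10, 50]:
    knn = KNeighborsRegressor(k, weights='uniform')
    knn.fit(X, df.tip)
    y_pred = knn.predict(x_range.reshape(-1, 1))
    fig.add_traces(go.Scatter(x=x_range, y=y_pred, name=f'k={k}'))
fig.show()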
Visualize the decision plane of your model whenever you have more than one variable in your X.
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.neighbors import KNeighborsRegressor
mesh_size = .02
margin = 0
df = px.data.iris()
X = df[['sepal_width', 'sepal_length']]
y = df['petal_width']
# Condition the model on sepal width and length, predict the petal width
knn = KNeighborsRegressor(10, weights='distance')
knn.fit(X, y)
# Create a mesh grid on which we will run our model
x_min, x_max = X.sepal_width.min() - margin, X.sepal_width.max() + margin
y_min, y_max = X.sepal_length.min() - margin, X.sepal_length.max() + margin
xrange = np.arange(x_min, x_max, mesh_size)
yrange = np.arange(y_min, y_max, mesh_size)
xx, yy = np.meshgrid(xrange, yrange)
# Run kNN
pred = knn.predict(np.c_[xx.ravel(), yy.ravel()])
pred = pred.reshape(xx.shape)
# Generate the plot
fig = px.scatter_3d(df, x='sepal_width', y='sepal_length', z='petal_width')
fig.update_traces(marker=dict(size=5))
fig.add_traces(go.Surface(x=xrange, y=yrange, z=pred, name='pred_surface'))
fig.show()
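If a full 3D view is more than you need, the same grid of predictions can be drawn as a 2D contour map; a sketch reusing xrange, yrange, and pred from the block above (the trace choice is mine, not the tutorial's):
# 2D alternative: predicted petal width as filled contours,
# with the observations used to fit the model overlaid
fig = go.Figure(go.Contour(x=xrange, y=yrange, z=pred, colorscale='Viridis'))
fig.add_traces(go.Scatter(
    x=df.sepal_width, y=df.sepal_length, mode='markers',
    marker=dict(color='black', size=4), name='observations'
))
fig.update_layout(xaxis_title='sepal_width', yaxis_title='sepal_length')
fig.show()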
It's easy to display LaTeX equations in legends and titles by simply adding $ before and after your equation.
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
def format_coefs(coefs):
    equation_list = [f"{coef}x^{i}" for i, coef in enumerate(coefs)]
    equation = "$" + " + ".join(equation_list) + "$"
    replace_map = {"x^0": "", "x^1": "x", '+ -': '- '}
    for old, new in replace_map.items():
        equation = equation.replace(old, new)
    return equation
df = px.data.tips()
X = df.total_bill.values.reshape(-1, 1)
x_range = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
fig = px.scatter(df, x='total_bill', y='tip', opacity=0.65)
for degree in [1, 2, 3, 4]:
    poly = PolynomialFeatures(degree)
    poly.fit(X)
    X_poly = poly.transform(X)
    x_range_poly = poly.transform(x_range)

    # PolynomialFeatures already adds the bias column, so skip the intercept
    model = LinearRegression(fit_intercept=False)
    model.fit(X_poly, df.tip)
    y_poly = model.predict(x_range_poly)

    equation = format_coefs(model.coef_.round(2))
    fig.add_traces(go.Scatter(x=x_range.squeeze(), y=y_poly, name=equation))
fig.show()
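As a quick check of the helper above, format_coefs turns an array of coefficients (ordered from the constant term upward) into a LaTeX string:
# Coefficients for the intercept, x, and x^2 terms
print(format_coefs([1.5, -2.0, 0.5]))  # prints: $1.5 - 2.0x + 0.5x^2$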
With more than one input variable you can no longer plot the fit directly; instead, plot the model's predictions against the actual values. Points falling on the dashed diagonal are predicted perfectly.
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
df = px.data.iris()
X = df[['sepal_width', 'sepal_length']]
y = df['petal_width']
# Condition the model on sepal width and length, predict the petal width
model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)
fig = px.scatter(x=y_pred, y=y, labels={'x': 'prediction', 'y': 'actual'})
fig.add_shape(
    type="line", line=dict(dash='dash'),
    x0=y.min(), y0=y.min(),
    x1=y.max(), y1=y.max()
)
fig.show()
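To complement the visual diagnostic, you can quantify the error with standard metrics; a short sketch reusing y and y_pred from the block above:
from sklearn.metrics import mean_squared_error, r2_score

# Numerical counterparts to the prediction error plot
print(f"MSE: {mean_squared_error(y, y_pred):.4f}")
print(f"R^2: {r2_score(y, y_pred):.4f}")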
Add marginal histograms to quickly diagnose any prediction bias your model might have. The built-in OLS functionality lets you visualize how well your model generalizes by comparing it with the theoretical optimal fit (black dashed line).
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
df = px.data.iris()
# Split data into training and test splits
train_idx, test_idx = train_test_split(df.index, test_size=.25, random_state=0)
df['split'] = 'train'
df.loc[test_idx, 'split'] = 'test'
X = df[['sepal_width', 'sepal_length']]
y = df['petal_width']
X_train = df.loc[train_idx, ['sepal_width', 'sepal_length']]
y_train = df.loc[train_idx, 'petal_width']
# Condition the model on sepal width and length, predict the petal width
model = LinearRegression()
model.fit(X_train, y_train)
df['prediction'] = model.predict(X)
fig = px.scatter(
    df, x='prediction', y='petal_width',
    marginal_x='histogram', marginal_y='histogram',
    color='split', trendline='ols'
)
fig.add_shape(
    type="line", line=dict(dash='dash'),
    x0=y.min(), y0=y.min(),
    x1=y.max(), y1=y.max()
)
fig.show()
Just like prediction error plots, it's easy to visualize your prediction residuals in just a few lines of code using plotly.express built-in capabilities.
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
df = px.data.iris()
# Split data into training and test splits
train_idx, test_idx = train_test_split(df.index, test_size=.25, random_state=0)
df['split'] = 'train'
df.loc[test_idx, 'split'] = 'test'
X = df[['sepal_width', 'sepal_length']]
X_train = df.loc[train_idx, ['sepal_width', 'sepal_length']]
y_train = df.loc[train_idx, 'petal_width']
# Condition the model on sepal width and length, predict the petal width
model = LinearRegression()
model.fit(X_train, y_train)
df['prediction'] = model.predict(X)
df['residual'] = df['prediction'] - df['petal_width']
fig = px.scatter(
    df, x='prediction', y='residual',
    marginal_y='violin',
    color='split', trendline='ols'
)
fig.show()
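The residuals can also be summarized numerically per split; a one-liner reusing df from the block above:
# Residuals should be centered near zero on both splits;
# a shifted test mean would indicate systematic bias
print(df.groupby('split')['residual'].describe())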
In this example, we show how to visualize the results of a grid search on a DecisionTreeRegressor. The first plot shows how to visualize the score of each model parameter on individual splits (grouped using facets). The second plot aggregates the results of all splits such that each box represents a single model.
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
N_FOLD = 6
# Load and shuffle dataframe
df = px.data.iris()
df = df.sample(frac=1, random_state=0)
X = df[['sepal_width', 'sepal_length']]
y = df['petal_width']
# Define and fit the grid
model = DecisionTreeRegressor()
param_grid = {
    # Criterion names for scikit-learn >= 1.0;
    # older versions used 'mse' and 'mae'
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error'],
    'max_depth': range(2, 5)
}
grid = GridSearchCV(model, param_grid, cv=N_FOLD)
grid.fit(X, y)
grid_df = pd.DataFrame(grid.cv_results_)
# Convert the wide format of the grid into the long format
# accepted by plotly.express
melted = (
    grid_df
    .rename(columns=lambda col: col.replace('param_', ''))
    .melt(
        value_vars=[f'split{i}_test_score' for i in range(N_FOLD)],
        id_vars=['mean_test_score', 'mean_fit_time', 'criterion', 'max_depth'],
        var_name="cv_split",
        value_name="r_squared"
    )
)
# Format the variable names for simplicity
melted['cv_split'] = (
    melted['cv_split']
    .str.replace('_test_score', '')
    .str.replace('split', '')
)
# Single function call to plot each figure
fig_hmap = px.density_heatmap(
    melted, x="max_depth", y='criterion',
    histfunc="sum", z="r_squared",
    title='Grid search results on individual folds',
    hover_data=['mean_fit_time'],
    facet_col="cv_split", facet_col_wrap=3,
    labels={'mean_test_score': "mean_r_squared"}
)
fig_box = px.box(
    melted, x='max_depth', y='r_squared',
    title='Grid search results',
    hover_data=['mean_fit_time'],
    points='all',
    color="criterion",
    hover_name='cv_split',
    labels={'mean_test_score': "mean_r_squared"}
)
# Display
fig_hmap.show()
fig_box.show()
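If you only want the winning configuration rather than a plot, GridSearchCV exposes it directly once fitted:
# Best hyper-parameters and the corresponding mean cross-validated R^2
print(grid.best_params_)
print(f"best mean CV R^2: {grid.best_score_:.3f}")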
Learn more about px in the Plotly Express documentation. This tutorial was inspired by amazing examples from the official scikit-learn docs.