
Commit 61b3ad8

xhlulu authored and xhlulu committed
ML Docs: Added 3 new sections to regression notebook
1 parent be71cfe commit 61b3ad8

File tree

1 file changed: +199 -7 lines changed

doc/python/ml-regression.md (+199 -7)
@@ -33,9 +33,28 @@ jupyter:
 thumbnail: thumbnail/knn-classification.png
 ---

-## Basic linear regression
+## Basic linear regression plots

-This example shows how to train a simple linear regression from `sklearn` to predicts the tips servers will receive based on the value of the total bill (dataset is included in `px.data`).
+### Ordinary Least Squares (OLS) with `plotly.express`
+
+This example shows how to use `plotly.express` to fit a simple Ordinary Least Squares (OLS) regression that predicts the tips servers will receive based on the value of the total bill.
+
+```python
+import plotly.express as px
+
+df = px.data.tips()
+fig = px.scatter(
+    df, x='total_bill', y='tip', opacity=0.65,
+    trendline='ols', trendline_color_override='red'
+)
+fig.show()
+```
+
+### Linear Regression with scikit-learn
+
+You can also perform the same prediction using scikit-learn's `LinearRegression`.

 ```python
 import numpy as np
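
The hunk above cuts off right after `import numpy as np`, so the body of the scikit-learn example isn't visible here. As a minimal sketch of how the same tips prediction could look with `LinearRegression` (an assumption about how the truncated block continues, not the commit's verbatim code):

```python
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression

df = px.data.tips()
X = df.total_bill.values.reshape(-1, 1)

# Fit a linear model mapping total_bill to tip
model = LinearRegression()
model.fit(X, df.tip)

# Predict over an evenly spaced range so the fitted line is smooth
x_range = np.linspace(X.min(), X.max(), 100)
y_range = model.predict(x_range.reshape(-1, 1))

fig = px.scatter(df, x='total_bill', y='tip', opacity=0.65)
fig.add_traces(go.Scatter(x=x_range, y=y_range, name='Regression Fit'))
fig.show()
```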
@@ -123,7 +142,6 @@ mesh_size = .02
 margin = 0

 df = px.data.iris()
-features = ["sepal_width", "sepal_length", "petal_width"]

 X = df[['sepal_width', 'sepal_length']]
 y = df['petal_width']
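
This hunk and the next only touch fragments of the notebook's 3D prediction-surface example (the next hunk header references `go.Surface`). As a rough sketch of the idea — fit on two features, predict over a mesh grid, and overlay the predictions as a surface — assuming a plain `LinearRegression`, which may not be the estimator the notebook actually uses:

```python
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression

mesh_size = .02
margin = 0

df = px.data.iris()
X = df[['sepal_width', 'sepal_length']]
y = df['petal_width']

model = LinearRegression()
model.fit(X, y)

# Build a mesh grid spanning the two input features
xrange = np.arange(X.sepal_width.min() - margin, X.sepal_width.max() + margin, mesh_size)
yrange = np.arange(X.sepal_length.min() - margin, X.sepal_length.max() + margin, mesh_size)
xx, yy = np.meshgrid(xrange, yrange)

# Predict on every grid point and reshape back into the grid
pred = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

fig = px.scatter_3d(df, x='sepal_width', y='sepal_length', z='petal_width')
fig.update_traces(marker=dict(size=5))
fig.add_traces(go.Surface(x=xrange, y=yrange, z=pred, name='pred_surface'))
fig.show()
```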
@@ -150,10 +168,46 @@ fig.add_traces(go.Surface(x=xrange, y=yrange, z=pred, name='pred_surface'))
 fig.show()
 ```

-## Label polynomial fits with latex
+## Displaying `PolynomialFeatures` using $\LaTeX$
+
+It's easy to display LaTeX equations in legends and titles by simply adding `$` before and after your equation.

 ```python
+import numpy as np
+import plotly.express as px
+import plotly.graph_objects as go
+from sklearn.linear_model import LinearRegression
+from sklearn.preprocessing import PolynomialFeatures
+
+def format_coefs(coefs):
+    equation_list = [f"{coef}x^{i}" for i, coef in enumerate(coefs)]
+    equation = "$" + " + ".join(equation_list) + "$"
+
+    replace_map = {"x^0": "", "x^1": "x", '+ -': '- '}
+    for old, new in replace_map.items():
+        equation = equation.replace(old, new)
+
+    return equation

+df = px.data.tips()
+X = df.total_bill.values.reshape(-1, 1)
+x_range = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
+
+fig = px.scatter(df, x='total_bill', y='tip', opacity=0.65)
+for n_features in [1, 2, 3, 4]:
+    poly = PolynomialFeatures(n_features)
+    poly.fit(X)
+    X_poly = poly.transform(X)
+    x_range_poly = poly.transform(x_range)
+
+    model = LinearRegression(fit_intercept=False)
+    model.fit(X_poly, df.tip)
+    y_poly = model.predict(x_range_poly)
+
+    equation = format_coefs(model.coef_.round(2))
+    fig.add_traces(go.Scatter(x=x_range.squeeze(), y=y_poly, name=equation))
+
+fig.show()
 ```
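
The loop above renders each fitted equation in the legend. The `$...$` trick mentioned in the paragraph before the block also works for titles; a tiny illustrative snippet (the coefficients below are made-up placeholders, not fitted values):

```python
import plotly.express as px

df = px.data.tips()
fig = px.scatter(df, x='total_bill', y='tip')
# Text wrapped in $...$ is rendered as a LaTeX equation
fig.update_layout(title_text=r"$\hat{y} = 0.11x + 0.92$")
fig.show()
```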

 ## Prediction Error Plots
@@ -162,22 +216,160 @@ fig.show()
 ### Simple Prediction Error

 ```python
+import plotly.express as px
+import plotly.graph_objects as go
+from sklearn.linear_model import LinearRegression

+df = px.data.iris()
+X = df[['sepal_width', 'sepal_length']]
+y = df['petal_width']
+
+# Condition the model on sepal width and length, predict the petal width
+model = LinearRegression()
+model.fit(X, y)
+y_pred = model.predict(X)
+
+fig = px.scatter(x=y, y=y_pred, labels={'x': 'y true', 'y': 'y pred'})
+# Dashed identity line where perfect predictions would fall
+fig.add_shape(
+    type="line", line=dict(dash='dash'),
+    x0=y.min(), y0=y.min(),
+    x1=y.max(), y1=y.max()
+)
+fig.show()
 ```

-### Augmented Prediction Error plot using `px`
+### Augmented Prediction Error analysis using `plotly.express`

 ```python
+import plotly.express as px
+import plotly.graph_objects as go
+from sklearn.linear_model import LinearRegression
+from sklearn.model_selection import train_test_split

-```
+df = px.data.iris()

-### Grid Search Visualization using `px.scatter_matrix`
+# Split data into training and test splits
+train_idx, test_idx = train_test_split(df.index, test_size=.25, random_state=0)
+df['split'] = 'train'
+df.loc[test_idx, 'split'] = 'test'

+X = df[['sepal_width', 'sepal_length']]
+y = df['petal_width']
+X_train = df.loc[train_idx, ['sepal_width', 'sepal_length']]
+y_train = df.loc[train_idx, 'petal_width']
+
+# Condition the model on sepal width and length, predict the petal width
+model = LinearRegression()
+model.fit(X_train, y_train)
+df['prediction'] = model.predict(X)
+
+fig = px.scatter(
+    df, x='petal_width', y='prediction',
+    marginal_x='histogram', marginal_y='histogram',
+    color='split', trendline='ols'
+)
+fig.add_shape(
+    type="line", line=dict(dash='dash'),
+    x0=y.min(), y0=y.min(),
+    x1=y.max(), y1=y.max()
+)
+
+fig.show()
+```

 ## Residual Plots

+Just like prediction error plots, it's easy to visualize your prediction residuals in just a few lines of code using `plotly.express` built-in capabilities.
+
+```python
+import numpy as np
+import plotly.express as px
+import plotly.graph_objects as go
+from sklearn.linear_model import LinearRegression
+from sklearn.model_selection import train_test_split
+
+df = px.data.iris()
+
+# Split data into training and test splits
+train_idx, test_idx = train_test_split(df.index, test_size=.25, random_state=0)
+df['split'] = 'train'
+df.loc[test_idx, 'split'] = 'test'
+
+X = df[['sepal_width', 'sepal_length']]
+X_train = df.loc[train_idx, ['sepal_width', 'sepal_length']]
+y_train = df.loc[train_idx, 'petal_width']
+
+# Condition the model on sepal width and length, predict the petal width
+model = LinearRegression()
+model.fit(X_train, y_train)
+df['prediction'] = model.predict(X)
+df['residual'] = df['prediction'] - df['petal_width']
+
+fig = px.scatter(
+    df, x='prediction', y='residual',
+    marginal_y='violin',
+    color='split', trendline='ols'
+)
+fig.show()
+```
+
+## Grid Search Visualization using `px` facets
+
 ```python
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+from sklearn.model_selection import GridSearchCV
+from sklearn.tree import DecisionTreeRegressor

+N_FOLD = 5
+
+df = px.data.iris()
+X = df[['sepal_width', 'sepal_length']]
+y = df['petal_width']
+
+model = DecisionTreeRegressor()
+param_grid = {
+    'criterion': ['mse', 'friedman_mse', 'mae'],
+    'max_depth': range(2, 5)
+}
+grid = GridSearchCV(model, param_grid, cv=N_FOLD)
+
+grid.fit(X, y)
+grid_df = pd.DataFrame(grid.cv_results_)
+
+# Convert the wide format of the grid into the long format
+# accepted by plotly.express
+melted = (
+    grid_df
+    .rename(columns=lambda col: col.replace('param_', ''))
+    .melt(
+        value_vars=[f'split{i}_test_score' for i in range(N_FOLD)],
+        id_vars=['rank_test_score', 'mean_test_score',
+                 'mean_fit_time', 'criterion', 'max_depth']
+    )
+)
+
+# Convert R-Squared measure to %
+melted[['value', 'mean_test_score']] *= 100
+
+# Format the variable names for simplicity
+melted['variable'] = (
+    melted['variable']
+    .str.replace('_test_score', '')
+    .str.replace('split', '')
+)
+
+px.bar(
+    melted, x='variable', y='value',
+    color='mean_test_score',
+    facet_row='max_depth',
+    facet_col='criterion',
+    title='Test Scores of Grid Search',
+    hover_data=['mean_fit_time', 'rank_test_score'],
+    labels={'variable': 'cv_split',
+            'value': 'r_squared',
+            'mean_test_score': "mean_r_squared"}
+)
 ```
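
Not part of the commit, but a natural companion to the facet plot above: once the grid is fitted, scikit-learn exposes the winning configuration directly through standard `GridSearchCV` attributes.

```python
# Assuming `grid` was fitted as in the block above
print("Best parameters:", grid.best_params_)
print("Best mean CV R^2:", grid.best_score_)
```

Note that recent scikit-learn releases renamed the `'mse'` and `'mae'` criteria to `'squared_error'` and `'absolute_error'`, so the `param_grid` above may need those names on newer versions.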

 ### Reference
