{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 14.5 Case Study: Multiple Linear Regression with the California Housing Dataset\n", "## 14.5.1 Loading the Dataset\n", "### Loading the Data\n", "**We added `%matplotlib inline` to enable Matplotlib in this notebook.**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", "from sklearn.datasets import fetch_california_housing" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "california = fetch_california_housing()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Displaying the Dataset’s Description" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(california.DESCR)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "california.data.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "california.target.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "california.feature_names" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 14.5.2 Exploring the Data with Pandas" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Use the fully qualified option name: the bare 'precision' shorthand matches\n", "# more than one option in newer pandas releases and raises OptionError\n", "pd.set_option('display.precision', 4)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Fully qualified for the same reason: bare 'max_columns' is ambiguous\n", "pd.set_option('display.max_columns', 9)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pd.set_option('display.width', None)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "california_df = pd.DataFrame(california.data, \n", " columns=california.feature_names)\n", " 
" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "california_df['MedHouseValue'] = pd.Series(california.target)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "california_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "california_df.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 14.5.3 Visualizing the Features " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sample_df = california_df.sample(frac=0.1, random_state=17)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import seaborn as sns" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sns.set(font_scale=2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sns.set_style('whitegrid') " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for feature in california.feature_names:\n", " plt.figure(figsize=(16, 9))\n", " sns.scatterplot(data=sample_df, x=feature, \n", " y='MedHouseValue', hue='MedHouseValue', \n", " palette='cool', legend=False)\n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 14.5.4 Splitting the Data for Training and Testing " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(\n", " california.data, california.target, random_state=11)" ] }, { "cell_type": "code", "execution_count": null, 
"metadata": {}, "outputs": [], "source": [ "X_train.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_test.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 14.5.5 Training the Model " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import LinearRegression" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "linear_regression = LinearRegression()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "linear_regression.fit(X=X_train, y=y_train)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Pair each feature name with its fitted coefficient\n", "for name, coefficient in zip(california.feature_names,\n", "                             linear_regression.coef_):\n", "    print(f'{name:>10}: {coefficient}')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "linear_regression.intercept_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 14.5.6 Testing the Model " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "predicted = linear_regression.predict(X_test)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "expected = y_test" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "predicted[:5]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "expected[:5]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 14.5.7 Visualizing the Expected vs. 
Predicted Prices " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df['Expected'] = pd.Series(expected)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df['Predicted'] = pd.Series(predicted)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "figure = plt.figure(figsize=(9, 9))\n", "\n", "axes = sns.scatterplot(data=df, x='Expected', y='Predicted', \n", " hue='Predicted', palette='cool', legend=False)\n", "\n", "# Give both axes the same range, spanning the smallest and largest of the\n", "# expected and predicted values\n", "start = min(expected.min(), predicted.min())\n", "\n", "end = max(expected.max(), predicted.max())\n", "\n", "axes.set_xlim(start, end)\n", "\n", "axes.set_ylim(start, end)\n", "\n", "# Dashed black diagonal on which predicted == expected\n", "line = plt.plot([start, end], [start, end], 'k--')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# This placeholder cell was added because we had to combine \n", "# this section's snippets 37-43 for the visualization to work in Jupyter\n", "# and want the subsequent snippet numbers to match the book" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Placeholder cell " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Placeholder cell " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Placeholder cell " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Placeholder cell " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Placeholder cell " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 14.5.8 Regression Model Metrics \n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], 
"source": [ "from sklearn import metrics" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "metrics.r2_score(expected, predicted)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "metrics.mean_squared_error(expected, predicted)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 14.5.9 Choosing the Best Model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import ElasticNet, Lasso, Ridge" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "estimators = {\n", " 'LinearRegression': linear_regression,\n", " 'ElasticNet': ElasticNet(),\n", " 'Lasso': Lasso(),\n", " 'Ridge': Ridge()\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import KFold, cross_val_score" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# The splitter does not depend on the estimator: with a fixed random_state\n", "# it produces the same shuffled folds each time, so build it once and reuse it\n", "kfold = KFold(n_splits=10, random_state=11, shuffle=True)\n", "\n", "for estimator_name, estimator_object in estimators.items():\n", "    scores = cross_val_score(estimator=estimator_object, \n", "        X=california.data, y=california.target, cv=kfold,\n", "        scoring='r2')\n", "    print(f'{estimator_name:>16}: ' + \n", "          f'mean of r2 scores={scores.mean():.3f}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# More Info \n", "* See **video** Lesson 14 in [**Python Fundamentals LiveLessons** on Safari Online Learning](https://learning.oreilly.com/videos/python-fundamentals/9780135917411)\n", "* See **book** Chapter 14 in [**Python for Programmers** on Safari Online Learning](https://learning.oreilly.com/library/view/python-for-programmers/9780135231364/), or see **book** Chapter 15 in **Intro to Python for Computer Science and Data Science**\n", "* Interested in a print book? 
Check out:\n", "\n", "| Python for Programmers | Intro to Python for Computer<br>Science and Data Science\n", "| :------ | :------\n", "| <a href=\"https://amzn.to/2VvdnxE\"><img alt=\"Python for Programmers cover\" src=\"../images/PyFPCover.png\" width=\"150\" border=\"1\"/></a> | <a href=\"https://amzn.to/2LiDCmt\"><img alt=\"Intro to Python for Computer Science and Data Science: Learning to Program with AI, Big Data and the Cloud\" src=\"../images/IntroToPythonCover.png\" width=\"159\" border=\"1\"></a>\n", "\n", ">Please **do not** purchase both books—our professional book **_Python for Programmers_** is a subset of our college textbook **_Intro to Python for Computer Science and Data Science_**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "##########################################################################\n", "# (C) Copyright 2019 by Deitel & Associates, Inc. and #\n", "# Pearson Education, Inc. All Rights Reserved. #\n", "# #\n", "# DISCLAIMER: The authors and publisher of this book have used their #\n", "# best efforts in preparing the book. These efforts include the #\n", "# development, research, and testing of the theories and programs #\n", "# to determine their effectiveness. The authors and publisher make #\n", "# no warranty of any kind, expressed or implied, with regard to these #\n", "# programs or to the documentation contained in these books. The authors #\n", "# and publisher shall not be liable in any event for incidental or #\n", "# consequential damages in connection with, or arising out of, the #\n", "# furnishing, performance, or use of these programs. 
#\n", "##########################################################################\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }