diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..279a902c3 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,6 @@ +[submodule "website/plugins/ipynb"] + path = website/plugins/ipynb + url = git://github.com/danielfrg/pelican-ipynb.git +[submodule "website/plugins/pelican-plugins"] + path = website/plugins/pelican-plugins + url = git://github.com/getpelican/pelican-plugins.git diff --git a/README.md b/README.md index 12bd69048..165a2b39d 100644 --- a/README.md +++ b/README.md @@ -1,97 +1,35 @@ # Python Data Science Handbook +[![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/jakevdp/PythonDataScienceHandbook/master?filepath=notebooks%2FIndex.ipynb) +[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/Index.ipynb) + This repository contains the entire [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do), in the form of (free!) Jupyter notebooks. ![cover image](notebooks/figures/PDSH-cover.png) -The book was written and tested with Python 3.5, though older Python versions (including Python 2.7) should work in nearly all cases. +## How to Use this Book + +- Read the book in its entirety online at https://jakevdp.github.io/PythonDataScienceHandbook/ + +- Run the code using the Jupyter notebooks available in this repository's [notebooks](notebooks) directory. + +- Launch executable versions of these notebooks using [Google Colab](http://colab.research.google.com): [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/Index.ipynb) + +- Launch a live notebook server with these notebooks using [binder](https://beta.mybinder.org/): [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/jakevdp/PythonDataScienceHandbook/master?filepath=notebooks%2FIndex.ipynb) + +- Buy the printed book through [O'Reilly Media](http://shop.oreilly.com/product/0636920034919.do) + +## About + +The book was written and tested with Python 3.5, though other Python versions (including Python 2.7) should work in nearly all cases. The book introduces the core libraries essential for working with data in Python: particularly [IPython](http://ipython.org), [NumPy](http://numpy.org), [Pandas](http://pandas.pydata.org), [Matplotlib](http://matplotlib.org), [Scikit-Learn](http://scikit-learn.org), and related packages. Familiarity with Python as a language is assumed; if you need a quick introduction to the language itself, see the free companion project, [A Whirlwind Tour of Python](https://github.com/jakevdp/WhirlwindTourOfPython): it's a fast-paced introduction to the Python language aimed at researchers and scientists. -The following listing links to the notebooks in this repository, rendered through the [nbviewer](http://nbviewer.jupyter.org) service: - ---- -## [Table of Contents](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/Index.ipynb) - -### [Preface](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/00.00-Preface.ipynb) - -### [1. 
IPython: Beyond Normal Python](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/01.00-IPython-Beyond-Normal-Python.ipynb) -- [Help and Documentation in IPython](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/01.01-Help-And-Documentation.ipynb) -- [Keyboard Shortcuts in the IPython Shell](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/01.02-Shell-Keyboard-Shortcuts.ipynb) -- [IPython Magic Commands](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/01.03-Magic-Commands.ipynb) -- [Input and Output History](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/01.04-Input-Output-History.ipynb) -- [IPython and Shell Commands](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/01.05-IPython-And-Shell-Commands.ipynb) -- [Errors and Debugging](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/01.06-Errors-and-Debugging.ipynb) -- [Profiling and Timing Code](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/01.07-Timing-and-Profiling.ipynb) -- [More IPython Resources](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/01.08-More-IPython-Resources.ipynb) - -### [2. Introduction to NumPy](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/02.00-Introduction-to-NumPy.ipynb) -- [Understanding Data Types in Python](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/02.01-Understanding-Data-Types.ipynb) -- [The Basics of NumPy Arrays](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/02.02-The-Basics-Of-NumPy-Arrays.ipynb) -- [Computation on NumPy Arrays: Universal Functions](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/02.03-Computation-on-arrays-ufuncs.ipynb) -- [Aggregations: Min, Max, and Everything In Between](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/02.04-Computation-on-arrays-aggregates.ipynb) -- [Computation on Arrays: Broadcasting](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/02.05-Computation-on-arrays-broadcasting.ipynb) -- [Comparisons, Masks, and Boolean Logic](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/02.06-Boolean-Arrays-and-Masks.ipynb) -- [Fancy Indexing](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/02.07-Fancy-Indexing.ipynb) -- [Sorting Arrays](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/02.08-Sorting.ipynb) -- [Structured Data: NumPy's Structured Arrays](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/02.09-Structured-Data-NumPy.ipynb) - -### [3. 
Data Manipulation with Pandas](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.00-Introduction-to-Pandas.ipynb) -- [Introducing Pandas Objects](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.01-Introducing-Pandas-Objects.ipynb) -- [Data Indexing and Selection](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.02-Data-Indexing-and-Selection.ipynb) -- [Operating on Data in Pandas](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.03-Operations-in-Pandas.ipynb) -- [Handling Missing Data](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.04-Missing-Values.ipynb) -- [Hierarchical Indexing](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.05-Hierarchical-Indexing.ipynb) -- [Combining Datasets: Concat and Append](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.06-Concat-And-Append.ipynb) -- [Combining Datasets: Merge and Join](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.07-Merge-and-Join.ipynb) -- [Aggregation and Grouping](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.08-Aggregation-and-Grouping.ipynb) -- [Pivot Tables](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.09-Pivot-Tables.ipynb) -- [Vectorized String Operations](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.10-Working-With-Strings.ipynb) -- [Working with Time Series](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.11-Working-with-Time-Series.ipynb) -- [High-Performance Pandas: eval() and query()](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.12-Performance-Eval-and-Query.ipynb) -- [Further Resources](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.13-Further-Resources.ipynb) - -### [4. 
Visualization with Matplotlib](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/04.00-Introduction-To-Matplotlib.ipynb) -- [Simple Line Plots](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/04.01-Simple-Line-Plots.ipynb) -- [Simple Scatter Plots](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/04.02-Simple-Scatter-Plots.ipynb) -- [Visualizing Errors](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/04.03-Errorbars.ipynb) -- [Density and Contour Plots](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/04.04-Density-and-Contour-Plots.ipynb) -- [Histograms, Binnings, and Density](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/04.05-Histograms-and-Binnings.ipynb) -- [Customizing Plot Legends](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/04.06-Customizing-Legends.ipynb) -- [Customizing Colorbars](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/04.07-Customizing-Colorbars.ipynb) -- [Multiple Subplots](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/04.08-Multiple-Subplots.ipynb) -- [Text and Annotation](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/04.09-Text-and-Annotation.ipynb) -- [Customizing Ticks](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/04.10-Customizing-Ticks.ipynb) -- [Customizing Matplotlib: Configurations and Stylesheets](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/04.11-Settings-and-Stylesheets.ipynb) -- [Three-Dimensional Plotting in Matplotlib](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/04.12-Three-Dimensional-Plotting.ipynb) -- [Geographic Data with Basemap](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/04.13-Geographic-Data-With-Basemap.ipynb) -- [Visualization with Seaborn](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/04.14-Visualization-With-Seaborn.ipynb) -- [Further Resources](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/04.15-Further-Resources.ipynb) - -### [5. 
Machine Learning](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/05.00-Machine-Learning.ipynb) -- [What Is Machine Learning?](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/05.01-What-Is-Machine-Learning.ipynb) -- [Introducing Scikit-Learn](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/05.02-Introducing-Scikit-Learn.ipynb) -- [Hyperparameters and Model Validation](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/05.03-Hyperparameters-and-Model-Validation.ipynb) -- [Feature Engineering](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/05.04-Feature-Engineering.ipynb) -- [In-Depth: Naive Bayes Classification](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/05.05-Naive-Bayes.ipynb) -- [In-Depth: Linear Regression](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/05.06-Linear-Regression.ipynb) -- [In-Depth: Support Vector Machines](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/05.07-Support-Vector-Machines.ipynb) -- [In-Depth: Decision Trees and Random Forests](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/05.08-Random-Forests.ipynb) -- [In-Depth: Principal Component Analysis](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/05.09-Principal-Component-Analysis.ipynb) -- [In-Depth: Manifold Learning](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/05.10-Manifold-Learning.ipynb) -- [In-Depth: k-Means Clustering](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/05.11-K-Means.ipynb) -- [In-Depth: Gaussian Mixture Models](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/05.12-Gaussian-Mixtures.ipynb) -- [In-Depth: Kernel Density Estimation](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/05.13-Kernel-Density-Estimation.ipynb) -- [Application: A Face Detection Pipeline](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/05.14-Image-Features.ipynb) -- [Further Machine Learning Resources](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/05.15-Learning-More.ipynb) - -### [Appendix: Figure Code](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/06.00-Figure-Code.ipynb) - ---- - -## Required Packages +See [Index.ipynb](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/Index.ipynb) for an index of the notebooks available to accompany the text. + +## Software The code in the book was tested with Python 3.5, though most (but not all) will also work correctly with Python 2.7 and other older Python versions. 
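For readers setting up a local environment to run these notebooks, a quick way to confirm the installed stack matches the tested configuration is to print the version of each core library. This is a minimal sketch, not code from the book, and it assumes the packages from requirements.txt (for example, installed via the environment.yml introduced below) are available:

```python
# Minimal version check, assuming the handbook's core dependencies
# are installed; each of these packages exposes a __version__ attribute.
import importlib

for name in ("numpy", "pandas", "matplotlib", "sklearn", "IPython"):
    module = importlib.import_module(name)  # raises ImportError if absent
    print(name, module.__version__)
```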
diff --git a/environment.yml b/environment.yml new file mode 100644 index 000000000..247ddfccb --- /dev/null +++ b/environment.yml @@ -0,0 +1,7 @@ +name: data-science-handbook +channels: + - conda-forge +dependencies: + - python=3.5 + - pip: + - -r requirements.txt \ No newline at end of file diff --git a/notebooks/00.00-Preface.ipynb b/notebooks/00.00-Preface.ipynb index e9e8d99a8..7d635a808 100644 --- a/notebooks/00.00-Preface.ipynb +++ b/notebooks/00.00-Preface.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "| [Contents](Index.ipynb) | [IPython: Beyond Normal Python](01.00-IPython-Beyond-Normal-Python.ipynb) >" + "| [Contents](Index.ipynb) | [IPython: Beyond Normal Python](01.00-IPython-Beyond-Normal-Python.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -169,7 +172,9 @@ "metadata": {}, "source": [ "\n", - "| [Contents](Index.ipynb) | [IPython: Beyond Normal Python](01.00-IPython-Beyond-Normal-Python.ipynb) >" + "| [Contents](Index.ipynb) | [IPython: Beyond Normal Python](01.00-IPython-Beyond-Normal-Python.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/01.00-IPython-Beyond-Normal-Python.ipynb b/notebooks/01.00-IPython-Beyond-Normal-Python.ipynb index 5110398b8..5d01277e6 100644 --- a/notebooks/01.00-IPython-Beyond-Normal-Python.ipynb +++ b/notebooks/01.00-IPython-Beyond-Normal-Python.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Preface](00.00-Preface.ipynb) | [Contents](Index.ipynb) | [Help and Documentation in IPython](01.01-Help-And-Documentation.ipynb) >" + "< [Preface](00.00-Preface.ipynb) | [Contents](Index.ipynb) | [Help and Documentation in IPython](01.01-Help-And-Documentation.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -118,7 +121,9 @@ "metadata": {}, "source": [ "\n", - "< [Preface](00.00-Preface.ipynb) | [Contents](Index.ipynb) | [Help and Documentation in IPython](01.01-Help-And-Documentation.ipynb) >" + "< [Preface](00.00-Preface.ipynb) | [Contents](Index.ipynb) | [Help and Documentation in IPython](01.01-Help-And-Documentation.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/01.01-Help-And-Documentation.ipynb b/notebooks/01.01-Help-And-Documentation.ipynb index 47af64c88..39879ee90 100644 --- a/notebooks/01.01-Help-And-Documentation.ipynb +++ b/notebooks/01.01-Help-And-Documentation.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [IPython: Beyond Normal Python](01.00-IPython-Beyond-Normal-Python.ipynb) | [Contents](Index.ipynb) | [Keyboard Shortcuts in the IPython Shell](01.02-Shell-Keyboard-Shortcuts.ipynb) >" + "< [IPython: Beyond Normal Python](01.00-IPython-Beyond-Normal-Python.ipynb) | [Contents](Index.ipynb) | [Keyboard Shortcuts in the IPython Shell](01.02-Shell-Keyboard-Shortcuts.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -321,7 +324,9 @@ "metadata": {}, "source": [ "\n", - "< [IPython: Beyond Normal Python](01.00-IPython-Beyond-Normal-Python.ipynb) | [Contents](Index.ipynb) | [Keyboard Shortcuts in the IPython Shell](01.02-Shell-Keyboard-Shortcuts.ipynb) >" + "< [IPython: Beyond Normal Python](01.00-IPython-Beyond-Normal-Python.ipynb) | [Contents](Index.ipynb) | [Keyboard Shortcuts in the IPython Shell](01.02-Shell-Keyboard-Shortcuts.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/01.02-Shell-Keyboard-Shortcuts.ipynb b/notebooks/01.02-Shell-Keyboard-Shortcuts.ipynb index 5018ca578..f50e9fb1c 100644 --- a/notebooks/01.02-Shell-Keyboard-Shortcuts.ipynb +++ b/notebooks/01.02-Shell-Keyboard-Shortcuts.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,15 +17,22 @@ "metadata": {}, "source": [ "\n", - "< [Help and Documentation in IPython](01.01-Help-And-Documentation.ipynb) | [Contents](Index.ipynb) | [IPython Magic Commands](01.03-Magic-Commands.ipynb) >" + "< [Help and Documentation in IPython](01.01-Help-And-Documentation.ipynb) | [Contents](Index.ipynb) | [IPython Magic Commands](01.03-Magic-Commands.ipynb) >\n", + "\n", + "\"Open\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Keyboard Shortcuts in the IPython Shell" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Keyboard Shortcuts in the IPython Shell\n", - "\n", "If you spend any amount of time on the computer, you've probably found a use for keyboard shortcuts in your workflow.\n", "Most familiar perhaps are the Cmd-C and Cmd-V (or Ctrl-C and Ctrl-V) for copying and pasting in a wide variety of programs and systems.\n", "Power-users tend to go even further: popular text editors like Emacs, Vim, and others provide users an incredible range of operations through intricate combinations of keystrokes.\n", @@ -168,7 +176,9 @@ "metadata": {}, "source": [ "\n", - "< [Help and Documentation in IPython](01.01-Help-And-Documentation.ipynb) | [Contents](Index.ipynb) | [IPython Magic Commands](01.03-Magic-Commands.ipynb) >" + "< [Help and Documentation in IPython](01.01-Help-And-Documentation.ipynb) | [Contents](Index.ipynb) | [IPython Magic Commands](01.03-Magic-Commands.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/01.03-Magic-Commands.ipynb b/notebooks/01.03-Magic-Commands.ipynb index 489e0d6a7..e5ee9d164 100644 --- a/notebooks/01.03-Magic-Commands.ipynb +++ b/notebooks/01.03-Magic-Commands.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,15 +17,22 @@ "metadata": {}, "source": [ "\n", - "< [Keyboard Shortcuts in the IPython Shell](01.02-Shell-Keyboard-Shortcuts.ipynb) | [Contents](Index.ipynb) | [Input and Output History](01.04-Input-Output-History.ipynb) >" + "< [Keyboard Shortcuts in the IPython Shell](01.02-Shell-Keyboard-Shortcuts.ipynb) | [Contents](Index.ipynb) | [Input and Output History](01.04-Input-Output-History.ipynb) >\n", + "\n", + "\"Open\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# IPython Magic Commands" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# IPython Magic Commands\n", - "\n", "The previous two sections showed how IPython lets you use and explore Python efficiently and interactively.\n", "Here we'll begin discussing some of the enhancements that IPython adds on top of the normal Python syntax.\n", "These are known in IPython as *magic commands*, and are prefixed by the ``%`` character.\n", @@ -199,7 +207,9 @@ "metadata": {}, "source": [ "\n", - "< [Keyboard Shortcuts in the IPython Shell](01.02-Shell-Keyboard-Shortcuts.ipynb) | [Contents](Index.ipynb) | [Input and Output History](01.04-Input-Output-History.ipynb) >" + "< [Keyboard Shortcuts in the IPython Shell](01.02-Shell-Keyboard-Shortcuts.ipynb) | [Contents](Index.ipynb) | [Input and Output History](01.04-Input-Output-History.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/01.04-Input-Output-History.ipynb b/notebooks/01.04-Input-Output-History.ipynb index 4b0dcfd04..c8e5463fe 100644 --- a/notebooks/01.04-Input-Output-History.ipynb +++ b/notebooks/01.04-Input-Output-History.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,15 +17,22 @@ "metadata": {}, "source": [ "\n", - "< [IPython Magic Commands](01.03-Magic-Commands.ipynb) | [Contents](Index.ipynb) | [IPython and Shell Commands](01.05-IPython-And-Shell-Commands.ipynb) >" + "< [IPython Magic Commands](01.03-Magic-Commands.ipynb) | [Contents](Index.ipynb) | [IPython and Shell Commands](01.05-IPython-And-Shell-Commands.ipynb) >\n", + "\n", + "\"Open\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Input and Output History" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Input and Output History\n", - "\n", "Previously we saw that the IPython shell allows you to access previous commands with the up and down arrow keys, or equivalently the Ctrl-p/Ctrl-n shortcuts.\n", "Additionally, in both the shell and the notebook, IPython exposes several ways to obtain the output of previous commands, as well as string versions of the commands themselves.\n", "We'll explore those here." 
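To make the ``In`` and ``Out`` objects concrete, here is a short session sketch in the spirit of that notebook's trigonometry example; it is a transcript rather than a script, since ``In`` and ``Out`` exist only inside an interactive IPython session:

```python
# IPython session sketch: In is a list of past input strings, and
# Out is a dict mapping execution numbers to their results.
In [1]: import math

In [2]: math.sin(2)
Out[2]: 0.9092974268256817

In [3]: math.cos(2)
Out[3]: -0.4161468365471424

In [4]: print(In[2])                 # the source string of input 2
math.sin(2)

In [5]: Out[2] ** 2 + Out[3] ** 2    # reuse earlier outputs directly
Out[5]: 1.0
```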
@@ -183,7 +191,9 @@ "metadata": {}, "source": [ "\n", - "< [IPython Magic Commands](01.03-Magic-Commands.ipynb) | [Contents](Index.ipynb) | [IPython and Shell Commands](01.05-IPython-And-Shell-Commands.ipynb) >" + "< [IPython Magic Commands](01.03-Magic-Commands.ipynb) | [Contents](Index.ipynb) | [IPython and Shell Commands](01.05-IPython-And-Shell-Commands.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/01.05-IPython-And-Shell-Commands.ipynb b/notebooks/01.05-IPython-And-Shell-Commands.ipynb index 7c4f8ebb1..6fe0dd875 100644 --- a/notebooks/01.05-IPython-And-Shell-Commands.ipynb +++ b/notebooks/01.05-IPython-And-Shell-Commands.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,15 +17,22 @@ "metadata": {}, "source": [ "\n", - "< [Input and Output History](01.04-Input-Output-History.ipynb) | [Contents](Index.ipynb) | [Errors and Debugging](01.06-Errors-and-Debugging.ipynb) >" + "< [Input and Output History](01.04-Input-Output-History.ipynb) | [Contents](Index.ipynb) | [Errors and Debugging](01.06-Errors-and-Debugging.ipynb) >\n", + "\n", + "\"Open\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# IPython and Shell Commands" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# IPython and Shell Commands\n", - "\n", "When working interactively with the standard Python interpreter, one of the frustrations is the need to switch between multiple windows to access Python tools and system command-line tools.\n", "IPython bridges this gap, and gives you a syntax for executing shell commands directly from within the IPython terminal.\n", "The magic happens with the exclamation point: anything appearing after ``!`` on a line will be executed not by the Python kernel, but by the system command-line.\n", @@ -216,7 +224,9 @@ "metadata": {}, "source": [ "\n", - "< [Input and Output History](01.04-Input-Output-History.ipynb) | [Contents](Index.ipynb) | [Errors and Debugging](01.06-Errors-and-Debugging.ipynb) >" + "< [Input and Output History](01.04-Input-Output-History.ipynb) | [Contents](Index.ipynb) | [Errors and Debugging](01.06-Errors-and-Debugging.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/01.06-Errors-and-Debugging.ipynb b/notebooks/01.06-Errors-and-Debugging.ipynb index 0125168e4..a7625d5ef 100644 --- a/notebooks/01.06-Errors-and-Debugging.ipynb +++ b/notebooks/01.06-Errors-and-Debugging.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,15 +17,22 @@ "metadata": {}, "source": [ "\n", - "< [IPython and Shell Commands](01.05-IPython-And-Shell-Commands.ipynb) | [Contents](Index.ipynb) | [Profiling and Timing Code](01.07-Timing-and-Profiling.ipynb) >" + "< [IPython and Shell Commands](01.05-IPython-And-Shell-Commands.ipynb) | [Contents](Index.ipynb) | [Profiling and Timing Code](01.07-Timing-and-Profiling.ipynb) >\n", + "\n", + "\"Open\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Errors and Debugging" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Errors and Debugging\n", - "\n", "Code development and data analysis always require a bit of trial and error, and IPython contains tools to streamline this process.\n", "This section will briefly cover some options for controlling Python's exception reporting, followed by exploring tools for debugging errors in code." ] @@ -387,7 +395,9 @@ "metadata": {}, "source": [ "\n", - "< [IPython and Shell Commands](01.05-IPython-And-Shell-Commands.ipynb) | [Contents](Index.ipynb) | [Profiling and Timing Code](01.07-Timing-and-Profiling.ipynb) >" + "< [IPython and Shell Commands](01.05-IPython-And-Shell-Commands.ipynb) | [Contents](Index.ipynb) | [Profiling and Timing Code](01.07-Timing-and-Profiling.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/01.07-Timing-and-Profiling.ipynb b/notebooks/01.07-Timing-and-Profiling.ipynb index 2a65b75a6..76f0db5cb 100644 --- a/notebooks/01.07-Timing-and-Profiling.ipynb +++ b/notebooks/01.07-Timing-and-Profiling.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,15 +17,22 @@ "metadata": {}, "source": [ "\n", - "< [Errors and Debugging](01.06-Errors-and-Debugging.ipynb) | [Contents](Index.ipynb) | [More IPython Resources](01.08-More-IPython-Resources.ipynb) >" + "< [Errors and Debugging](01.06-Errors-and-Debugging.ipynb) | [Contents](Index.ipynb) | [More IPython Resources](01.08-More-IPython-Resources.ipynb) >\n", + "\n", + "\"Open\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Profiling and Timing Code" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Profiling and Timing Code\n", - "\n", "In the process of developing code and creating data processing pipelines, there are often trade-offs you can make between various implementations.\n", "Early in developing your algorithm, it can be counterproductive to worry about such things. 
As Donald Knuth famously quipped, \"We should forget about small efficiencies, say about 97% of the time: premature optimization is the root of all evil.\"\n", "\n", @@ -55,9 +63,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -82,9 +88,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -114,9 +118,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -143,9 +145,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -167,9 +167,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -201,9 +199,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -244,9 +240,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def sum_of_lists(N):\n", @@ -267,9 +261,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -328,9 +320,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%load_ext line_profiler" @@ -346,9 +336,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%lprun -f sum_of_lists sum_of_lists(5000)" @@ -403,9 +391,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%load_ext memory_profiler" @@ -422,9 +408,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -451,9 +435,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -484,9 +466,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -537,14 +517,16 @@ "metadata": {}, "source": [ "\n", - "< [Errors and Debugging](01.06-Errors-and-Debugging.ipynb) | [Contents](Index.ipynb) | [More IPython Resources](01.08-More-IPython-Resources.ipynb) >" + "< [Errors and Debugging](01.06-Errors-and-Debugging.ipynb) | [Contents](Index.ipynb) | [More IPython Resources](01.08-More-IPython-Resources.ipynb) >\n", + "\n", + "\"Open\n" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python [default]", "language": "python", "name": "python3" }, @@ -558,9 +540,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.1" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/notebooks/01.08-More-IPython-Resources.ipynb b/notebooks/01.08-More-IPython-Resources.ipynb index bd6890902..ad87f002d 100644 --- a/notebooks/01.08-More-IPython-Resources.ipynb +++ b/notebooks/01.08-More-IPython-Resources.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook 
contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,15 +17,22 @@ "metadata": {}, "source": [ "\n", - "< [Profiling and Timing Code](01.07-Timing-and-Profiling.ipynb) | [Contents](Index.ipynb) | [Introduction to NumPy](02.00-Introduction-to-NumPy.ipynb) >" + "< [Profiling and Timing Code](01.07-Timing-and-Profiling.ipynb) | [Contents](Index.ipynb) | [Introduction to NumPy](02.00-Introduction-to-NumPy.ipynb) >\n", + "\n", + "\"Open\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# More IPython Resources" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# More IPython Resources\n", - "\n", "In this chapter, we've just scratched the surface of using IPython to enable data science tasks.\n", "Much more information is available both in print and on the Web, and here we'll list some other resources that you may find helpful." ] @@ -37,7 +45,7 @@ "\n", "- [The IPython website](http://ipython.org): The IPython website links to documentation, examples, tutorials, and a variety of other resources.\n", "- [The nbviewer website](http://nbviewer.jupyter.org/): This site shows static renderings of any IPython notebook available on the internet. The front page features some example notebooks that you can browse to see what other folks are using IPython for!\n", - "- [A Gallery of Interesting IPython Notebooks](http://github.com/ipython/ipython/wiki/A-gallery-of-interesting-IPython-Notebooks/): This ever-growing list of notebooks, powered by nbviewer, shows the depth and breadth of numerical analysis you can do with IPython. It includes everything from short examples and tutorials to full-blown courses and books composed in the notebook format!\n", + "- [A gallery of interesting Jupyter Notebooks](https://github.com/jupyter/jupyter/wiki/A-gallery-of-interesting-Jupyter-Notebooks/): This ever-growing list of notebooks, powered by nbviewer, shows the depth and breadth of numerical analysis you can do with IPython. It includes everything from short examples and tutorials to full-blown courses and books composed in the notebook format!\n", "- Video Tutorials: searching the Internet, you will find many video-recorded tutorials on IPython. I'd especially recommend seeking tutorials from the PyCon, SciPy, and PyData conferences by Fernando Perez and Brian Granger, two of the primary creators and maintainers of IPython and Jupyter."
] }, @@ -60,7 +68,9 @@ "metadata": {}, "source": [ "\n", - "< [Profiling and Timing Code](01.07-Timing-and-Profiling.ipynb) | [Contents](Index.ipynb) | [Introduction to NumPy](02.00-Introduction-to-NumPy.ipynb) >" + "< [Profiling and Timing Code](01.07-Timing-and-Profiling.ipynb) | [Contents](Index.ipynb) | [Introduction to NumPy](02.00-Introduction-to-NumPy.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/02.00-Introduction-to-NumPy.ipynb b/notebooks/02.00-Introduction-to-NumPy.ipynb index c9ad97f9c..e527c4355 100644 --- a/notebooks/02.00-Introduction-to-NumPy.ipynb +++ b/notebooks/02.00-Introduction-to-NumPy.ipynb @@ -2,10 +2,14 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -13,22 +17,33 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", - "< [More IPython Resources](01.08-More-IPython-Resources.ipynb) | [Contents](Index.ipynb) | [Understanding Data Types in Python](02.01-Understanding-Data-Types.ipynb) >" + "< [More IPython Resources](01.08-More-IPython-Resources.ipynb) | [Contents](Index.ipynb) | [Understanding Data Types in Python](02.01-Understanding-Data-Types.ipynb) >\n", + "\n", + "\"Open\n" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ - "# Introduction to NumPy\n" + "# Introduction to NumPy" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "This chapter, along with chapter 3, outlines techniques for effectively loading, storing, and manipulating in-memory data in Python.\n", "The topic is very broad: datasets can come from a wide range of sources and a wide range of formats, including collections of documents, collections of images, collections of sound clips, collections of numerical measurements, or nearly anything else.\n", @@ -56,7 +71,9 @@ "cell_type": "code", "execution_count": 1, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -77,7 +94,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "For the pieces of the package discussed here, I'd recommend NumPy version 1.8 or later.\n", "By convention, you'll find that most people in the SciPy/PyData world will import NumPy using ``np`` as an alias:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ "import numpy as np" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Throughout this chapter, and indeed the rest of the book, you'll find that this 
is the way we will import and use NumPy." ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Reminder about Built In Documentation\n", "\n", @@ -126,10 +154,15 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", - "< [More IPython Resources](01.08-More-IPython-Resources.ipynb) | [Contents](Index.ipynb) | [Understanding Data Types in Python](02.01-Understanding-Data-Types.ipynb) >" + "< [More IPython Resources](01.08-More-IPython-Resources.ipynb) | [Contents](Index.ipynb) | [Understanding Data Types in Python](02.01-Understanding-Data-Types.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/02.01-Understanding-Data-Types.ipynb b/notebooks/02.01-Understanding-Data-Types.ipynb index 2f053aae2..82b128e48 100644 --- a/notebooks/02.01-Understanding-Data-Types.ipynb +++ b/notebooks/02.01-Understanding-Data-Types.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Introduction to NumPy](02.00-Introduction-to-NumPy.ipynb) | [Contents](Index.ipynb) | [The Basics of NumPy Arrays](02.02-The-Basics-Of-NumPy-Arrays.ipynb) >" + "< [Introduction to NumPy](02.00-Introduction-to-NumPy.ipynb) | [Contents](Index.ipynb) | [The Basics of NumPy Arrays](02.02-The-Basics-Of-NumPy-Arrays.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -796,7 +799,9 @@ "metadata": {}, "source": [ "\n", - "< [Introduction to NumPy](02.00-Introduction-to-NumPy.ipynb) | [Contents](Index.ipynb) | [The Basics of NumPy Arrays](02.02-The-Basics-Of-NumPy-Arrays.ipynb) >" + "< [Introduction to NumPy](02.00-Introduction-to-NumPy.ipynb) | [Contents](Index.ipynb) | [The Basics of NumPy Arrays](02.02-The-Basics-Of-NumPy-Arrays.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/02.02-The-Basics-Of-NumPy-Arrays.ipynb b/notebooks/02.02-The-Basics-Of-NumPy-Arrays.ipynb index c8bde4cb7..f9dad509a 100644 --- a/notebooks/02.02-The-Basics-Of-NumPy-Arrays.ipynb +++ b/notebooks/02.02-The-Basics-Of-NumPy-Arrays.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Understanding Data Types in Python](02.01-Understanding-Data-Types.ipynb) | [Contents](Index.ipynb) | [Computation on NumPy Arrays: Universal Functions](02.03-Computation-on-arrays-ufuncs.ipynb) >" + "< [Understanding Data Types in Python](02.01-Understanding-Data-Types.ipynb) | [Contents](Index.ipynb) | [Computation on NumPy Arrays: Universal Functions](02.03-Computation-on-arrays-ufuncs.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -1538,7 +1541,9 @@ "metadata": {}, "source": [ "\n", - "< [Understanding Data Types in Python](02.01-Understanding-Data-Types.ipynb) | [Contents](Index.ipynb) | [Computation on NumPy Arrays: Universal Functions](02.03-Computation-on-arrays-ufuncs.ipynb) >" + "< [Understanding Data Types in Python](02.01-Understanding-Data-Types.ipynb) | [Contents](Index.ipynb) | [Computation on NumPy Arrays: Universal Functions](02.03-Computation-on-arrays-ufuncs.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/02.03-Computation-on-arrays-ufuncs.ipynb b/notebooks/02.03-Computation-on-arrays-ufuncs.ipynb index e4be4920c..5296859e5 100644 --- a/notebooks/02.03-Computation-on-arrays-ufuncs.ipynb +++ b/notebooks/02.03-Computation-on-arrays-ufuncs.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [The Basics of NumPy Arrays](02.02-The-Basics-Of-NumPy-Arrays.ipynb) | [Contents](Index.ipynb) | [Aggregations: Min, Max, and Everything In Between](02.04-Computation-on-arrays-aggregates.ipynb) >" + "< [The Basics of NumPy Arrays](02.02-The-Basics-Of-NumPy-Arrays.ipynb) | [Contents](Index.ipynb) | [Aggregations: Min, Max, and Everything In Between](02.04-Computation-on-arrays-aggregates.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -1075,7 +1078,9 @@ "metadata": {}, "source": [ "\n", - "< [The Basics of NumPy Arrays](02.02-The-Basics-Of-NumPy-Arrays.ipynb) | [Contents](Index.ipynb) | [Aggregations: Min, Max, and Everything In Between](02.04-Computation-on-arrays-aggregates.ipynb) >" + "< [The Basics of NumPy Arrays](02.02-The-Basics-Of-NumPy-Arrays.ipynb) | [Contents](Index.ipynb) | [Aggregations: Min, Max, and Everything In Between](02.04-Computation-on-arrays-aggregates.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/02.04-Computation-on-arrays-aggregates.ipynb b/notebooks/02.04-Computation-on-arrays-aggregates.ipynb index faad08614..53e6462fd 100644 --- a/notebooks/02.04-Computation-on-arrays-aggregates.ipynb +++ b/notebooks/02.04-Computation-on-arrays-aggregates.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Computation on NumPy Arrays: Universal Functions](02.03-Computation-on-arrays-ufuncs.ipynb) | [Contents](Index.ipynb) | [Computation on Arrays: Broadcasting](02.05-Computation-on-arrays-broadcasting.ipynb) >" + "< [Computation on NumPy Arrays: Universal Functions](02.03-Computation-on-arrays-ufuncs.ipynb) | [Contents](Index.ipynb) | [Computation on Arrays: Broadcasting](02.05-Computation-on-arrays-broadcasting.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -406,7 +409,7 @@ "|-------------------|---------------------|-----------------------------------------------|\n", "| ``np.sum`` | ``np.nansum`` | Compute sum of elements |\n", "| ``np.prod`` | ``np.nanprod`` | Compute product of elements |\n", - "| ``np.mean`` | ``np.nanmean`` | Compute median of elements |\n", + "| ``np.mean`` | ``np.nanmean`` | Compute mean of elements |\n", "| ``np.std`` | ``np.nanstd`` | Compute standard deviation |\n", "| ``np.var`` | ``np.nanvar`` | Compute variance |\n", "| ``np.min`` | ``np.nanmin`` | Find minimum value |\n", @@ -612,7 +615,9 @@ "metadata": {}, "source": [ "\n", - "< [Computation on NumPy Arrays: Universal Functions](02.03-Computation-on-arrays-ufuncs.ipynb) | [Contents](Index.ipynb) | [Computation on Arrays: Broadcasting](02.05-Computation-on-arrays-broadcasting.ipynb) >" + "< [Computation on NumPy Arrays: Universal Functions](02.03-Computation-on-arrays-ufuncs.ipynb) | [Contents](Index.ipynb) | [Computation on Arrays: Broadcasting](02.05-Computation-on-arrays-broadcasting.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/02.05-Computation-on-arrays-broadcasting.ipynb b/notebooks/02.05-Computation-on-arrays-broadcasting.ipynb index b5b864e09..c1cae6ddf 100644 --- a/notebooks/02.05-Computation-on-arrays-broadcasting.ipynb +++ b/notebooks/02.05-Computation-on-arrays-broadcasting.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Aggregations: Min, Max, and Everything In Between](02.04-Computation-on-arrays-aggregates.ipynb) | [Contents](Index.ipynb) | [Comparisons, Masks, and Boolean Logic](02.06-Boolean-Arrays-and-Masks.ipynb) >" + "< [Aggregations: Min, Max, and Everything In Between](02.04-Computation-on-arrays-aggregates.ipynb) | [Contents](Index.ipynb) | [Comparisons, Masks, and Boolean Logic](02.06-Boolean-Arrays-and-Masks.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -770,7 +773,9 @@ "metadata": {}, "source": [ "\n", - "< [Aggregations: Min, Max, and Everything In Between](02.04-Computation-on-arrays-aggregates.ipynb) | [Contents](Index.ipynb) | [Comparisons, Masks, and Boolean Logic](02.06-Boolean-Arrays-and-Masks.ipynb) >" + "< [Aggregations: Min, Max, and Everything In Between](02.04-Computation-on-arrays-aggregates.ipynb) | [Contents](Index.ipynb) | [Comparisons, Masks, and Boolean Logic](02.06-Boolean-Arrays-and-Masks.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/02.06-Boolean-Arrays-and-Masks.ipynb b/notebooks/02.06-Boolean-Arrays-and-Masks.ipynb index ae7b5da78..e17269f9d 100644 --- a/notebooks/02.06-Boolean-Arrays-and-Masks.ipynb +++ b/notebooks/02.06-Boolean-Arrays-and-Masks.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Computation on Arrays: Broadcasting](02.05-Computation-on-arrays-broadcasting.ipynb) | [Contents](Index.ipynb) | [Fancy Indexing](02.07-Fancy-Indexing.ipynb) >" + "< [Computation on Arrays: Broadcasting](02.05-Computation-on-arrays-broadcasting.ipynb) | [Contents](Index.ipynb) | [Fancy Indexing](02.07-Fancy-Indexing.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -69,7 +72,7 @@ "\n", "# use pandas to extract rainfall inches as a NumPy array\n", "rainfall = pd.read_csv('data/Seattle2014.csv')['PRCP'].values\n", - "inches = rainfall / 254 # 1/10mm -> inches\n", + "inches = rainfall / 254.0 # 1/10mm -> inches\n", "inches.shape" ] }, @@ -650,7 +653,7 @@ } ], "source": [ - "# are all values in each row less than 4?\n", + "# are all values in each row less than 8?\n", "np.all(x < 8, axis=1)" ] }, @@ -890,7 +893,7 @@ "source": [ "What is returned is a one-dimensional array filled with all the values that meet this condition; in other words, all the values in positions at which the mask array is ``True``.\n", "\n", - "We are then free do operate on these values as we wish.\n", + "We are then free to operate on these values as we wish.\n", "For example, we can compute some relevant statistics on our Seattle rain data:" ] }, @@ -1249,7 +1252,9 @@ "metadata": {}, "source": [ "\n", - "< [Computation on Arrays: Broadcasting](02.05-Computation-on-arrays-broadcasting.ipynb) | [Contents](Index.ipynb) | [Fancy Indexing](02.07-Fancy-Indexing.ipynb) >" + "< [Computation on Arrays: Broadcasting](02.05-Computation-on-arrays-broadcasting.ipynb) | [Contents](Index.ipynb) | [Fancy Indexing](02.07-Fancy-Indexing.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/02.07-Fancy-Indexing.ipynb b/notebooks/02.07-Fancy-Indexing.ipynb index b3680ffc2..00cc188a5 100644 --- a/notebooks/02.07-Fancy-Indexing.ipynb +++ b/notebooks/02.07-Fancy-Indexing.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Comparisons, Masks, and Boolean Logic](02.06-Boolean-Arrays-and-Masks.ipynb) | [Contents](Index.ipynb) | [Sorting Arrays](02.08-Sorting.ipynb) >" + "< [Comparisons, Masks, and Boolean Logic](02.06-Boolean-Arrays-and-Masks.ipynb) | [Contents](Index.ipynb) | [Sorting Arrays](02.08-Sorting.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -709,7 +712,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You might expect that ``x[3]`` would contain the value 2, and ``x[3]`` would contain the value 3, as this is how many times each index is repeated. Why is this not the case?\n", + "You might expect that ``x[3]`` would contain the value 2, and ``x[4]`` would contain the value 3, as this is how many times each index is repeated. 
Why is this not the case?\n",
    "Conceptually, this is because ``x[i] += 1`` is meant as a shorthand for ``x[i] = x[i] + 1``. ``x[i] + 1`` is evaluated, and then the result is assigned to the indices in ``x``.\n",
    "With this in mind, it is not the augmentation that happens multiple times, but the assignment, which leads to the rather nonintuitive results.\n",
    "\n",
@@ -898,7 +901,9 @@
    "metadata": {},
    "source": [
     "\n",
-    "< [Comparisons, Masks, and Boolean Logic](02.06-Boolean-Arrays-and-Masks.ipynb) | [Contents](Index.ipynb) | [Sorting Arrays](02.08-Sorting.ipynb) >"
+    "< [Comparisons, Masks, and Boolean Logic](02.06-Boolean-Arrays-and-Masks.ipynb) | [Contents](Index.ipynb) | [Sorting Arrays](02.08-Sorting.ipynb) >\n",
+    "\n",
+    "\"Open\n"
    ]
   }
  ],
diff --git a/notebooks/02.08-Sorting.ipynb b/notebooks/02.08-Sorting.ipynb
index 1a82f745a..8be3373c0 100644
--- a/notebooks/02.08-Sorting.ipynb
+++ b/notebooks/02.08-Sorting.ipynb
@@ -6,6 +6,7 @@
    "source": [
     "\n",
     "\n",
+    "\n",
     "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n",
     "\n",
     "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*"
@@ -16,7 +17,9 @@
    "metadata": {},
    "source": [
     "\n",
-    "< [Fancy Indexing](02.07-Fancy-Indexing.ipynb) | [Contents](Index.ipynb) | [Structured Data: NumPy's Structured Arrays](02.09-Structured-Data-NumPy.ipynb) >"
+    "< [Fancy Indexing](02.07-Fancy-Indexing.ipynb) | [Contents](Index.ipynb) | [Structured Data: NumPy's Structured Arrays](02.09-Structured-Data-NumPy.ipynb) >\n",
+    "\n",
+    "\"Open\n"
    ]
   },
   {
@@ -752,7 +755,9 @@
    "metadata": {},
    "source": [
     "\n",
-    "< [Fancy Indexing](02.07-Fancy-Indexing.ipynb) | [Contents](Index.ipynb) | [Structured Data: NumPy's Structured Arrays](02.09-Structured-Data-NumPy.ipynb) >"
+    "< [Fancy Indexing](02.07-Fancy-Indexing.ipynb) | [Contents](Index.ipynb) | [Structured Data: NumPy's Structured Arrays](02.09-Structured-Data-NumPy.ipynb) >\n",
+    "\n",
+    "\"Open\n"
    ]
   }
  ],
diff --git a/notebooks/02.09-Structured-Data-NumPy.ipynb b/notebooks/02.09-Structured-Data-NumPy.ipynb
index 7b1f5222a..ea4ee0bec 100644
--- a/notebooks/02.09-Structured-Data-NumPy.ipynb
+++ b/notebooks/02.09-Structured-Data-NumPy.ipynb
@@ -6,6 +6,7 @@
    "source": [
     "\n",
     "\n",
+    "\n",
     "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n",
     "\n",
     "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Sorting Arrays](02.08-Sorting.ipynb) | [Contents](Index.ipynb) | [Data Manipulation with Pandas](03.00-Introduction-to-Pandas.ipynb) >" + "< [Sorting Arrays](02.08-Sorting.ipynb) | [Contents](Index.ipynb) | [Data Manipulation with Pandas](03.00-Introduction-to-Pandas.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -569,7 +572,9 @@ "metadata": {}, "source": [ "\n", - "< [Sorting Arrays](02.08-Sorting.ipynb) | [Contents](Index.ipynb) | [Data Manipulation with Pandas](03.00-Introduction-to-Pandas.ipynb) >" + "< [Sorting Arrays](02.08-Sorting.ipynb) | [Contents](Index.ipynb) | [Data Manipulation with Pandas](03.00-Introduction-to-Pandas.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/03.00-Introduction-to-Pandas.ipynb b/notebooks/03.00-Introduction-to-Pandas.ipynb index 644a1bad0..9a5487ae9 100644 --- a/notebooks/03.00-Introduction-to-Pandas.ipynb +++ b/notebooks/03.00-Introduction-to-Pandas.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Structured Data: NumPy's Structured Arrays](02.09-Structured-Data-NumPy.ipynb) | [Contents](Index.ipynb) | [Introducing Pandas Objects](03.01-Introducing-Pandas-Objects.ipynb) >" + "< [Structured Data: NumPy's Structured Arrays](02.09-Structured-Data-NumPy.ipynb) | [Contents](Index.ipynb) | [Introducing Pandas Objects](03.01-Introducing-Pandas-Objects.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -133,7 +136,9 @@ "metadata": {}, "source": [ "\n", - "< [Structured Data: NumPy's Structured Arrays](02.09-Structured-Data-NumPy.ipynb) | [Contents](Index.ipynb) | [Introducing Pandas Objects](03.01-Introducing-Pandas-Objects.ipynb) >" + "< [Structured Data: NumPy's Structured Arrays](02.09-Structured-Data-NumPy.ipynb) | [Contents](Index.ipynb) | [Introducing Pandas Objects](03.01-Introducing-Pandas-Objects.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/03.01-Introducing-Pandas-Objects.ipynb b/notebooks/03.01-Introducing-Pandas-Objects.ipynb index bdee556f5..2e5f8f7b3 100644 --- a/notebooks/03.01-Introducing-Pandas-Objects.ipynb +++ b/notebooks/03.01-Introducing-Pandas-Objects.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Data Manipulation with Pandas](03.00-Introduction-to-Pandas.ipynb) | [Contents](Index.ipynb) | [Data Indexing and Selection](03.02-Data-Indexing-and-Selection.ipynb) >" + "< [Data Manipulation with Pandas](03.00-Introduction-to-Pandas.ipynb) | [Contents](Index.ipynb) | [Data Indexing and Selection](03.02-Data-Indexing-and-Selection.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -1529,7 +1532,9 @@ "metadata": {}, "source": [ "\n", - "< [Data Manipulation with Pandas](03.00-Introduction-to-Pandas.ipynb) | [Contents](Index.ipynb) | [Data Indexing and Selection](03.02-Data-Indexing-and-Selection.ipynb) >" + "< [Data Manipulation with Pandas](03.00-Introduction-to-Pandas.ipynb) | [Contents](Index.ipynb) | [Data Indexing and Selection](03.02-Data-Indexing-and-Selection.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/03.02-Data-Indexing-and-Selection.ipynb b/notebooks/03.02-Data-Indexing-and-Selection.ipynb index 7707be226..9cce1353f 100644 --- a/notebooks/03.02-Data-Indexing-and-Selection.ipynb +++ b/notebooks/03.02-Data-Indexing-and-Selection.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Introducing Pandas Objects](03.01-Introducing-Pandas-Objects.ipynb) | [Contents](Index.ipynb) | [Operating on Data in Pandas](03.03-Operations-in-Pandas.ipynb) >" + "< [Introducing Pandas Objects](03.01-Introducing-Pandas-Objects.ipynb) | [Contents](Index.ipynb) | [Operating on Data in Pandas](03.03-Operations-in-Pandas.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -1570,7 +1573,9 @@ "metadata": {}, "source": [ "\n", - "< [Introducing Pandas Objects](03.01-Introducing-Pandas-Objects.ipynb) | [Contents](Index.ipynb) | [Operating on Data in Pandas](03.03-Operations-in-Pandas.ipynb) >" + "< [Introducing Pandas Objects](03.01-Introducing-Pandas-Objects.ipynb) | [Contents](Index.ipynb) | [Operating on Data in Pandas](03.03-Operations-in-Pandas.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/03.03-Operations-in-Pandas.ipynb b/notebooks/03.03-Operations-in-Pandas.ipynb index ac4b1eb37..6206ac790 100644 --- a/notebooks/03.03-Operations-in-Pandas.ipynb +++ b/notebooks/03.03-Operations-in-Pandas.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Data Indexing and Selection](03.02-Data-Indexing-and-Selection.ipynb) | [Contents](Index.ipynb) | [Handling Missing Data](03.04-Missing-Values.ipynb) >" + "< [Data Indexing and Selection](03.02-Data-Indexing-and-Selection.ipynb) | [Contents](Index.ipynb) | [Handling Missing Data](03.04-Missing-Values.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -1004,7 +1007,9 @@ "metadata": {}, "source": [ "\n", - "< [Data Indexing and Selection](03.02-Data-Indexing-and-Selection.ipynb) | [Contents](Index.ipynb) | [Handling Missing Data](03.04-Missing-Values.ipynb) >" + "< [Data Indexing and Selection](03.02-Data-Indexing-and-Selection.ipynb) | [Contents](Index.ipynb) | [Handling Missing Data](03.04-Missing-Values.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/03.04-Missing-Values.ipynb b/notebooks/03.04-Missing-Values.ipynb index 6cbef56f9..180ca09e7 100644 --- a/notebooks/03.04-Missing-Values.ipynb +++ b/notebooks/03.04-Missing-Values.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Operating on Data in Pandas](03.03-Operations-in-Pandas.ipynb) | [Contents](Index.ipynb) | [Hierarchical Indexing](03.05-Hierarchical-Indexing.ipynb) >" + "< [Operating on Data in Pandas](03.03-Operations-in-Pandas.ipynb) | [Contents](Index.ipynb) | [Hierarchical Indexing](03.05-Hierarchical-Indexing.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -1265,7 +1268,9 @@ "metadata": {}, "source": [ "\n", - "< [Operating on Data in Pandas](03.03-Operations-in-Pandas.ipynb) | [Contents](Index.ipynb) | [Hierarchical Indexing](03.05-Hierarchical-Indexing.ipynb) >" + "< [Operating on Data in Pandas](03.03-Operations-in-Pandas.ipynb) | [Contents](Index.ipynb) | [Hierarchical Indexing](03.05-Hierarchical-Indexing.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/03.05-Hierarchical-Indexing.ipynb b/notebooks/03.05-Hierarchical-Indexing.ipynb index 95de26d26..1122989bb 100644 --- a/notebooks/03.05-Hierarchical-Indexing.ipynb +++ b/notebooks/03.05-Hierarchical-Indexing.ipynb @@ -2,10 +2,14 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -13,18 +17,31 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", - "< [Handling Missing Data](03.04-Missing-Values.ipynb) | [Contents](Index.ipynb) | [Combining Datasets: Concat and Append](03.06-Concat-And-Append.ipynb) >" + "< [Handling Missing Data](03.04-Missing-Values.ipynb) | [Contents](Index.ipynb) | [Combining Datasets: Concat and Append](03.06-Concat-And-Append.ipynb) >\n", + "\n", + "\"Open\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Hierarchical Indexing\n", - "\n", + "# Hierarchical Indexing" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ "Up to this point we've been focused primarily on one-dimensional and two-dimensional data, stored in Pandas ``Series`` and ``DataFrame`` objects, respectively.\n", "Often it is useful to go beyond this and store higher-dimensional data–that is, data indexed by more than one or two keys.\n", "While Pandas does provide ``Panel`` and ``Panel4D`` objects that natively handle three-dimensional and four-dimensional data (see [Aside: Panel Data](#Aside:-Panel-Data)), a far more common pattern in practice is to make use of *hierarchical indexing* (also known as *multi-indexing*) to incorporate multiple index *levels* within a single index.\n", @@ -39,7 +56,9 @@ "cell_type": "code", "execution_count": 1, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -49,7 +68,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## A Multiply Indexed Series\n", "\n", @@ -59,7 +81,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### The bad way\n", "\n", @@ -71,7 +96,9 @@ "cell_type": "code", "execution_count": 2, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -104,7 +131,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "With this indexing scheme, you can straightforwardly index or slice the series based on this multiple index:" ] @@ -113,7 +143,9 @@ "cell_type": "code", "execution_count": 3, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -137,7 +169,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "But the convenience ends there. For example, if you need to select all values from 2010, you'll need to do some messy (and potentially slow) munging to make it happen:" ] @@ -146,7 +181,9 @@ "cell_type": "code", "execution_count": 4, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -169,14 +206,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "This produces the desired result, but is not as clean (or as efficient for large datasets) as the slicing syntax we've grown to love in Pandas." 
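(The sources of the code cells in this example fall outside the diff context; a sketch of the tuple-keyed pattern just described, with illustrative state/year population figures, would look like this.)

```python
import pandas as pd

# A Series keyed by (state, year) tuples: the "bad way"
index = [('California', 2000), ('California', 2010),
         ('New York', 2000), ('New York', 2010),
         ('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956, 18976457,
               19378102, 20851820, 25145561]
pop = pd.Series(populations, index=index)

# Indexing and slicing on the tuple keys is straightforward...
pop[('California', 2010):('Texas', 2000)]

# ...but selecting everything from 2010 needs messy, potentially slow munging:
pop[[i for i in pop.index if i[1] == 2010]]
```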
] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### The Better Way: Pandas MultiIndex\n", "Fortunately, Pandas provides a better way.\n", @@ -188,7 +231,9 @@ "cell_type": "code", "execution_count": 5, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -210,7 +255,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Notice that the ``MultiIndex`` contains multiple *levels* of indexing–in this case, the state names and the years, as well as multiple *labels* for each data point which encode these levels.\n", "\n", @@ -221,7 +269,9 @@ "cell_type": "code", "execution_count": 6, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -248,7 +298,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Here the first two columns of the ``Series`` representation show the multiple index values, while the third column shows the data.\n", "Notice that some entries are missing in the first column: in this multi-index representation, any blank entry indicates the same value as the line above it." @@ -256,7 +309,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Now to access all data for which the second index is 2010, we can simply use the Pandas slicing notation:" ] @@ -265,7 +321,9 @@ "cell_type": "code", "execution_count": 7, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -288,7 +346,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The result is a singly indexed array with just the keys we're interested in.\n", "This syntax is much more convenient (and the operation is much more efficient!) 
than the home-spun tuple-based multi-indexing solution that we started with.\n",
@@ -297,7 +358,10 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
    "source": [
     "### MultiIndex as extra dimension\n",
     "\n",
@@ -309,7 +373,9 @@
    "cell_type": "code",
    "execution_count": 8,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
    },
    "outputs": [
     {
@@ -363,7 +429,10 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
    "source": [
     "Naturally, the ``stack()`` method provides the opposite operation:"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 9,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
    },
    "outputs": [
     {
@@ -398,7 +469,10 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
    "source": [
     "Seeing this, you might wonder why we would bother with hierarchical indexing at all.\n",
     "The reason is simple: just as we were able to use multi-indexing to represent two-dimensional data within a one-dimensional ``Series``, we can also use it to represent data of three or more dimensions in a ``Series`` or ``DataFrame``.\n",
@@ -409,7 +483,9 @@
    "cell_type": "code",
    "execution_count": 10,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
    },
    "outputs": [
     {
@@ -488,7 +564,10 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
    "source": [
     "In addition, all the ufuncs and other functionality discussed in [Operating on Data in Pandas](03.03-Operations-in-Pandas.ipynb) work with hierarchical indices as well.\n",
     "Here we compute the fraction of people under 18 by year, given the above data:"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 11,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
    },
    "outputs": [
     {
@@ -552,14 +633,20 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
    "source": [
     "This allows us to easily and quickly manipulate and explore even high-dimensional data."
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
    "source": [
     "## Methods of MultiIndex Creation\n",
     "\n",
@@ -570,7 +657,9 @@
    "cell_type": "code",
    "execution_count": 12,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
    },
    "outputs": [
     {
@@ -635,7 +724,10 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
    "source": [
     "The work of creating the ``MultiIndex`` is done in the background.\n",
     "\n",
@@ -646,7 +738,9 @@
    "cell_type": "code",
    "execution_count": 13,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
    },
    "outputs": [
     {
@@ -678,14 +772,20 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
    "source": [
     "Nevertheless, it is sometimes useful to explicitly create a ``MultiIndex``; we'll see a couple of these methods here."
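(The two implicit-construction cells referenced above, execution counts 12 and 13, appear in this patch only as metadata changes; a sketch of both patterns, with assumed illustrative data, is given below.)

```python
import numpy as np
import pandas as pd

# Passing a list of two or more index arrays: the MultiIndex is built in the background
df = pd.DataFrame(np.random.rand(4, 2),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=['data1', 'data2'])

# Passing a dictionary with appropriate tuples as keys is recognized the same way:
data = {('California', 2000): 33871648,
        ('California', 2010): 37253956,
        ('Texas', 2000): 20851820,
        ('Texas', 2010): 25145561}
pd.Series(data)
```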
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
    "source": [
     "### Explicit MultiIndex constructors\n",
     "\n",
@@ -697,7 +797,9 @@
    "cell_type": "code",
    "execution_count": 14,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
    },
    "outputs": [
     {
@@ -718,7 +820,10 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
    "source": [
     "You can construct it from a list of tuples giving the multiple index values of each point:"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 15,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
    },
    "outputs": [
     {
@@ -748,7 +855,10 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
    "source": [
     "You can even construct it from a Cartesian product of single indices:"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 16,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
    },
    "outputs": [
     {
@@ -778,7 +890,10 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
    "source": [
     "Similarly, you can construct the ``MultiIndex`` directly using its internal encoding by passing ``levels`` (a list of lists containing available index values for each level) and ``labels`` (a list of lists that reference these levels):"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 17,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
    },
    "outputs": [
     {
@@ -809,14 +926,20 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
    "source": [
     "Any of these objects can be passed as the ``index`` argument when creating a ``Series`` or ``DataFrame``, or be passed to the ``reindex`` method of an existing ``Series`` or ``DataFrame``."
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
    "source": [
     "### MultiIndex level names\n",
     "\n",
@@ -828,7 +951,9 @@
    "cell_type": "code",
    "execution_count": 18,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
    },
    "outputs": [
     {
@@ -856,14 +981,20 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
    "source": [
     "With more involved datasets, this can be a useful way to keep track of the meaning of various index values."
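(The cell demonstrating level names, execution count 18, is likewise reduced to a metadata hunk here; a self-contained sketch of the two naming routes follows, reusing the assumed state/year data.)

```python
import pandas as pd

index = pd.MultiIndex.from_product([['California', 'Texas'], [2000, 2010]])
pop = pd.Series([33871648, 37253956, 20851820, 25145561], index=index)

# Names can be attached to an existing index after the fact...
pop.index.names = ['state', 'year']

# ...or passed directly to any of the constructors shown above:
pd.MultiIndex.from_product([['California', 'Texas'], [2000, 2010]],
                           names=['state', 'year'])
```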
] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### MultiIndex for columns\n", "\n", @@ -875,7 +1006,9 @@ "cell_type": "code", "execution_count": 19, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -989,7 +1122,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Here we see where the multi-indexing for both rows and columns can come in *very* handy.\n", "This is fundamentally four-dimensional data, where the dimensions are the subject, the measurement type, the year, and the visit number.\n", @@ -1000,7 +1136,9 @@ "cell_type": "code", "execution_count": 20, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1069,14 +1207,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "For complicated records containing multiple labeled measurements across multiple times for many subjects (people, countries, cities, etc.) use of hierarchical rows and columns can be extremely convenient!" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Indexing and Slicing a MultiIndex\n", "\n", @@ -1086,7 +1230,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Multiply indexed Series\n", "\n", @@ -1097,7 +1244,9 @@ "cell_type": "code", "execution_count": 21, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1124,7 +1273,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "We can access single elements by indexing with multiple terms:" ] @@ -1133,7 +1285,9 @@ "cell_type": "code", "execution_count": 22, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1153,7 +1307,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The ``MultiIndex`` also supports *partial indexing*, or indexing just one of the levels in the index.\n", "The result is another ``Series``, with the lower-level indices maintained:" @@ -1163,7 +1320,9 @@ "cell_type": "code", "execution_count": 23, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1186,16 +1345,21 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ - "Partial slicing is available as well, as long as the ``MultiIndex`` is sorted (see discussion in [Sorted and Unsorted Indices](Sorted-and-Unsorted-Indices)):" + "Partial slicing is available as well, as long as the ``MultiIndex`` is sorted (see discussion in [Sorted and Unsorted Indices](#Sorted-and-unsorted-indices)):" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1220,7 +1384,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "With sorted indices, partial indexing can be performed on lower levels by passing an 
empty slice in the first index:" ] @@ -1229,7 +1396,9 @@ "cell_type": "code", "execution_count": 25, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1253,7 +1422,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Other types of indexing and selection (discussed in [Data Indexing and Selection](03.02-Data-Indexing-and-Selection.ipynb)) work as well; for example, selection based on Boolean masks:" ] @@ -1262,7 +1434,9 @@ "cell_type": "code", "execution_count": 26, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1286,7 +1460,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Selection based on fancy indexing also works:" ] @@ -1295,7 +1472,9 @@ "cell_type": "code", "execution_count": 27, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1320,7 +1499,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Multiply indexed DataFrames\n", "\n", @@ -1332,7 +1514,9 @@ "cell_type": "code", "execution_count": 28, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1433,7 +1617,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Remember that columns are primary in a ``DataFrame``, and the syntax used for multiply indexed ``Series`` applies to the columns.\n", "For example, we can recover Guido's heart rate data with a simple operation:" @@ -1443,7 +1630,9 @@ "cell_type": "code", "execution_count": 29, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1468,7 +1657,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Also, as with the single-index case, we can use the ``loc``, ``iloc``, and ``ix`` indexers introduced in [Data Indexing and Selection](03.02-Data-Indexing-and-Selection.ipynb). For example:" ] @@ -1477,7 +1669,9 @@ "cell_type": "code", "execution_count": 30, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1539,7 +1733,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "These indexers provide an array-like view of the underlying two-dimensional data, but each individual index in ``loc`` or ``iloc`` can be passed a tuple of multiple indices. 
For example:" ] @@ -1548,7 +1745,9 @@ "cell_type": "code", "execution_count": 31, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1573,7 +1772,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Working with slices within these index tuples is not especially convenient; trying to create a slice within a tuple will lead to a syntax error:" ] @@ -1582,7 +1784,9 @@ "cell_type": "code", "execution_count": 32, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1600,7 +1804,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "You could get around this by building the desired slice explicitly using Python's built-in ``slice()`` function, but a better way in this context is to use an ``IndexSlice`` object, which Pandas provides for precisely this situation.\n", "For example:" @@ -1610,7 +1817,9 @@ "cell_type": "code", "execution_count": 33, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1680,14 +1889,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "There are so many ways to interact with data in multiply indexed ``Series`` and ``DataFrame``s, and as with many tools in this book the best way to become familiar with them is to try them out!" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Rearranging Multi-Indices\n", "\n", @@ -1698,7 +1913,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Sorted and unsorted indices\n", "\n", @@ -1713,7 +1931,9 @@ "cell_type": "code", "execution_count": 34, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1743,7 +1963,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "If we try to take a partial slice of this index, it will result in an error:" ] @@ -1752,7 +1975,9 @@ "cell_type": "code", "execution_count": 35, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1774,7 +1999,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Although it is not entirely clear from the error message, this is the result of the MultiIndex not being sorted.\n", "For various reasons, partial slices and other similar operations require the levels in the ``MultiIndex`` to be in sorted (i.e., lexographical) order.\n", @@ -1786,7 +2014,9 @@ "cell_type": "code", "execution_count": 36, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1814,7 +2044,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "With the index sorted in this way, partial slicing will work as expected:" ] @@ -1823,7 +2056,9 @@ "cell_type": "code", "execution_count": 37, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1848,7 +2083,10 @@ 
  },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
    "source": [
     "### Stacking and unstacking indices\n",
     "\n",
@@ -1859,7 +2097,9 @@
    "cell_type": "code",
    "execution_count": 38,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
    },
    "outputs": [
     {
@@ -1918,7 +2158,9 @@
    "cell_type": "code",
    "execution_count": 39,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
    },
    "outputs": [
     {
@@ -1977,7 +2219,10 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
    "source": [
     "The opposite of ``unstack()`` is ``stack()``, which here can be used to recover the original series:"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 40,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
    },
    "outputs": [
     {
@@ -2013,7 +2260,10 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
    "source": [
     "### Index setting and resetting\n",
     "\n",
@@ -2026,7 +2276,9 @@
    "cell_type": "code",
    "execution_count": 41,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
    },
    "outputs": [
     {
@@ -2105,7 +2357,10 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
    "source": [
     "Often when working with data in the real world, the raw input data looks like this, and it's useful to build a ``MultiIndex`` from the column values.\n",
     "This can be done with the ``set_index`` method of the ``DataFrame``, which returns a multiply indexed ``DataFrame``:"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 42,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
    },
    "outputs": [
     {
@@ -2189,14 +2446,20 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
    "source": [
     "In practice, I find this type of reindexing to be one of the more useful patterns when encountering real-world datasets."
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
    "source": [
     "## Data Aggregations on Multi-Indices\n",
     "\n",
@@ -2210,7 +2473,9 @@
    "cell_type": "code",
    "execution_count": 43,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
    },
    "outputs": [
     {
@@ -2311,7 +2576,10 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
    "source": [
     "Perhaps we'd like to average out the measurements in the two visits each year. 
We can do this by naming the index level we'd like to explore, in this case the year:"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 44,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
    },
    "outputs": [
     {
@@ -2397,7 +2667,10 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
    "source": [
     "By further making use of the ``axis`` keyword, we can take the mean among levels on the columns as well:"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 45,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
    },
    "outputs": [
     {
@@ -2459,7 +2734,10 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
    "source": [
     "Thus in two lines, we've been able to find the average heart rate and temperature measured among all subjects in all visits each year.\n",
     "This syntax is actually a shortcut to the ``GroupBy`` functionality, which we will discuss in [Aggregation and Grouping](03.08-Aggregation-and-Grouping.ipynb).\n",
@@ -2468,7 +2746,10 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
    "source": [
     "## Aside: Panel Data\n",
     "\n",
@@ -2486,10 +2767,15 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
    "source": [
     "\n",
-    "< [Handling Missing Data](03.04-Missing-Values.ipynb) | [Contents](Index.ipynb) | [Combining Datasets: Concat and Append](03.06-Concat-And-Append.ipynb) >"
+    "< [Handling Missing Data](03.04-Missing-Values.ipynb) | [Contents](Index.ipynb) | [Combining Datasets: Concat and Append](03.06-Concat-And-Append.ipynb) >\n",
+    "\n",
+    "\"Open\n"
    ]
   }
  ],
diff --git a/notebooks/03.06-Concat-And-Append.ipynb b/notebooks/03.06-Concat-And-Append.ipynb
index 93e0aa729..7566c851c 100644
--- a/notebooks/03.06-Concat-And-Append.ipynb
+++ b/notebooks/03.06-Concat-And-Append.ipynb
@@ -6,6 +6,7 @@
    "source": [
     "\n",
     "\n",
+    "\n",
     "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n",
     "\n",
     "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Hierarchical Indexing](03.05-Hierarchical-Indexing.ipynb) | [Contents](Index.ipynb) | [Combining Datasets: Merge and Join](03.07-Merge-and-Join.ipynb) >" + "< [Hierarchical Indexing](03.05-Hierarchical-Indexing.ipynb) | [Contents](Index.ipynb) | [Combining Datasets: Merge and Join](03.07-Merge-and-Join.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -1606,7 +1609,9 @@ "metadata": {}, "source": [ "\n", - "< [Hierarchical Indexing](03.05-Hierarchical-Indexing.ipynb) | [Contents](Index.ipynb) | [Combining Datasets: Merge and Join](03.07-Merge-and-Join.ipynb) >" + "< [Hierarchical Indexing](03.05-Hierarchical-Indexing.ipynb) | [Contents](Index.ipynb) | [Combining Datasets: Merge and Join](03.07-Merge-and-Join.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/03.07-Merge-and-Join.ipynb b/notebooks/03.07-Merge-and-Join.ipynb index aa91fa060..c46383e57 100644 --- a/notebooks/03.07-Merge-and-Join.ipynb +++ b/notebooks/03.07-Merge-and-Join.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Combining Datasets: Concat and Append](03.06-Concat-And-Append.ipynb) | [Contents](Index.ipynb) | [Aggregation and Grouping](03.08-Aggregation-and-Grouping.ipynb) >" + "< [Combining Datasets: Concat and Append](03.06-Concat-And-Append.ipynb) | [Contents](Index.ipynb) | [Aggregation and Grouping](03.08-Aggregation-and-Grouping.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -3539,7 +3542,9 @@ "metadata": {}, "source": [ "\n", - "< [Combining Datasets: Concat and Append](03.06-Concat-And-Append.ipynb) | [Contents](Index.ipynb) | [Aggregation and Grouping](03.08-Aggregation-and-Grouping.ipynb) >" + "< [Combining Datasets: Concat and Append](03.06-Concat-And-Append.ipynb) | [Contents](Index.ipynb) | [Aggregation and Grouping](03.08-Aggregation-and-Grouping.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/03.08-Aggregation-and-Grouping.ipynb b/notebooks/03.08-Aggregation-and-Grouping.ipynb index b30bcab93..be00723d1 100644 --- a/notebooks/03.08-Aggregation-and-Grouping.ipynb +++ b/notebooks/03.08-Aggregation-and-Grouping.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Combining Datasets: Merge and Join](03.07-Merge-and-Join.ipynb) | [Contents](Index.ipynb) | [Pivot Tables](03.09-Pivot-Tables.ipynb) >" + "< [Combining Datasets: Merge and Join](03.07-Merge-and-Join.ipynb) | [Contents](Index.ipynb) | [Pivot Tables](03.09-Pivot-Tables.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -2626,7 +2629,9 @@ "metadata": {}, "source": [ "\n", - "< [Combining Datasets: Merge and Join](03.07-Merge-and-Join.ipynb) | [Contents](Index.ipynb) | [Pivot Tables](03.09-Pivot-Tables.ipynb) >" + "< [Combining Datasets: Merge and Join](03.07-Merge-and-Join.ipynb) | [Contents](Index.ipynb) | [Pivot Tables](03.09-Pivot-Tables.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/03.09-Pivot-Tables.ipynb b/notebooks/03.09-Pivot-Tables.ipynb index 2710ff8f2..717549875 100644 --- a/notebooks/03.09-Pivot-Tables.ipynb +++ b/notebooks/03.09-Pivot-Tables.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Aggregation and Grouping](03.08-Aggregation-and-Grouping.ipynb) | [Contents](Index.ipynb) | [Vectorized String Operations](03.10-Working-With-Strings.ipynb) >" + "< [Aggregation and Grouping](03.08-Aggregation-and-Grouping.ipynb) | [Contents](Index.ipynb) | [Vectorized String Operations](03.10-Working-With-Strings.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -1345,7 +1348,9 @@ "metadata": {}, "source": [ "\n", - "< [Aggregation and Grouping](03.08-Aggregation-and-Grouping.ipynb) | [Contents](Index.ipynb) | [Vectorized String Operations](03.10-Working-With-Strings.ipynb) >" + "< [Aggregation and Grouping](03.08-Aggregation-and-Grouping.ipynb) | [Contents](Index.ipynb) | [Vectorized String Operations](03.10-Working-With-Strings.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/03.10-Working-With-Strings.ipynb b/notebooks/03.10-Working-With-Strings.ipynb index 1c64eff95..75c004b84 100644 --- a/notebooks/03.10-Working-With-Strings.ipynb +++ b/notebooks/03.10-Working-With-Strings.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Pivot Tables](03.09-Pivot-Tables.ipynb) | [Contents](Index.ipynb) | [Working with Time Series](03.11-Working-with-Time-Series.ipynb) >" + "< [Pivot Tables](03.09-Pivot-Tables.ipynb) | [Contents](Index.ipynb) | [Working with Time Series](03.11-Working-with-Time-Series.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -1373,7 +1376,9 @@ "metadata": {}, "source": [ "\n", - "< [Pivot Tables](03.09-Pivot-Tables.ipynb) | [Contents](Index.ipynb) | [Working with Time Series](03.11-Working-with-Time-Series.ipynb) >" + "< [Pivot Tables](03.09-Pivot-Tables.ipynb) | [Contents](Index.ipynb) | [Working with Time Series](03.11-Working-with-Time-Series.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/03.11-Working-with-Time-Series.ipynb b/notebooks/03.11-Working-with-Time-Series.ipynb index 7fb30aa85..c9b4d828b 100644 --- a/notebooks/03.11-Working-with-Time-Series.ipynb +++ b/notebooks/03.11-Working-with-Time-Series.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Vectorized String Operations](03.10-Working-With-Strings.ipynb) | [Contents](Index.ipynb) | [High-Performance Pandas: eval() and query()](03.12-Performance-Eval-and-Query.ipynb) >" + "< [Vectorized String Operations](03.10-Working-With-Strings.ipynb) | [Contents](Index.ipynb) | [High-Performance Pandas: eval() and query()](03.12-Performance-Eval-and-Query.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -1926,7 +1929,9 @@ "metadata": {}, "source": [ "\n", - "< [Vectorized String Operations](03.10-Working-With-Strings.ipynb) | [Contents](Index.ipynb) | [High-Performance Pandas: eval() and query()](03.12-Performance-Eval-and-Query.ipynb) >" + "< [Vectorized String Operations](03.10-Working-With-Strings.ipynb) | [Contents](Index.ipynb) | [High-Performance Pandas: eval() and query()](03.12-Performance-Eval-and-Query.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/03.12-Performance-Eval-and-Query.ipynb b/notebooks/03.12-Performance-Eval-and-Query.ipynb index 6aecf3ab2..b6e2a142b 100644 --- a/notebooks/03.12-Performance-Eval-and-Query.ipynb +++ b/notebooks/03.12-Performance-Eval-and-Query.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Working with Time Series](03.11-Working-with-Time-Series.ipynb) | [Contents](Index.ipynb) | [Further Resources](03.13-Further-Resources.ipynb) >" + "< [Working with Time Series](03.11-Working-with-Time-Series.ipynb) | [Contents](Index.ipynb) | [Further Resources](03.13-Further-Resources.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -1116,7 +1119,9 @@ "metadata": {}, "source": [ "\n", - "< [Working with Time Series](03.11-Working-with-Time-Series.ipynb) | [Contents](Index.ipynb) | [Further Resources](03.13-Further-Resources.ipynb) >" + "< [Working with Time Series](03.11-Working-with-Time-Series.ipynb) | [Contents](Index.ipynb) | [Further Resources](03.13-Further-Resources.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/03.13-Further-Resources.ipynb b/notebooks/03.13-Further-Resources.ipynb index e9c47e717..16c8a8ebd 100644 --- a/notebooks/03.13-Further-Resources.ipynb +++ b/notebooks/03.13-Further-Resources.ipynb @@ -2,10 +2,14 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -13,18 +17,31 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", - "< [High-Performance Pandas: eval() and query()](03.12-Performance-Eval-and-Query.ipynb) | [Contents](Index.ipynb) | [Visualization with Matplotlib](04.00-Introduction-To-Matplotlib.ipynb) >" + "< [High-Performance Pandas: eval() and query()](03.12-Performance-Eval-and-Query.ipynb) | [Contents](Index.ipynb) | [Visualization with Matplotlib](04.00-Introduction-To-Matplotlib.ipynb) >\n", + "\n", + "\"Open\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Further Resources\n", - "\n", + "# Further Resources" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ "In this chapter, we've covered many of the basics of using Pandas effectively for data analysis.\n", "Still, much has been omitted from our discussion.\n", "To learn more about Pandas, I recommend the following resources:\n", @@ -42,10 +59,15 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", - "< [High-Performance Pandas: eval() and query()](03.12-Performance-Eval-and-Query.ipynb) | [Contents](Index.ipynb) | [Visualization with Matplotlib](04.00-Introduction-To-Matplotlib.ipynb) >" + "< [High-Performance Pandas: eval() and query()](03.12-Performance-Eval-and-Query.ipynb) | [Contents](Index.ipynb) | [Visualization with Matplotlib](04.00-Introduction-To-Matplotlib.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/04.00-Introduction-To-Matplotlib.ipynb 
b/notebooks/04.00-Introduction-To-Matplotlib.ipynb index 7c19c7da5..ebf07e3bd 100644 --- a/notebooks/04.00-Introduction-To-Matplotlib.ipynb +++ b/notebooks/04.00-Introduction-To-Matplotlib.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Further Resources](03.13-Further-Resources.ipynb) | [Contents](Index.ipynb) | [Simple Line Plots](04.01-Simple-Line-Plots.ipynb) >" + "< [Further Resources](03.13-Further-Resources.ipynb) | [Contents](Index.ipynb) | [Simple Line Plots](04.01-Simple-Line-Plots.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -498,7 +501,9 @@ "metadata": {}, "source": [ "\n", - "< [Further Resources](03.13-Further-Resources.ipynb) | [Contents](Index.ipynb) | [Simple Line Plots](04.01-Simple-Line-Plots.ipynb) >" + "< [Further Resources](03.13-Further-Resources.ipynb) | [Contents](Index.ipynb) | [Simple Line Plots](04.01-Simple-Line-Plots.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/04.01-Simple-Line-Plots.ipynb b/notebooks/04.01-Simple-Line-Plots.ipynb index e32b61cc7..03acda4e3 100644 --- a/notebooks/04.01-Simple-Line-Plots.ipynb +++ b/notebooks/04.01-Simple-Line-Plots.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Visualization with Matplotlib](04.00-Introduction-To-Matplotlib.ipynb) | [Contents](Index.ipynb) | [Simple Scatter Plots](04.02-Simple-Scatter-Plots.ipynb) >" + "< [Visualization with Matplotlib](04.00-Introduction-To-Matplotlib.ipynb) | [Contents](Index.ipynb) | [Simple Scatter Plots](04.02-Simple-Scatter-Plots.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -613,7 +616,9 @@ "metadata": {}, "source": [ "\n", - "< [Visualization with Matplotlib](04.00-Introduction-To-Matplotlib.ipynb) | [Contents](Index.ipynb) | [Simple Scatter Plots](04.02-Simple-Scatter-Plots.ipynb) >" + "< [Visualization with Matplotlib](04.00-Introduction-To-Matplotlib.ipynb) | [Contents](Index.ipynb) | [Simple Scatter Plots](04.02-Simple-Scatter-Plots.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/04.02-Simple-Scatter-Plots.ipynb b/notebooks/04.02-Simple-Scatter-Plots.ipynb index de850cc00..eaf6c4249 100644 --- a/notebooks/04.02-Simple-Scatter-Plots.ipynb +++ b/notebooks/04.02-Simple-Scatter-Plots.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Simple Line Plots](04.01-Simple-Line-Plots.ipynb) | [Contents](Index.ipynb) | [Visualizing Errors](04.03-Errorbars.ipynb) >" + "< [Simple Line Plots](04.01-Simple-Line-Plots.ipynb) | [Contents](Index.ipynb) | [Visualizing Errors](04.03-Errorbars.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -325,7 +328,9 @@ "metadata": {}, "source": [ "\n", - "< [Simple Line Plots](04.01-Simple-Line-Plots.ipynb) | [Contents](Index.ipynb) | [Visualizing Errors](04.03-Errorbars.ipynb) >" + "< [Simple Line Plots](04.01-Simple-Line-Plots.ipynb) | [Contents](Index.ipynb) | [Visualizing Errors](04.03-Errorbars.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/04.03-Errorbars.ipynb b/notebooks/04.03-Errorbars.ipynb index c2d475d20..094ae9c89 100644 --- a/notebooks/04.03-Errorbars.ipynb +++ b/notebooks/04.03-Errorbars.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Simple Scatter Plots](04.02-Simple-Scatter-Plots.ipynb) | [Contents](Index.ipynb) | [Density and Contour Plots](04.04-Density-and-Contour-Plots.ipynb) >" + "< [Simple Scatter Plots](04.02-Simple-Scatter-Plots.ipynb) | [Contents](Index.ipynb) | [Density and Contour Plots](04.04-Density-and-Contour-Plots.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -225,7 +228,9 @@ "metadata": {}, "source": [ "\n", - "< [Simple Scatter Plots](04.02-Simple-Scatter-Plots.ipynb) | [Contents](Index.ipynb) | [Density and Contour Plots](04.04-Density-and-Contour-Plots.ipynb) >" + "< [Simple Scatter Plots](04.02-Simple-Scatter-Plots.ipynb) | [Contents](Index.ipynb) | [Density and Contour Plots](04.04-Density-and-Contour-Plots.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/04.04-Density-and-Contour-Plots.ipynb b/notebooks/04.04-Density-and-Contour-Plots.ipynb index 6c9d34c39..3fea071b4 100644 --- a/notebooks/04.04-Density-and-Contour-Plots.ipynb +++ b/notebooks/04.04-Density-and-Contour-Plots.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Visualizing Errors](04.03-Errorbars.ipynb) | [Contents](Index.ipynb) | [Histograms, Binnings, and Density](04.05-Histograms-and-Binnings.ipynb) >" + "< [Visualizing Errors](04.03-Errorbars.ipynb) | [Contents](Index.ipynb) | [Histograms, Binnings, and Density](04.05-Histograms-and-Binnings.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -297,7 +300,9 @@ "metadata": {}, "source": [ "\n", - "< [Visualizing Errors](04.03-Errorbars.ipynb) | [Contents](Index.ipynb) | [Histograms, Binnings, and Density](04.05-Histograms-and-Binnings.ipynb) >" + "< [Visualizing Errors](04.03-Errorbars.ipynb) | [Contents](Index.ipynb) | [Histograms, Binnings, and Density](04.05-Histograms-and-Binnings.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/04.05-Histograms-and-Binnings.ipynb b/notebooks/04.05-Histograms-and-Binnings.ipynb index 46a119963..1873ff7c5 100644 --- a/notebooks/04.05-Histograms-and-Binnings.ipynb +++ b/notebooks/04.05-Histograms-and-Binnings.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Density and Contour Plots](04.04-Density-and-Contour-Plots.ipynb) | [Contents](Index.ipynb) | [Customizing Plot Legends](04.06-Customizing-Legends.ipynb) >" + "< [Density and Contour Plots](04.04-Density-and-Contour-Plots.ipynb) | [Contents](Index.ipynb) | [Customizing Plot Legends](04.06-Customizing-Legends.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -362,7 +365,9 @@ "metadata": {}, "source": [ "\n", - "< [Density and Contour Plots](04.04-Density-and-Contour-Plots.ipynb) | [Contents](Index.ipynb) | [Customizing Plot Legends](04.06-Customizing-Legends.ipynb) >" + "< [Density and Contour Plots](04.04-Density-and-Contour-Plots.ipynb) | [Contents](Index.ipynb) | [Customizing Plot Legends](04.06-Customizing-Legends.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/04.06-Customizing-Legends.ipynb b/notebooks/04.06-Customizing-Legends.ipynb index c96ec240e..ada12a045 100644 --- a/notebooks/04.06-Customizing-Legends.ipynb +++ b/notebooks/04.06-Customizing-Legends.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Histograms, Binnings, and Density](04.05-Histograms-and-Binnings.ipynb) | [Contents](Index.ipynb) | [Customizing Colorbars](04.07-Customizing-Colorbars.ipynb) >" + "< [Histograms, Binnings, and Density](04.05-Histograms-and-Binnings.ipynb) | [Contents](Index.ipynb) | [Customizing Colorbars](04.07-Customizing-Colorbars.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -404,7 +407,9 @@ "metadata": {}, "source": [ "\n", - "< [Histograms, Binnings, and Density](04.05-Histograms-and-Binnings.ipynb) | [Contents](Index.ipynb) | [Customizing Colorbars](04.07-Customizing-Colorbars.ipynb) >" + "< [Histograms, Binnings, and Density](04.05-Histograms-and-Binnings.ipynb) | [Contents](Index.ipynb) | [Customizing Colorbars](04.07-Customizing-Colorbars.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/04.07-Customizing-Colorbars.ipynb b/notebooks/04.07-Customizing-Colorbars.ipynb index 42878c426..6620f4a49 100644 --- a/notebooks/04.07-Customizing-Colorbars.ipynb +++ b/notebooks/04.07-Customizing-Colorbars.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Customizing Plot Legends](04.06-Customizing-Legends.ipynb) | [Contents](Index.ipynb) | [Multiple Subplots](04.08-Multiple-Subplots.ipynb) >" + "< [Customizing Plot Legends](04.06-Customizing-Legends.ipynb) | [Contents](Index.ipynb) | [Multiple Subplots](04.08-Multiple-Subplots.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -538,7 +541,9 @@ "metadata": {}, "source": [ "\n", - "< [Customizing Plot Legends](04.06-Customizing-Legends.ipynb) | [Contents](Index.ipynb) | [Multiple Subplots](04.08-Multiple-Subplots.ipynb) >" + "< [Customizing Plot Legends](04.06-Customizing-Legends.ipynb) | [Contents](Index.ipynb) | [Multiple Subplots](04.08-Multiple-Subplots.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/04.08-Multiple-Subplots.ipynb b/notebooks/04.08-Multiple-Subplots.ipynb index 4052f9313..e06195cfc 100644 --- a/notebooks/04.08-Multiple-Subplots.ipynb +++ b/notebooks/04.08-Multiple-Subplots.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Customizing Colorbars](04.07-Customizing-Colorbars.ipynb) | [Contents](Index.ipynb) | [Text and Annotation](04.09-Text-and-Annotation.ipynb) >" + "< [Customizing Colorbars](04.07-Customizing-Colorbars.ipynb) | [Contents](Index.ipynb) | [Text and Annotation](04.09-Text-and-Annotation.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -405,7 +408,9 @@ "metadata": {}, "source": [ "\n", - "< [Customizing Colorbars](04.07-Customizing-Colorbars.ipynb) | [Contents](Index.ipynb) | [Text and Annotation](04.09-Text-and-Annotation.ipynb) >" + "< [Customizing Colorbars](04.07-Customizing-Colorbars.ipynb) | [Contents](Index.ipynb) | [Text and Annotation](04.09-Text-and-Annotation.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/04.09-Text-and-Annotation.ipynb b/notebooks/04.09-Text-and-Annotation.ipynb index f2a67ba94..621eeaed3 100644 --- a/notebooks/04.09-Text-and-Annotation.ipynb +++ b/notebooks/04.09-Text-and-Annotation.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Multiple Subplots](04.08-Multiple-Subplots.ipynb) | [Contents](Index.ipynb) | [Customizing Ticks](04.10-Customizing-Ticks.ipynb) >" + "< [Multiple Subplots](04.08-Multiple-Subplots.ipynb) | [Contents](Index.ipynb) | [Customizing Ticks](04.10-Customizing-Ticks.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -412,7 +415,9 @@ "metadata": {}, "source": [ "\n", - "< [Multiple Subplots](04.08-Multiple-Subplots.ipynb) | [Contents](Index.ipynb) | [Customizing Ticks](04.10-Customizing-Ticks.ipynb) >" + "< [Multiple Subplots](04.08-Multiple-Subplots.ipynb) | [Contents](Index.ipynb) | [Customizing Ticks](04.10-Customizing-Ticks.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/04.10-Customizing-Ticks.ipynb b/notebooks/04.10-Customizing-Ticks.ipynb index 2b8ac675e..b3b6a820c 100644 --- a/notebooks/04.10-Customizing-Ticks.ipynb +++ b/notebooks/04.10-Customizing-Ticks.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Text and Annotation](04.09-Text-and-Annotation.ipynb) | [Contents](Index.ipynb) | [Customizing Matplotlib: Configurations and Stylesheets](04.11-Settings-and-Stylesheets.ipynb) >" + "< [Text and Annotation](04.09-Text-and-Annotation.ipynb) | [Contents](Index.ipynb) | [Customizing Matplotlib: Configurations and Stylesheets](04.11-Settings-and-Stylesheets.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -474,7 +477,9 @@ "metadata": {}, "source": [ "\n", - "< [Text and Annotation](04.09-Text-and-Annotation.ipynb) | [Contents](Index.ipynb) | [Customizing Matplotlib: Configurations and Stylesheets](04.11-Settings-and-Stylesheets.ipynb) >" + "< [Text and Annotation](04.09-Text-and-Annotation.ipynb) | [Contents](Index.ipynb) | [Customizing Matplotlib: Configurations and Stylesheets](04.11-Settings-and-Stylesheets.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/04.11-Settings-and-Stylesheets.ipynb b/notebooks/04.11-Settings-and-Stylesheets.ipynb index 57441a090..bc8b6bcde 100644 --- a/notebooks/04.11-Settings-and-Stylesheets.ipynb +++ b/notebooks/04.11-Settings-and-Stylesheets.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Customizing Ticks](04.10-Customizing-Ticks.ipynb) | [Contents](Index.ipynb) | [Three-Dimensional Plotting in Matplotlib](04.12-Three-Dimensional-Plotting.ipynb) >" + "< [Customizing Ticks](04.10-Customizing-Ticks.ipynb) | [Contents](Index.ipynb) | [Three-Dimensional Plotting in Matplotlib](04.12-Three-Dimensional-Plotting.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -619,7 +622,9 @@ "metadata": {}, "source": [ "\n", - "< [Customizing Ticks](04.10-Customizing-Ticks.ipynb) | [Contents](Index.ipynb) | [Three-Dimensional Plotting in Matplotlib](04.12-Three-Dimensional-Plotting.ipynb) >" + "< [Customizing Ticks](04.10-Customizing-Ticks.ipynb) | [Contents](Index.ipynb) | [Three-Dimensional Plotting in Matplotlib](04.12-Three-Dimensional-Plotting.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/04.12-Three-Dimensional-Plotting.ipynb b/notebooks/04.12-Three-Dimensional-Plotting.ipynb index d4b687bc7..ffdc3d875 100644 --- a/notebooks/04.12-Three-Dimensional-Plotting.ipynb +++ b/notebooks/04.12-Three-Dimensional-Plotting.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Customizing Matplotlib: Configurations and Stylesheets](04.11-Settings-and-Stylesheets.ipynb) | [Contents](Index.ipynb) | [Geographic Data with Basemap](04.13-Geographic-Data-With-Basemap.ipynb) >" + "< [Customizing Matplotlib: Configurations and Stylesheets](04.11-Settings-and-Stylesheets.ipynb) | [Contents](Index.ipynb) | [Geographic Data with Basemap](04.13-Geographic-Data-With-Basemap.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -569,7 +572,9 @@ "metadata": {}, "source": [ "\n", - "< [Customizing Matplotlib: Configurations and Stylesheets](04.11-Settings-and-Stylesheets.ipynb) | [Contents](Index.ipynb) | [Geographic Data with Basemap](04.13-Geographic-Data-With-Basemap.ipynb) >" + "< [Customizing Matplotlib: Configurations and Stylesheets](04.11-Settings-and-Stylesheets.ipynb) | [Contents](Index.ipynb) | [Geographic Data with Basemap](04.13-Geographic-Data-With-Basemap.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/04.13-Geographic-Data-With-Basemap.ipynb b/notebooks/04.13-Geographic-Data-With-Basemap.ipynb index 6eac505e8..b724893ad 100644 --- a/notebooks/04.13-Geographic-Data-With-Basemap.ipynb +++ b/notebooks/04.13-Geographic-Data-With-Basemap.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND 
license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Three-Dimensional Plotting in Matplotlib](04.12-Three-Dimensional-Plotting.ipynb) | [Contents](Index.ipynb) | [Visualization with Seaborn](04.14-Visualization-With-Seaborn.ipynb) >" + "< [Three-Dimensional Plotting in Matplotlib](04.12-Three-Dimensional-Plotting.ipynb) | [Contents](Index.ipynb) | [Visualization with Seaborn](04.14-Visualization-With-Seaborn.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -716,7 +719,9 @@ "metadata": {}, "source": [ "\n", - "< [Three-Dimensional Plotting in Matplotlib](04.12-Three-Dimensional-Plotting.ipynb) | [Contents](Index.ipynb) | [Visualization with Seaborn](04.14-Visualization-With-Seaborn.ipynb) >" + "< [Three-Dimensional Plotting in Matplotlib](04.12-Three-Dimensional-Plotting.ipynb) | [Contents](Index.ipynb) | [Visualization with Seaborn](04.14-Visualization-With-Seaborn.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/04.14-Visualization-With-Seaborn.ipynb b/notebooks/04.14-Visualization-With-Seaborn.ipynb index a16416ef1..21817be21 100644 --- a/notebooks/04.14-Visualization-With-Seaborn.ipynb +++ b/notebooks/04.14-Visualization-With-Seaborn.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Geographic Data with Basemap](04.13-Geographic-Data-With-Basemap.ipynb) | [Contents](Index.ipynb) | [Further Resources](04.15-Further-Resources.ipynb) >" + "< [Geographic Data with Basemap](04.13-Geographic-Data-With-Basemap.ipynb) | [Contents](Index.ipynb) | [Further Resources](04.15-Further-Resources.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -1119,9 +1122,11 @@ } ], "source": [ + "import datetime\n", + "\n", "def convert_time(s):\n", " h, m, s = map(int, s.split(':'))\n", - " return pd.datetools.timedelta(hours=h, minutes=m, seconds=s)\n", + " return datetime.timedelta(hours=h, minutes=m, seconds=s)\n", "\n", "data = pd.read_csv('marathon-data.csv',\n", " converters={'split':convert_time, 'final':convert_time})\n", @@ -1763,7 +1768,9 @@ "metadata": {}, "source": [ "\n", - "< [Geographic Data with Basemap](04.13-Geographic-Data-With-Basemap.ipynb) | [Contents](Index.ipynb) | [Further Resources](04.15-Further-Resources.ipynb) >" + "< [Geographic Data with Basemap](04.13-Geographic-Data-With-Basemap.ipynb) | [Contents](Index.ipynb) | [Further Resources](04.15-Further-Resources.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/04.15-Further-Resources.ipynb b/notebooks/04.15-Further-Resources.ipynb index 1b0101f73..4aed29225 100644 --- a/notebooks/04.15-Further-Resources.ipynb +++ b/notebooks/04.15-Further-Resources.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Visualization with Seaborn](04.14-Visualization-With-Seaborn.ipynb) | [Contents](Index.ipynb) | [Machine Learning](05.00-Machine-Learning.ipynb) >" + "< [Visualization with Seaborn](04.14-Visualization-With-Seaborn.ipynb) | [Contents](Index.ipynb) | [Machine Learning](05.00-Machine-Learning.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -64,7 +67,9 @@ "metadata": {}, "source": [ "\n", - "< [Visualization with Seaborn](04.14-Visualization-With-Seaborn.ipynb) | [Contents](Index.ipynb) | [Machine Learning](05.00-Machine-Learning.ipynb) >" + "< [Visualization with Seaborn](04.14-Visualization-With-Seaborn.ipynb) | [Contents](Index.ipynb) | [Machine Learning](05.00-Machine-Learning.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/05.00-Machine-Learning.ipynb b/notebooks/05.00-Machine-Learning.ipynb index fc9b2815d..caff9877c 100644 --- a/notebooks/05.00-Machine-Learning.ipynb +++ b/notebooks/05.00-Machine-Learning.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [Further Resources](04.15-Further-Resources.ipynb) | [Contents](Index.ipynb) | [What Is Machine Learning?](05.01-What-Is-Machine-Learning.ipynb) >" + "< [Further Resources](04.15-Further-Resources.ipynb) | [Contents](Index.ipynb) | [What Is Machine Learning?](05.01-What-Is-Machine-Learning.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -57,7 +60,9 @@ "metadata": {}, "source": [ "\n", - "< [Further Resources](04.15-Further-Resources.ipynb) | [Contents](Index.ipynb) | [What Is Machine Learning?](05.01-What-Is-Machine-Learning.ipynb) >" + "< [Further Resources](04.15-Further-Resources.ipynb) | [Contents](Index.ipynb) | [What Is Machine Learning?](05.01-What-Is-Machine-Learning.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/05.01-What-Is-Machine-Learning.ipynb b/notebooks/05.01-What-Is-Machine-Learning.ipynb index a26f207be..1dd061dae 100644 --- a/notebooks/05.01-What-Is-Machine-Learning.ipynb +++ b/notebooks/05.01-What-Is-Machine-Learning.ipynb @@ -2,10 +2,14 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -13,18 +17,31 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", - "< [Machine Learning](05.00-Machine-Learning.ipynb) | [Contents](Index.ipynb) | [Introducing Scikit-Learn](05.02-Introducing-Scikit-Learn.ipynb) >" + "< [Machine Learning](05.00-Machine-Learning.ipynb) | [Contents](Index.ipynb) | [Introducing Scikit-Learn](05.02-Introducing-Scikit-Learn.ipynb) >\n", + "\n", + "\"Open\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# What Is Machine Learning?\n", - "\n", + "# What Is Machine Learning?" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ "Before we take a look at the details of various machine learning methods, let's start by looking at what machine learning is, and what it isn't.\n", "Machine learning is often categorized as a subfield of artificial intelligence, but I find that categorization can often be misleading at first brush.\n", "The study of machine learning certainly arose from research in this context, but in the data science application of machine learning methods, it's more helpful to think of machine learning as a means of *building models of data*.\n", @@ -39,7 +56,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Categories of Machine Learning\n", "\n", @@ -60,7 +80,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Qualitative Examples of Machine Learning Applications\n", "\n", @@ -72,7 +95,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Classification: Predicting discrete labels\n", "\n", @@ -83,7 +109,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "![](figures/05.01-classification-1.png)\n", "[figure source in Appendix](06.00-Figure-Code.ipynb#Classification-Example-Figure-1)" @@ -91,7 +120,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Here we have two-dimensional data: that is, we have two *features* for each point, represented by the *(x,y)* positions of the points on the plane.\n", "In addition, we have one of two *class labels* for each point, here represented by the colors of the points.\n", @@ -106,7 +138,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "![](figures/05.01-classification-2.png)\n", "[figure source in Appendix](06.00-Figure-Code.ipynb#Classification-Example-Figure-2)" @@ -114,7 +149,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Now that this model has been trained, it can be generalized to new, unlabeled data.\n", "In other words, we can take a new set of data, draw this model line through it, and assign labels to the new points based on this model.\n", @@ -123,7 +161,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "![](figures/05.01-classification-3.png)\n", "[figure source in 
Appendix](06.00-Figure-Code.ipynb#Classification-Example-Figure-3)" @@ -131,7 +172,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "This is the basic idea of a classification task in machine learning, where \"classification\" indicates that the data has discrete class labels.\n", "At first glance this may look fairly trivial: it would be relatively easy to simply look at this data and draw such a discriminatory line to accomplish this classification.\n", @@ -151,7 +195,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Regression: Predicting continuous labels\n", "\n", @@ -162,7 +209,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "![](figures/05.01-regression-1.png)\n", "[figure source in Appendix](06.00-Figure-Code.ipynb#Regression-Example-Figure-1)" @@ -170,7 +220,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "As with the classification example, we have two-dimensional data: that is, there are two features describing each data point.\n", "The color of each point represents the continuous label for that point.\n", @@ -184,7 +237,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "![](figures/05.01-regression-2.png)\n", "[figure source in Appendix](06.00-Figure-Code.ipynb#Regression-Example-Figure-2)" @@ -192,7 +248,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Notice that the *feature 1-feature 2* plane here is the same as in the two-dimensional plot from before; in this case, however, we have represented the labels by both color and three-dimensional axis position.\n", "From this view, it seems reasonable that fitting a plane through this three-dimensional data would allow us to predict the expected label for any set of input parameters.\n", @@ -201,7 +260,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "![](figures/05.01-regression-3.png)\n", "[figure source in Appendix](06.00-Figure-Code.ipynb#Regression-Example-Figure-3)" @@ -209,7 +271,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "This plane of fit gives us what we need to predict labels for new points.\n", "Visually, we find the results shown in the following figure:" @@ -217,7 +282,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "![](figures/05.01-regression-4.png)\n", "[figure source in Appendix](06.00-Figure-Code.ipynb#Regression-Example-Figure-4)" @@ -225,7 +293,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "As with the classification example, this may seem rather trivial in a low number of dimensions.\n", "But the power of these methods is that they can be straightforwardly applied and evaluated in the case of data with many, many features.\n", @@ -244,7 +315,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Clustering: Inferring labels on unlabeled data\n", 
"\n", @@ -257,7 +331,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "![](figures/05.01-clustering-1.png)\n", "[figure source in Appendix](06.00-Figure-Code.ipynb#Clustering-Example-Figure-2)" @@ -265,7 +342,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "By eye, it is clear that each of these points is part of a distinct group.\n", "Given this input, a clustering model will use the intrinsic structure of the data to determine which points are related.\n", @@ -274,7 +354,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "![](figures/05.01-clustering-2.png)\n", "[figure source in Appendix](06.00-Figure-Code.ipynb#Clustering-Example-Figure-2)" @@ -282,7 +365,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "*k*-means fits a model consisting of *k* cluster centers; the optimal centers are assumed to be those that minimize the distance of each point from its assigned center.\n", "Again, this might seem like a trivial exercise in two dimensions, but as our data becomes larger and more complex, such clustering algorithms can be employed to extract useful information from the dataset.\n", @@ -293,7 +379,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Dimensionality reduction: Inferring structure of unlabeled data\n", "\n", @@ -306,7 +395,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "![](figures/05.01-dimesionality-1.png)\n", "[figure source in Appendix](06.00-Figure-Code.ipynb#Dimensionality-Reduction-Example-Figure-1)" @@ -314,7 +406,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Visually, it is clear that there is some structure in this data: it is drawn from a one-dimensional line that is arranged in a spiral within this two-dimensional space.\n", "In a sense, you could say that this data is \"intrinsically\" only one dimensional, though this one-dimensional data is embedded in higher-dimensional space.\n", @@ -325,7 +420,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "![](figures/05.01-dimesionality-2.png)\n", "[figure source in Appendix](06.00-Figure-Code.ipynb#Dimensionality-Reduction-Example-Figure-2)" @@ -333,7 +431,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Notice that the colors (which represent the extracted one-dimensional latent variable) change uniformly along the spiral, which indicates that the algorithm did in fact detect the structure we saw by eye.\n", "As with the previous examples, the power of dimensionality reduction algorithms becomes clearer in higher-dimensional cases.\n", @@ -345,7 +446,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Summary\n", "\n", @@ -371,10 +475,15 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", - "< [Machine Learning](05.00-Machine-Learning.ipynb) | [Contents](Index.ipynb) 
| [Introducing Scikit-Learn](05.02-Introducing-Scikit-Learn.ipynb) >" + "< [Machine Learning](05.00-Machine-Learning.ipynb) | [Contents](Index.ipynb) | [Introducing Scikit-Learn](05.02-Introducing-Scikit-Learn.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/05.02-Introducing-Scikit-Learn.ipynb b/notebooks/05.02-Introducing-Scikit-Learn.ipynb index d144bfabd..8d3ecb877 100644 --- a/notebooks/05.02-Introducing-Scikit-Learn.ipynb +++ b/notebooks/05.02-Introducing-Scikit-Learn.ipynb @@ -2,10 +2,14 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -13,18 +17,31 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", - "< [What Is Machine Learning?](05.01-What-Is-Machine-Learning.ipynb) | [Contents](Index.ipynb) | [Hyperparameters and Model Validation](05.03-Hyperparameters-and-Model-Validation.ipynb) >" + "< [What Is Machine Learning?](05.01-What-Is-Machine-Learning.ipynb) | [Contents](Index.ipynb) | [Hyperparameters and Model Validation](05.03-Hyperparameters-and-Model-Validation.ipynb) >\n", + "\n", + "\"Open\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Introducing Scikit-Learn\n", - "\n", + "# Introducing Scikit-Learn" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ "There are several Python libraries which provide solid implementations of a range of machine learning algorithms.\n", "One of the best known is [Scikit-Learn](http://scikit-learn.org), a package that provides efficient versions of a large number of common algorithms.\n", "Scikit-Learn is characterized by a clean, uniform, and streamlined API, as well as by very useful and complete online documentation.\n", @@ -37,14 +54,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Data Representation in Scikit-Learn" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Machine learning is about creating models from data: for that reason, we'll start by discussing how data can be represented in order to be understood by the computer.\n", "The best way to think about data within Scikit-Learn is in terms of tables of data." 
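> Editor's aside, not part of the diff above: the hunks that follow describe Scikit-Learn's `[n_samples, n_features]` data layout in prose, so a minimal sketch may help. It assumes seaborn's bundled `iris` dataset stands in for the table discussed in the markdown cell.

```python
# Illustrative sketch (not part of this diff): the [n_samples, n_features]
# layout described above, using seaborn's bundled iris data as an example.
import seaborn as sns

iris = sns.load_dataset('iris')        # one row per observed flower

X_iris = iris.drop('species', axis=1)  # features matrix, shape (150, 4)
y_iris = iris['species']               # target array, one label per sample

print(X_iris.shape, y_iris.shape)      # (150, 4) (150,)
```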
@@ -52,7 +75,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Data as table\n", "\n", @@ -65,7 +91,9 @@ "cell_type": "code", "execution_count": 1, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -150,7 +178,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Here each row of the data refers to a single observed flower, and the number of rows is the total number of flowers in the dataset.\n", "In general, we will refer to the rows of the matrix as *samples*, and the number of rows as ``n_samples``.\n", @@ -161,7 +192,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "#### Features matrix\n", "\n", @@ -178,7 +212,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "#### Target array\n", "\n", @@ -197,7 +234,9 @@ "cell_type": "code", "execution_count": 2, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -219,7 +258,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "For use in Scikit-Learn, we will extract the features matrix and target array from the ``DataFrame``, which we can do using some of the Pandas ``DataFrame`` operations discussed in the [Chapter 3](03.00-Introduction-to-Pandas.ipynb):" ] @@ -228,7 +270,9 @@ "cell_type": "code", "execution_count": 3, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -251,7 +295,9 @@ "cell_type": "code", "execution_count": 4, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -272,14 +318,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "To summarize, the expected layout of features and target values is visualized in the following diagram:" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "![](figures/05.02-samples-features.png)\n", "[figure source in Appendix](06.00-Figure-Code.ipynb#Features-and-Labels-Grid)" @@ -287,21 +339,30 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "With this data properly formatted, we can move on to consider the *estimator* API of Scikit-Learn:" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Scikit-Learn's Estimator API" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The Scikit-Learn API is designed with the following guiding principles in mind, as outlined in the [Scikit-Learn API paper](http://arxiv.org/abs/1309.0238):\n", "\n", @@ -324,7 +385,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Basics of the API\n", "\n", @@ -344,7 +408,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Supervised learning example: Simple linear 
regression\n", "\n", @@ -356,7 +423,9 @@ "cell_type": "code", "execution_count": 5, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -382,14 +451,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "With this data in place, we can use the recipe outlined earlier. Let's walk through the process: " ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "#### 1. Choose a class of model\n", "\n", @@ -401,7 +476,9 @@ "cell_type": "code", "execution_count": 6, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -410,14 +487,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Note that other more general linear regression models exist as well; you can read more about them in the [``sklearn.linear_model`` module documentation](http://Scikit-Learn.org/stable/modules/linear_model.html)." ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "#### 2. Choose model hyperparameters\n", "\n", @@ -444,7 +527,9 @@ "cell_type": "code", "execution_count": 7, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -465,7 +550,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Keep in mind that when the model is instantiated, the only action is the storing of these hyperparameter values.\n", "In particular, we have not yet applied the model to any data: the Scikit-Learn API makes very clear the distinction between *choice of model* and *application of model to data*." @@ -473,7 +561,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "#### 3. Arrange data into a features matrix and target vector\n", "\n", @@ -486,7 +577,9 @@ "cell_type": "code", "execution_count": 8, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -507,7 +600,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "#### 4. 
Fit the model to your data\n", "\n", @@ -519,7 +615,9 @@ "cell_type": "code", "execution_count": 9, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -539,7 +637,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "This ``fit()`` command causes a number of model-dependent internal computations to take place, and the results of these computations are stored in model-specific attributes that the user can explore.\n", "In Scikit-Learn, by convention all model parameters that were learned during the ``fit()`` process have trailing underscores; for example in this linear model, we have the following:" @@ -549,7 +650,9 @@ "cell_type": "code", "execution_count": 10, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -571,7 +674,9 @@ "cell_type": "code", "execution_count": 11, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -591,7 +696,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "These two parameters represent the slope and intercept of the simple linear fit to the data.\n", "Comparing to the data definition, we see that they are very close to the input slope of 2 and intercept of -1.\n", @@ -604,7 +712,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "#### 5. Predict labels for unknown data\n", "\n", @@ -617,7 +728,9 @@ "cell_type": "code", "execution_count": 12, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -626,7 +739,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "As before, we need to coerce these *x* values into a ``[n_samples, n_features]`` features matrix, after which we can feed it to the model:" ] @@ -635,7 +751,9 @@ "cell_type": "code", "execution_count": 13, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -645,7 +763,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Finally, let's visualize the results by plotting first the raw data, and then this model fit:" ] @@ -654,7 +775,9 @@ "cell_type": "code", "execution_count": 14, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -675,14 +798,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Typically the efficacy of the model is evaluated by comparing its results to some known baseline, as we will see in the next example" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Supervised learning example: Iris classification\n", "\n", @@ -700,7 +829,9 @@ "cell_type": "code", "execution_count": 15, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -711,7 +842,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "With the data 
arranged, we can follow our recipe to predict the labels:" ] @@ -720,7 +854,9 @@ "cell_type": "code", "execution_count": 16, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -732,7 +868,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Finally, we can use the ``accuracy_score`` utility to see the fraction of predicted labels that match their true value:" ] @@ -741,7 +880,9 @@ "cell_type": "code", "execution_count": 17, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -762,14 +903,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "With an accuracy topping 97%, we see that even this very naive classification algorithm is effective for this particular dataset!" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Unsupervised learning example: Iris dimensionality\n", "\n", @@ -789,7 +936,9 @@ "cell_type": "code", "execution_count": 18, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -801,7 +950,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Now let's plot the results. A quick way to do this is to insert the results into the original Iris ``DataFrame``, and use Seaborn's ``lmplot`` to show the results:" ] @@ -810,7 +962,9 @@ "cell_type": "code", "execution_count": 19, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -832,7 +986,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "We see that in the two-dimensional representation, the species are fairly well separated, even though the PCA algorithm had no knowledge of the species labels!\n", "This indicates to us that a relatively straightforward classification will probably be effective on the dataset, as we saw before." 
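> Editor's aside, not part of the diff above: the cell sources for the PCA example are mostly outside these hunks, so here is a hedged sketch of the unsupervised recipe the surrounding prose describes. It assumes the modern `sklearn.decomposition.PCA` import path and seaborn's keyword-argument `lmplot` API, not necessarily the notebook's own cell code.

```python
# Illustrative sketch (not part of this diff): reduce the four iris
# measurements to two components with PCA, then plot them by species.
import seaborn as sns
from sklearn.decomposition import PCA

iris = sns.load_dataset('iris')
X_iris = iris.drop('species', axis=1)

model = PCA(n_components=2)     # choose model class and hyperparameters
model.fit(X_iris)               # fit to the data; no species labels are used
X_2D = model.transform(X_iris)  # project to two dimensions

# insert the components into the DataFrame and plot, as the prose describes
iris['PCA1'], iris['PCA2'] = X_2D[:, 0], X_2D[:, 1]
sns.lmplot(x='PCA1', y='PCA2', hue='species', data=iris, fit_reg=False)
```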
@@ -840,7 +997,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Unsupervised learning: Iris clustering\n", "\n", @@ -856,7 +1016,9 @@ "cell_type": "code", "execution_count": 20, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -869,7 +1031,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "As before, we will add the cluster label to the Iris ``DataFrame`` and use Seaborn to plot the results:" ] @@ -878,7 +1043,9 @@ "cell_type": "code", "execution_count": 21, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -900,7 +1067,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "By splitting the data by cluster number, we see exactly how well the GMM algorithm has recovered the underlying label: the *setosa* species is separated perfectly within cluster 0, while there remains a small amount of mixing between *versicolor* and *virginica*.\n", "This means that even without an expert to tell us the species labels of the individual flowers, the measurements of these flowers are distinct enough that we could *automatically* identify the presence of these different groups of species with a simple clustering algorithm!\n", @@ -909,14 +1079,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Application: Exploring Hand-written Digits" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "To demonstrate these principles on a more interesting problem, let's consider one piece of the optical character recognition problem: the identification of hand-written digits.\n", "In the wild, this problem involves both locating and identifying characters in an image. Here we'll take a shortcut and use Scikit-Learn's set of pre-formatted digits, which is built into the library." 
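> Editor's aside, not part of the diff above: the hand-written digits walkthrough that begins in the next hunks builds on Scikit-Learn's bundled digits dataset. A short sketch of loading it, under the assumption that `sklearn.datasets.load_digits` is used as in the prose:

```python
# Illustrative sketch (not part of this diff): loading the pre-formatted
# digits built into scikit-learn, as the markdown cell above describes.
from sklearn.datasets import load_digits

digits = load_digits()
print(digits.images.shape)  # (1797, 8, 8): samples as 8x8 pixel grids
print(digits.data.shape)    # (1797, 64): the same pixels flattened to features
print(digits.target.shape)  # (1797,): the true digit label for each sample
```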
@@ -924,7 +1100,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Loading and visualizing the digits data\n", "\n", @@ -935,7 +1114,9 @@ "cell_type": "code", "execution_count": 22, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -957,7 +1138,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The images data is a three-dimensional array: 1,797 samples each consisting of an 8 × 8 grid of pixels.\n", "Let's visualize the first hundred of these:" @@ -967,7 +1151,9 @@ "cell_type": "code", "execution_count": 23, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -996,7 +1182,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "In order to work with this data within Scikit-Learn, we need a two-dimensional, ``[n_samples, n_features]`` representation.\n", "We can accomplish this by treating each pixel in the image as a feature: that is, by flattening out the pixel arrays so that we have a length-64 array of pixel values representing each digit.\n", @@ -1008,7 +1197,9 @@ "cell_type": "code", "execution_count": 24, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1031,7 +1222,9 @@ "cell_type": "code", "execution_count": 25, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1052,14 +1245,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "We see here that there are 1,797 samples and 64 features." ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Unsupervised learning: Dimensionality reduction\n", "\n", @@ -1072,7 +1271,9 @@ "cell_type": "code", "execution_count": 26, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1096,7 +1297,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "We see that the projected data is now two-dimensional.\n", "Let's plot this data to see if we can learn anything from its structure:" @@ -1106,7 +1310,9 @@ "cell_type": "code", "execution_count": 27, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1130,7 +1336,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "This plot gives us some good intuition into how well various numbers are separated in the larger 64-dimensional space. 
For example, zeros (in black) and ones (in purple) have very little overlap in parameter space.\n", "Intuitively, this makes sense: a zero is empty in the middle of the image, while a one will generally have ink in the middle.\n", @@ -1142,7 +1351,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Classification on digits\n", "\n", @@ -1154,7 +1366,9 @@ "cell_type": "code", "execution_count": 28, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -1165,7 +1379,9 @@ "cell_type": "code", "execution_count": 29, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -1177,7 +1393,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Now that we have predicted our model, we can gauge its accuracy by comparing the true values of the test set to the predictions:" ] @@ -1186,7 +1405,9 @@ "cell_type": "code", "execution_count": 30, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1207,7 +1428,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "With even this extremely simple model, we find about 80% accuracy for classification of the digits!\n", "However, this single number doesn't tell us *where* we've gone wrong—one nice way to do this is to use the *confusion matrix*, which we can compute with Scikit-Learn and plot with Seaborn:" @@ -1217,7 +1441,9 @@ "cell_type": "code", "execution_count": 31, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1243,7 +1469,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "This shows us where the mis-labeled points tend to be: for example, a large number of twos here are mis-classified as either ones or eights.\n", "Another way to gain intuition into the characteristics of the model is to plot the inputs again, with their predicted labels.\n", @@ -1254,7 +1483,9 @@ "cell_type": "code", "execution_count": 32, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1284,7 +1515,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Examining this subset of the data, we can gain insight regarding where the algorithm might be not performing optimally.\n", "To go beyond our 80% classification rate, we might move to a more sophisticated algorithm such as support vector machines (see [In-Depth: Support Vector Machines](05.07-Support-Vector-Machines.ipynb)), random forests (see [In-Depth: Decision Trees and Random Forests](05.08-Random-Forests.ipynb)) or another classification approach." 
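> Editor's aside, not part of the diff above: the classification-on-digits cells referenced in these hunks are largely elided, so here is a hedged end-to-end sketch of the pipeline the prose walks through (split, Gaussian naive Bayes, accuracy, confusion matrix). It assumes the modern `sklearn.model_selection` import path rather than whatever the notebook cells themselves use.

```python
# Illustrative sketch (not part of this diff): the digits classification
# walkthrough described above, with a confusion matrix to locate errors.
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

digits = load_digits()
Xtrain, Xtest, ytrain, ytest = train_test_split(digits.data, digits.target,
                                                random_state=0)

model = GaussianNB().fit(Xtrain, ytrain)  # fit on the training split
y_model = model.predict(Xtest)            # predict on the held-out split
print(accuracy_score(ytest, y_model))     # roughly 0.8, as the prose states

# where do the mislabelings concentrate? visualize the confusion matrix
mat = confusion_matrix(ytest, y_model)
sns.heatmap(mat, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('predicted value')
plt.ylabel('true value')
```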
@@ -1292,14 +1526,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Summary" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "In this section we have covered the essential features of the Scikit-Learn data representation, and the estimator API.\n", "Regardless of the type of estimator, the same import/instantiate/fit/predict pattern holds.\n", @@ -1310,10 +1550,15 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", - "< [What Is Machine Learning?](05.01-What-Is-Machine-Learning.ipynb) | [Contents](Index.ipynb) | [Hyperparameters and Model Validation](05.03-Hyperparameters-and-Model-Validation.ipynb) >" + "< [What Is Machine Learning?](05.01-What-Is-Machine-Learning.ipynb) | [Contents](Index.ipynb) | [Hyperparameters and Model Validation](05.03-Hyperparameters-and-Model-Validation.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/05.03-Hyperparameters-and-Model-Validation.ipynb b/notebooks/05.03-Hyperparameters-and-Model-Validation.ipynb index a74734983..3edcada26 100644 --- a/notebooks/05.03-Hyperparameters-and-Model-Validation.ipynb +++ b/notebooks/05.03-Hyperparameters-and-Model-Validation.ipynb @@ -2,10 +2,14 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -13,20 +17,32 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", - "< [Introducing Scikit-Learn](05.02-Introducing-Scikit-Learn.ipynb) | [Contents](Index.ipynb) | [Feature Engineering](05.04-Feature-Engineering.ipynb) >" + "< [Introducing Scikit-Learn](05.02-Introducing-Scikit-Learn.ipynb) | [Contents](Index.ipynb) | [Feature Engineering](05.04-Feature-Engineering.ipynb) >\n", + "\n", + "\"Open\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Hyperparameters and Model Validation" ] }, { "cell_type": "markdown", "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "source": [ - "# Hyperparameters and Model Validation\n", - "\n", "In the previous section, we saw the basic recipe for applying a supervised machine learning model:\n", "\n", "1. 
Choose a class of model\n", @@ -41,7 +57,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Thinking about Model Validation\n", "\n", @@ -54,7 +73,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Model validation the wrong way\n", "\n", @@ -66,7 +88,9 @@ "cell_type": "code", "execution_count": 1, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -78,7 +102,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Next we choose a model and hyperparameters. Here we'll use a *k*-neighbors classifier with ``n_neighbors=1``.\n", "This is a very simple and intuitive model that says \"the label of an unknown point is the same as the label of its closest training point:\"" @@ -88,7 +115,9 @@ "cell_type": "code", "execution_count": 2, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -98,7 +127,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Then we train the model, and use it to predict labels for data we already know:" ] @@ -107,7 +139,9 @@ "cell_type": "code", "execution_count": 3, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -117,7 +151,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Finally, we compute the fraction of correctly labeled points:" ] @@ -126,7 +163,9 @@ "cell_type": "code", "execution_count": 4, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -147,7 +186,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "We see an accuracy score of 1.0, which indicates that 100% of points were correctly labeled by our model!\n", "But is this truly measuring the expected accuracy? Have we really come upon a model that we expect to be correct 100% of the time?\n", @@ -159,7 +201,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Model validation the right way: Holdout sets\n", "\n", @@ -172,7 +217,9 @@ "cell_type": "code", "execution_count": 5, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -202,7 +249,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "We see here a more reasonable result: the nearest-neighbor classifier is about 90% accurate on this hold-out set.\n", "The hold-out set is similar to unknown data, because the model has not \"seen\" it before." 
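[Editorial note: only the cell metadata is visible in this hunk, so a compact sketch of the holdout procedure just described may help. The 50/50 split and `n_neighbors=1` follow the text; `random_state=0` is an assumed seed.]

```python
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

iris = load_iris()
X, y = iris.data, iris.target

# Split the data: half for training, half held out for validation
X1, X2, y1, y2 = train_test_split(X, y, random_state=0, train_size=0.5)

model = KNeighborsClassifier(n_neighbors=1)
model.fit(X1, y1)                 # fit on one half of the data only
y2_model = model.predict(X2)      # score on the other, unseen half
print(accuracy_score(y2, y2_model))   # about 0.9, as the text reports
```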
@@ -210,7 +260,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Model validation via cross-validation\n", "\n", @@ -232,7 +285,9 @@ "cell_type": "code", "execution_count": 6, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -254,7 +309,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "What comes out are two accuracy scores, which we could combine (by, say, taking the mean) to get a better measure of the global model performance.\n", "This particular form of cross-validation is a *two-fold cross-validation*—that is, one in which we have split the data into two sets and used each in turn as a validation set.\n", @@ -272,7 +330,9 @@ "cell_type": "code", "execution_count": 7, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -293,7 +353,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Repeating the validation across different subsets of the data gives us an even better idea of the performance of the algorithm.\n", "\n", @@ -306,7 +369,9 @@ "cell_type": "code", "execution_count": 8, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -339,7 +404,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Because we have 150 samples, the leave-one-out cross-validation yields scores for 150 trials, and the score indicates either successful (1.0) or unsuccessful (0.0) prediction.\n", "Taking the mean of these gives an estimate of the accuracy:" @@ -349,7 +417,9 @@ "cell_type": "code", "execution_count": 9, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -369,7 +439,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Other cross-validation schemes can be used similarly.\n", "For a description of what is available in Scikit-Learn, use IPython to explore the ``sklearn.cross_validation`` submodule, or take a look at Scikit-Learn's online [cross-validation documentation](http://scikit-learn.org/stable/modules/cross_validation.html)."
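[Editorial note: `sklearn.cross_validation` was renamed `sklearn.model_selection` in later Scikit-Learn releases; under that assumption, here is a self-contained sketch of the cross-validation calls described in this passage.]

```python
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier

X, y = load_iris(return_X_y=True)
model = KNeighborsClassifier(n_neighbors=1)

# Five-fold cross-validation: five fits, each scored on a distinct fifth of the data
print(cross_val_score(model, X, y, cv=5))

# Leave-one-out: 150 trials, each scoring 1.0 (hit) or 0.0 (miss);
# the mean of these scores estimates the accuracy
scores = cross_val_score(model, X, y, cv=LeaveOneOut())
print(scores.mean())
```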
@@ -377,7 +450,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Selecting the Best Model\n", "\n", @@ -399,7 +475,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### The Bias-variance trade-off\n", "\n", @@ -422,7 +501,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "To look at this in another light, consider what happens if we use these two models to predict the y-value for some new data.\n", "In the following diagrams, the red/lighter points indicate data that is omitted from the training set:\n", @@ -439,7 +521,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "If we imagine that we have some ability to tune the model complexity, we would expect the training score and validation score to behave as illustrated in the following figure:\n", "\n", @@ -459,7 +544,9 @@ { "cell_type": "markdown", "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "source": [ "### Validation curves in Scikit-Learn\n", @@ -487,7 +574,9 @@ "cell_type": "code", "execution_count": 10, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -503,7 +592,9 @@ { "cell_type": "markdown", "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "source": [ "Now let's create some data to which we will fit our model:" @@ -513,7 +604,9 @@ "cell_type": "code", "execution_count": 11, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -533,7 +626,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "We can now visualize our data, along with polynomial fits of several degrees:" ] @@ -542,7 +638,9 @@ "cell_type": "code", "execution_count": 12, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -575,7 +673,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The knob controlling model complexity in this case is the degree of the polynomial, which can be any non-negative integer.\n", "A useful question to answer is this: what degree of polynomial provides a suitable trade-off between bias (under-fitting) and variance (over-fitting)?\n", @@ -588,7 +689,9 @@ "cell_type": "code", "execution_count": 13, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -618,7 +721,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "This shows precisely the qualitative behavior we expect: the training score is everywhere higher than the validation score; the training score is monotonically improving with increased model complexity; and the validation score reaches a maximum before dropping off as the model becomes over-fit.\n", "\n", @@ -629,7 +735,9 @@ "cell_type": "code", "execution_count": 14, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -653,14 +761,20 @@ }, { "cell_type": 
"markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Notice that finding this optimal model did not actually require us to compute the training score, but examining the relationship between the training score and validation score can give us useful insight into the performance of the model." ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Learning Curves\n", "\n", @@ -672,7 +786,9 @@ "cell_type": "code", "execution_count": 15, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -693,7 +809,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "We will duplicate the preceding code to plot the validation curve for this larger dataset; for reference let's over-plot the previous results as well:" ] @@ -702,7 +821,9 @@ "cell_type": "code", "execution_count": 16, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -733,7 +854,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The solid lines show the new results, while the fainter dashed lines show the results of the previous smaller dataset.\n", "It is clear from the validation curve that the larger dataset can support a much more complicated model: the peak here is probably around a degree of 6, but even a degree-20 model is not seriously over-fitting the data—the validation and training scores remain very close.\n", @@ -753,7 +877,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "![](figures/05.03-learning-curve.png)\n", "[figure source in Appendix](06.00-Figure-Code.ipynb#Learning-Curve)" @@ -761,7 +888,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The notable feature of the learning curve is the convergence to a particular score as the number of training samples grows.\n", "In particular, once you have enough points that a particular model has converged, *adding more training data will not help you!*\n", @@ -770,7 +900,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Learning curves in Scikit-Learn\n", "\n", @@ -781,7 +914,9 @@ "cell_type": "code", "execution_count": 17, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -821,7 +956,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "This is a valuable diagnostic, because it gives us a visual depiction of how our model responds to increasing training data.\n", "In particular, when your learning curve has already converged (i.e., when the training and validation curves are already close to each other) *adding more training data will not significantly improve the fit!*\n", @@ -836,7 +974,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Validation in Practice: Grid Search\n", "\n", @@ -854,7 +995,9 @@ "cell_type": "code", "execution_count": 18, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + 
"editable": true }, "outputs": [], "source": [ @@ -869,7 +1012,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Notice that like a normal estimator, this has not yet been applied to any data.\n", "Calling the ``fit()`` method will fit the model at each grid point, keeping track of the scores along the way:" @@ -879,7 +1025,9 @@ "cell_type": "code", "execution_count": 19, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -888,7 +1036,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Now that this is fit, we can ask for the best parameters as follows:" ] @@ -897,7 +1048,9 @@ "cell_type": "code", "execution_count": 20, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -919,7 +1072,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Finally, if we wish, we can use the best model and show the fit to our data using code from before:" ] @@ -928,7 +1084,9 @@ "cell_type": "code", "execution_count": 21, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -954,7 +1112,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The grid search provides many more options, including the ability to specify a custom scoring function, to parallelize the computations, to do randomized searches, and more.\n", "For information, see the examples in [In-Depth: Kernel Density Estimation](05.13-Kernel-Density-Estimation.ipynb) and [Feature Engineering: Working with Images](05.14-Image-Features.ipynb), or refer to Scikit-Learn's [grid search documentation](http://Scikit-Learn.org/stable/modules/grid_search.html)." @@ -962,7 +1123,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Summary\n", "\n", @@ -975,10 +1139,15 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", - "< [Introducing Scikit-Learn](05.02-Introducing-Scikit-Learn.ipynb) | [Contents](Index.ipynb) | [Feature Engineering](05.04-Feature-Engineering.ipynb) >" + "< [Introducing Scikit-Learn](05.02-Introducing-Scikit-Learn.ipynb) | [Contents](Index.ipynb) | [Feature Engineering](05.04-Feature-Engineering.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/05.04-Feature-Engineering.ipynb b/notebooks/05.04-Feature-Engineering.ipynb index c407d4525..7315fb277 100644 --- a/notebooks/05.04-Feature-Engineering.ipynb +++ b/notebooks/05.04-Feature-Engineering.ipynb @@ -2,10 +2,14 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -13,18 +17,31 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", - "< [Hyperparameters and Model Validation](05.03-Hyperparameters-and-Model-Validation.ipynb) | [Contents](Index.ipynb) | [In Depth: Naive Bayes Classification](05.05-Naive-Bayes.ipynb) >" + "< [Hyperparameters and Model Validation](05.03-Hyperparameters-and-Model-Validation.ipynb) | [Contents](Index.ipynb) | [In Depth: Naive Bayes Classification](05.05-Naive-Bayes.ipynb) >\n", + "\n", + "\"Open\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Feature Engineering\n", - "\n", + "# Feature Engineering" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ "The previous sections outline the fundamental ideas of machine learning, but all of the examples assume that you have numerical data in a tidy, ``[n_samples, n_features]`` format.\n", "In the real world, data rarely comes in such a form.\n", "With this in mind, one of the more important steps in using machine learning in practice is *feature engineering*: that is, taking whatever information you have about your problem and turning it into numbers that you can use to build your feature matrix.\n", @@ -36,7 +53,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Categorical Features\n", "\n", @@ -49,7 +69,9 @@ "cell_type": "code", "execution_count": 1, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -63,7 +85,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "You might be tempted to encode this data with a straightforward numerical mapping:" ] @@ -72,7 +97,9 @@ "cell_type": "code", "execution_count": 2, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -81,7 +108,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "It turns out that this is not generally a useful approach in Scikit-Learn: the package's models make the fundamental assumption that numerical features reflect algebraic quantities.\n", "Thus such a mapping would imply, for example, that *Queen Anne < Fremont < Wallingford*, or even that *Wallingford - Queen Anne = Fremont*, which (niche demographic jokes aside) does not make much sense.\n", @@ -94,7 +124,9 @@ "cell_type": "code", "execution_count": 3, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -119,7 +151,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Notice that the 'neighborhood' column has been expanded into three separate columns, representing the three neighborhood labels, and that each row has a 1 in the column associated with its neighborhood.\n", "With these categorical features thus encoded, you can proceed as normal with fitting a Scikit-Learn model.\n", @@ -131,7 +166,9 @@ "cell_type": "code", "execution_count": 4, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, 
"outputs": [ { @@ -155,7 +192,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "There is one clear disadvantage of this approach: if your category has many possible values, this can *greatly* increase the size of your dataset.\n", "However, because the encoded data contains mostly zeros, a sparse output can be a very efficient solution:" @@ -165,7 +205,9 @@ "cell_type": "code", "execution_count": 5, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -187,14 +229,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Many (though not yet all) of the Scikit-Learn estimators accept such sparse inputs when fitting and evaluating models. ``sklearn.preprocessing.OneHotEncoder`` and ``sklearn.feature_extraction.FeatureHasher`` are two additional tools that Scikit-Learn includes to support this type of encoding." ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Text Features\n", "\n", @@ -209,7 +257,9 @@ "cell_type": "code", "execution_count": 6, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -220,7 +270,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "For a vectorization of this data based on word count, we could construct a column representing the word \"problem,\" the word \"evil,\" the word \"horizon,\" and so on.\n", "While doing this by hand would be possible, the tedium can be avoided by using Scikit-Learn's ``CountVectorizer``:" @@ -230,7 +283,9 @@ "cell_type": "code", "execution_count": 7, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -255,7 +310,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The result is a sparse matrix recording the number of times each word appears; it is easier to inspect if we convert this to a ``DataFrame`` with labeled columns:" ] @@ -264,7 +322,9 @@ "cell_type": "code", "execution_count": 8, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -330,7 +390,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "There are some issues with this approach, however: the raw word counts lead to features which put too much weight on words that appear very frequently, and this can be sub-optimal in some classification algorithms.\n", "One approach to fix this is known as *term frequency-inverse document frequency* (*TF–IDF*) which weights the word counts by a measure of how often they appear in the documents.\n", @@ -341,7 +404,9 @@ "cell_type": "code", "execution_count": 9, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -409,14 +474,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "For an example of using TF-IDF in a classification problem, see [In Depth: Naive Bayes Classification](05.05-Naive-Bayes.ipynb)." 
] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Image Features\n", "\n", @@ -430,7 +501,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Derived Features\n", "\n", @@ -446,7 +520,9 @@ "cell_type": "code", "execution_count": 10, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -472,7 +548,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Still, we can fit a line to the data using ``LinearRegression`` and get the optimal result:" ] @@ -481,7 +560,9 @@ "cell_type": "code", "execution_count": 11, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -506,7 +587,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "It's clear that we need a more sophisticated model to describe the relationship between $x$ and $y$.\n", "\n", @@ -518,7 +602,9 @@ "cell_type": "code", "execution_count": 12, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -542,7 +628,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The derived feature matrix has one column representing $x$, and a second column representing $x^2$, and a third column representing $x^3$.\n", "Computing a linear regression on this expanded input gives a much closer fit to our data:" @@ -552,7 +641,9 @@ "cell_type": "code", "execution_count": 13, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -575,7 +666,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "This idea of improving a model not by changing the model, but by transforming the inputs, is fundamental to many of the more powerful machine learning methods.\n", "We explore this idea further in [In Depth: Linear Regression](05.06-Linear-Regression.ipynb) in the context of *basis function regression*.\n", @@ -584,7 +678,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Imputation of Missing Data\n", "\n", @@ -597,7 +694,9 @@ "cell_type": "code", "execution_count": 14, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -612,7 +711,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "When applying a typical machine learning model to such data, we will need to first replace such missing data with some appropriate fill value.\n", "This is known as *imputation* of missing values, and strategies range from simple (e.g., replacing missing values with the mean of the column) to sophisticated (e.g., using matrix completion or a robust model to handle such data).\n", @@ -625,7 +727,9 @@ "cell_type": "code", "execution_count": 15, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -652,7 +756,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + 
"editable": true + }, "source": [ "We see that in the resulting data, the two missing values have been replaced with the mean of the remaining values in the column. This imputed data can then be fed directly into, for example, a ``LinearRegression`` estimator:" ] @@ -661,7 +768,9 @@ "cell_type": "code", "execution_count": 16, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -682,7 +791,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Feature Pipelines\n", "\n", @@ -700,7 +812,9 @@ "cell_type": "code", "execution_count": 17, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -713,7 +827,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "This pipeline looks and acts like a standard Scikit-Learn object, and will apply all the specified steps to any input data." ] @@ -722,7 +839,9 @@ "cell_type": "code", "execution_count": 18, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -742,7 +861,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "All the steps of the model are applied automatically.\n", "Notice that for the simplicity of this demonstration, we've applied the model to the data it was trained on; this is why it was able to perfectly predict the result (refer back to [Hyperparameters and Model Validation](05.03-Hyperparameters-and-Model-Validation.ipynb) for further discussion of this).\n", @@ -752,10 +874,15 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", - "< [Hyperparameters and Model Validation](05.03-Hyperparameters-and-Model-Validation.ipynb) | [Contents](Index.ipynb) | [In Depth: Naive Bayes Classification](05.05-Naive-Bayes.ipynb) >" + "< [Hyperparameters and Model Validation](05.03-Hyperparameters-and-Model-Validation.ipynb) | [Contents](Index.ipynb) | [In Depth: Naive Bayes Classification](05.05-Naive-Bayes.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/05.05-Naive-Bayes.ipynb b/notebooks/05.05-Naive-Bayes.ipynb index 3de676f9a..f5d492a42 100644 --- a/notebooks/05.05-Naive-Bayes.ipynb +++ b/notebooks/05.05-Naive-Bayes.ipynb @@ -2,10 +2,14 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -13,18 +17,31 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", - "< [Feature Engineering](05.04-Feature-Engineering.ipynb) | [Contents](Index.ipynb) | [In Depth: Linear Regression](05.06-Linear-Regression.ipynb) >" + "< [Feature Engineering](05.04-Feature-Engineering.ipynb) | [Contents](Index.ipynb) | [In Depth: Linear Regression](05.06-Linear-Regression.ipynb) >\n", + "\n", + "\"Open\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# In Depth: Naive Bayes Classification\n", - "\n", + "# In Depth: Naive Bayes Classification" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ "The previous four sections have given a general overview of the concepts of machine learning.\n", "In this section and the ones that follow, we will be taking a closer look at several specific algorithms for supervised and unsupervised learning, starting here with naive Bayes classification.\n", "\n", @@ -35,7 +52,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Bayesian Classification\n", "\n", @@ -69,7 +89,9 @@ "cell_type": "code", "execution_count": 1, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -81,7 +103,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Gaussian Naive Bayes\n", "\n", @@ -94,7 +119,9 @@ "cell_type": "code", "execution_count": 2, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -116,7 +143,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "One extremely fast way to create a simple model is to assume that the data is described by a Gaussian distribution with no covariance between dimensions.\n", "This model can be fit by simply finding the mean and standard deviation of the points within each label, which is all you need to define such a distribution.\n", @@ -125,7 +155,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "![(run code in Appendix to generate image)](figures/05.05-gaussian-NB.png)\n", "[figure source in Appendix](06.00-Figure-Code.ipynb#Gaussian-Naive-Bayes)" @@ -134,7 +167,9 @@ { "cell_type": "markdown", "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "source": [ "The ellipses here represent the Gaussian generative model for each label, with larger probability toward the center of the ellipses.\n", @@ -147,7 +182,9 @@ "cell_type": "code", "execution_count": 3, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -158,7 +195,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Now let's generate some new data and predict the label:" ] @@ -167,7 +207,9 @@ "cell_type": "code", "execution_count": 4, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ 
-178,7 +220,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Now we can plot this new data to get an idea of where the decision boundary is:" ] @@ -187,7 +232,9 @@ "cell_type": "code", "execution_count": 5, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -210,7 +257,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "We see a slightly curved boundary in the classifications—in general, the boundary in Gaussian naive Bayes is quadratic.\n", "\n", @@ -221,7 +271,9 @@ "cell_type": "code", "execution_count": 6, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -249,7 +301,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The columns give the posterior probabilities of the first and second label, respectively.\n", "If you are looking for estimates of uncertainty in your classification, Bayesian approaches like this can be a useful approach.\n", @@ -260,7 +315,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Multinomial Naive Bayes\n", "\n", @@ -273,7 +331,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Example: Classifying Text\n", "\n", @@ -287,7 +348,9 @@ "cell_type": "code", "execution_count": 7, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -329,7 +392,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "For simplicity here, we will select just a few of these categories, and download the training and testing set:" ] @@ -338,7 +404,9 @@ "cell_type": "code", "execution_count": 8, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -350,7 +418,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Here is a representative entry from the data:" ] @@ -359,7 +430,9 @@ "cell_type": "code", "execution_count": 9, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -393,7 +466,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "In order to use this data for machine learning, we need to be able to convert the content of each string into a vector of numbers.\n", "For this we will use the TF-IDF vectorizer (discussed in [Feature Engineering](05.04-Feature-Engineering.ipynb)), and create a pipeline that attaches it to a multinomial naive Bayes classifier:" @@ -403,7 +479,9 @@ "cell_type": "code", "execution_count": 10, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -416,7 +494,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "With this pipeline, we can apply the model to the training data, and predict labels for the test data:" ] @@ -425,7 +506,9 @@ "cell_type": "code", "execution_count": 11, 
"metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -435,7 +518,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Now that we have predicted the labels for the test data, we can evaluate them to learn about the performance of the estimator.\n", "For example, here is the confusion matrix between the true and predicted labels for the test data:" @@ -445,7 +531,9 @@ "cell_type": "code", "execution_count": 12, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -470,7 +558,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Evidently, even this very simple classifier can successfully separate space talk from computer talk, but it gets confused between talk about religion and talk about Christianity.\n", "This is perhaps an expected area of confusion!\n", @@ -483,7 +574,9 @@ "cell_type": "code", "execution_count": 13, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -494,7 +587,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Let's try it out:" ] @@ -503,7 +599,9 @@ "cell_type": "code", "execution_count": 14, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -525,7 +623,9 @@ "cell_type": "code", "execution_count": 15, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -547,7 +647,9 @@ "cell_type": "code", "execution_count": 16, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -567,7 +669,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Remember that this is nothing more sophisticated than a simple probability model for the (weighted) frequency of each word in the string; nevertheless, the result is striking.\n", "Even a very naive algorithm, when used carefully and trained on a large set of high-dimensional data, can be surprisingly effective." 
@@ -575,7 +680,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## When to Use Naive Bayes\n", "\n", @@ -604,10 +712,15 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", - "< [Feature Engineering](05.04-Feature-Engineering.ipynb) | [Contents](Index.ipynb) | [In Depth: Linear Regression](05.06-Linear-Regression.ipynb) >" + "< [Feature Engineering](05.04-Feature-Engineering.ipynb) | [Contents](Index.ipynb) | [In Depth: Linear Regression](05.06-Linear-Regression.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/05.06-Linear-Regression.ipynb b/notebooks/05.06-Linear-Regression.ipynb index 892a7a44b..ccecf6292 100644 --- a/notebooks/05.06-Linear-Regression.ipynb +++ b/notebooks/05.06-Linear-Regression.ipynb @@ -2,10 +2,14 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -13,18 +17,31 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", - "< [In Depth: Naive Bayes Classification](05.05-Naive-Bayes.ipynb) | [Contents](Index.ipynb) | [In-Depth: Support Vector Machines](05.07-Support-Vector-Machines.ipynb) >" + "< [In Depth: Naive Bayes Classification](05.05-Naive-Bayes.ipynb) | [Contents](Index.ipynb) | [In-Depth: Support Vector Machines](05.07-Support-Vector-Machines.ipynb) >\n", + "\n", + "\"Open\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# In Depth: Linear Regression\n", - "\n", + "# In Depth: Linear Regression" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ "Just as naive Bayes (discussed earlier in [In Depth: Naive Bayes Classification](05.05-Naive-Bayes.ipynb)) is a good starting point for classification tasks, linear regression models are a good starting point for regression tasks.\n", "Such models are popular because they can be fit very quickly, and are very interpretable.\n", "You are probably familiar with the simplest form of a linear regression model (i.e., fitting a straight line to data) but such models can be extended to model more complicated data behavior.\n", @@ -38,7 +55,9 @@ "cell_type": "code", "execution_count": 1, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -50,7 +69,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Simple Linear Regression\n", "\n", @@ -68,7 +90,9 @@ "cell_type": "code", "execution_count": 2, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -91,7 +115,10 @@ }, { "cell_type": "markdown", - "metadata": {}, 
+ "metadata": { + "deletable": true, + "editable": true + }, "source": [ "We can use Scikit-Learn's ``LinearRegression`` estimator to fit this data and construct the best-fit line:" ] @@ -100,7 +127,9 @@ "cell_type": "code", "execution_count": 3, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -129,7 +158,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The slope and intercept of the data are contained in the model's fit parameters, which in Scikit-Learn are always marked by a trailing underscore.\n", "Here the relevant parameters are ``coef_`` and ``intercept_``:" @@ -139,7 +171,9 @@ "cell_type": "code", "execution_count": 4, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -158,14 +192,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "We see that the results are very close to the inputs, as we might hope." ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The ``LinearRegression`` estimator is much more capable than this, however—in addition to simple straight-line fits, it can also handle multidimensional linear models of the form\n", "$$\n", @@ -181,7 +221,9 @@ "cell_type": "code", "execution_count": 5, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -205,7 +247,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Here the $y$ data is constructed from three random $x$ values, and the linear regression recovers the coefficients used to construct the data.\n", "\n", @@ -215,7 +260,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Basis Function Regression\n", "\n", @@ -238,7 +286,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Polynomial basis functions\n", "\n", @@ -249,7 +300,9 @@ "cell_type": "code", "execution_count": 6, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -274,7 +327,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "We see here that the transformer has converted our one-dimensional array into a three-dimensional array by taking the exponent of each value.\n", "This new, higher-dimensional data representation can then be plugged into a linear regression.\n", @@ -287,7 +343,9 @@ "cell_type": "code", "execution_count": 7, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -298,7 +356,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "With this transform in place, we can use the linear model to fit much more complicated relationships between $x$ and $y$. 
\n", "For example, here is a sine wave with noise:" @@ -308,7 +369,9 @@ "cell_type": "code", "execution_count": 8, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -336,14 +399,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Our linear model, through the use of 7th-order polynomial basis functions, can provide an excellent fit to this non-linear data!" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Gaussian basis functions\n", "\n", @@ -354,7 +423,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "![](figures/05.06-gaussian-basis.png)\n", "[figure source in Appendix](#Gaussian-Basis)" @@ -362,7 +434,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The shaded regions in the plot are the scaled basis functions, and when added together they reproduce the smooth curve through the data.\n", "These Gaussian basis functions are not built into Scikit-Learn, but we can write a custom transformer that will create them, as shown here and illustrated in the following figure (Scikit-Learn transformers are implemented as Python classes; reading Scikit-Learn's source is a good way to see how they can be created):" @@ -372,7 +447,9 @@ "cell_type": "code", "execution_count": 9, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -423,14 +500,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "We put this example here just to make clear that there is nothing magic about polynomial basis functions: if you have some sort of intuition into the generating process of your data that makes you think one basis or another might be appropriate, you can use them as well." 
] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Regularization\n", "\n", @@ -442,7 +525,9 @@ "cell_type": "code", "execution_count": 10, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -470,7 +555,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "With the data projected to the 30-dimensional basis, the model has far too much flexibility and goes to extreme values between locations where it is constrained by data.\n", "We can see the reason for this if we plot the coefficients of the Gaussian bases with respect to their locations:" @@ -480,7 +568,9 @@ "cell_type": "code", "execution_count": 11, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -517,7 +607,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The lower panel of this figure shows the amplitude of the basis function at each location.\n", "This is typical over-fitting behavior when basis functions overlap: the coefficients of adjacent basis functions blow up and cancel each other out.\n", @@ -527,7 +620,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Ridge regression ($L_2$ Regularization)\n", "\n", @@ -544,7 +640,9 @@ "cell_type": "code", "execution_count": 12, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -566,7 +664,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The $\\alpha$ parameter is essentially a knob controlling the complexity of the resulting model.\n", "In the limit $\\alpha \\to 0$, we recover the standard linear regression result; in the limit $\\alpha \\to \\infty$, all model responses will be suppressed.\n", @@ -575,7 +676,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Lasso regression ($L_1$ regularization)\n", "\n", @@ -592,7 +696,9 @@ "cell_type": "code", "execution_count": 13, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -614,7 +720,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "With the lasso regression penalty, the majority of the coefficients are exactly zero, with the functional behavior being modeled by a small subset of the available basis functions.\n", "As with ridge regularization, the $\\alpha$ parameter tunes the strength of the penalty, and should be determined via, for example, cross-validation (refer back to [Hyperparameters and Model Validation](05.03-Hyperparameters-and-Model-Validation.ipynb) for a discussion of this)." 
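[Editorial note: the custom Gaussian-basis transformer is not shown in this diff, so the sketch below illustrates the same regularization contrast on an ordinary polynomial basis instead; the penalty strengths are illustrative assumptions, chosen only to show ridge shrinking coefficients smoothly while lasso zeroes many of them outright.]

```python
import numpy as np
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures

rng = np.random.RandomState(1)
x = rng.rand(50)                          # keep x in [0, 1] so high powers stay well scaled
y = np.sin(10 * x) + 0.1 * rng.randn(50)

# A deliberately over-flexible 20-term polynomial basis
X = PolynomialFeatures(20).fit_transform(x[:, np.newaxis])

# L2 penalty (ridge): penalizes the sum of squared coefficients,
# shrinking all of them smoothly toward zero
ridge = Ridge(alpha=0.1).fit(X, y)

# L1 penalty (lasso): penalizes the sum of absolute coefficients,
# driving many of them exactly to zero
lasso = Lasso(alpha=0.001, max_iter=100000).fit(X, y)

print("nonzero ridge coefficients:", np.sum(ridge.coef_ != 0))
print("nonzero lasso coefficients:", np.sum(lasso.coef_ != 0))
```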
@@ -622,7 +731,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Example: Predicting Bicycle Traffic" ] @@ -630,7 +742,9 @@ { "cell_type": "markdown", "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "source": [ "As an example, let's take a look at whether we can predict the number of bicycle trips across Seattle's Fremont Bridge based on weather, season, and other factors.\n", @@ -650,7 +764,9 @@ "cell_type": "code", "execution_count": 14, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -661,7 +777,9 @@ "cell_type": "code", "execution_count": 15, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -672,7 +790,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Next we will compute the total daily bicycle traffic, and put this in its own dataframe:" ] @@ -681,7 +802,9 @@ "cell_type": "code", "execution_count": 16, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -692,7 +815,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "We saw previously that the patterns of use generally vary from day to day; let's account for this in our data by adding binary columns that indicate the day of the week:" ] @@ -701,7 +827,9 @@ "cell_type": "code", "execution_count": 17, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -712,7 +840,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Similarly, we might expect riders to behave differently on holidays; let's add an indicator of this as well:" ] @@ -721,7 +852,9 @@ "cell_type": "code", "execution_count": 18, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -734,7 +867,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "We also might suspect that the hours of daylight would affect how many people ride; let's use the standard astronomical calculation to add this information:" ] @@ -743,7 +879,9 @@ "cell_type": "code", "execution_count": 19, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -782,7 +920,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "We can also add the average temperature and total precipitation to the data.\n", "In addition to the inches of precipitation, let's add a flag that indicates whether a day is dry (has zero precipitation):" @@ -792,7 +933,9 @@ "cell_type": "code", "execution_count": 20, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -810,7 +953,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Finally, let's add a counter that increases from day 1, and measures how many years have passed.\n", "This will let us 
measure any observed annual increase or decrease in daily crossings:" @@ -820,7 +966,9 @@ "cell_type": "code", "execution_count": 21, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -829,7 +977,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Now our data is in order, and we can take a look at it:" ] @@ -838,7 +989,9 @@ "cell_type": "code", "execution_count": 22, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1001,7 +1154,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "With this in place, we can choose the columns to use, and fit a linear regression model to our data.\n", "We will set ``fit_intercept = False``, because the daily flags essentially operate as their own day-specific intercepts:" @@ -1011,7 +1167,9 @@ "cell_type": "code", "execution_count": 23, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -1030,7 +1188,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Finally, we can compare the total and predicted bicycle traffic visually:" ] @@ -1039,7 +1200,9 @@ "cell_type": "code", "execution_count": 24, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1059,7 +1222,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "It is evident that we have missed some key features, especially during the summer time.\n", "Either our features are not complete (i.e., people decide whether to ride to work based on more than just these) or there are some nonlinear relationships that we have failed to take into account (e.g., perhaps people ride less at both high and low temperatures).\n", @@ -1070,7 +1236,9 @@ "cell_type": "code", "execution_count": 25, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1104,7 +1272,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "These numbers are difficult to interpret without some measure of their uncertainty.\n", "We can compute these uncertainties quickly using bootstrap resamplings of the data:" @@ -1114,7 +1285,9 @@ "cell_type": "code", "execution_count": 26, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -1126,7 +1299,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "With these errors estimated, let's again look at the results:" ] @@ -1135,7 +1311,9 @@ "cell_type": "code", "execution_count": 27, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1166,7 +1344,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "We first see that there is a relatively stable trend in the weekly baseline: there are many more riders on weekdays than on weekends and holidays.\n", "We see that for each additional hour of daylight, 129 ± 
9 more people choose to ride; a temperature increase of one degree Celsius encourages 65 ± 4 people to grab their bicycle; a dry day means an average of 548 ± 33 more riders, and each inch of precipitation means 665 ± 62 more people leave their bike at home.\n", @@ -1179,10 +1360,15 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", - "< [In Depth: Naive Bayes Classification](05.05-Naive-Bayes.ipynb) | [Contents](Index.ipynb) | [In-Depth: Support Vector Machines](05.07-Support-Vector-Machines.ipynb) >" + "< [In Depth: Naive Bayes Classification](05.05-Naive-Bayes.ipynb) | [Contents](Index.ipynb) | [In-Depth: Support Vector Machines](05.07-Support-Vector-Machines.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/05.07-Support-Vector-Machines.ipynb b/notebooks/05.07-Support-Vector-Machines.ipynb index d3ae901a9..31cf9508b 100644 --- a/notebooks/05.07-Support-Vector-Machines.ipynb +++ b/notebooks/05.07-Support-Vector-Machines.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [In Depth: Linear Regression](05.06-Linear-Regression.ipynb) | [Contents](Index.ipynb) | [In-Depth: Decision Trees and Random Forests](05.08-Random-Forests.ipynb) >" + "< [In Depth: Linear Regression](05.06-Linear-Regression.ipynb) | [Contents](Index.ipynb) | [In-Depth: Decision Trees and Random Forests](05.08-Random-Forests.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -1010,7 +1013,9 @@ "metadata": {}, "source": [ "\n", - "< [In Depth: Linear Regression](05.06-Linear-Regression.ipynb) | [Contents](Index.ipynb) | [In-Depth: Decision Trees and Random Forests](05.08-Random-Forests.ipynb) >" + "< [In Depth: Linear Regression](05.06-Linear-Regression.ipynb) | [Contents](Index.ipynb) | [In-Depth: Decision Trees and Random Forests](05.08-Random-Forests.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/05.08-Random-Forests.ipynb b/notebooks/05.08-Random-Forests.ipynb index 74da6a281..f567f238e 100644 --- a/notebooks/05.08-Random-Forests.ipynb +++ b/notebooks/05.08-Random-Forests.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [In-Depth: Support Vector Machines](05.07-Support-Vector-Machines.ipynb) | [Contents](Index.ipynb) | [In Depth: Principal Component Analysis](05.09-Principal-Component-Analysis.ipynb) >" + "< [In-Depth: Support Vector Machines](05.07-Support-Vector-Machines.ipynb) | [Contents](Index.ipynb) | [In Depth: Principal Component Analysis](05.09-Principal-Component-Analysis.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -722,7 +725,9 @@ "metadata": {}, "source": [ "\n", - "< [In-Depth: Support Vector Machines](05.07-Support-Vector-Machines.ipynb) | [Contents](Index.ipynb) | [In Depth: Principal Component Analysis](05.09-Principal-Component-Analysis.ipynb) >" + "< [In-Depth: Support Vector Machines](05.07-Support-Vector-Machines.ipynb) | [Contents](Index.ipynb) | [In Depth: Principal Component Analysis](05.09-Principal-Component-Analysis.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/05.09-Principal-Component-Analysis.ipynb b/notebooks/05.09-Principal-Component-Analysis.ipynb index 8bc761f73..065b1f4a7 100644 --- a/notebooks/05.09-Principal-Component-Analysis.ipynb +++ b/notebooks/05.09-Principal-Component-Analysis.ipynb @@ -2,10 +2,14 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -13,18 +17,31 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", - "< [In-Depth: Decision Trees and Random Forests](05.08-Random-Forests.ipynb) | [Contents](Index.ipynb) | [In-Depth: Manifold Learning](05.10-Manifold-Learning.ipynb) >" + "< [In-Depth: Decision Trees and Random Forests](05.08-Random-Forests.ipynb) | [Contents](Index.ipynb) | [In-Depth: Manifold Learning](05.10-Manifold-Learning.ipynb) >\n", + "\n", + "\"Open\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# In Depth: Principal Component Analysis\n", - "\n", + "# In Depth: Principal Component Analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ "Up until now, we have been looking in depth at supervised learning estimators: those estimators that predict labels based on labeled training data.\n", "Here we begin looking at several unsupervised estimators, which can highlight interesting aspects of the data without reference to any known labels.\n", "\n", @@ -39,7 +56,9 @@ "cell_type": "code", "execution_count": 1, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -51,7 +70,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Introducing Principal Component Analysis\n", "\n", @@ -64,7 +86,9 @@ "cell_type": "code", "execution_count": 2, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -87,7 +111,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "By eye, it is clear that there is a nearly linear relationship between the x and y variables.\n", "This is reminiscent of the linear regression data we explored in [In Depth: Linear Regression](05.06-Linear-Regression.ipynb), but the problem setting here is slightly different: rather than attempting to *predict* the y values from the x values, the unsupervised learning problem attempts to learn about the *relationship* between the x and y values.\n", @@ -100,7 +127,9 @@ "cell_type": "code", "execution_count": 3, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -122,7 +151,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The fit learns some quantities from the data, most importantly the \"components\" and \"explained variance\":" ] @@ -131,7 +163,9 @@ "cell_type": "code", "execution_count": 4, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -151,7 +185,9 @@ "cell_type": "code", "execution_count": 5, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -168,7 +204,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "To see what these numbers mean, let's visualize them as vectors over the input data, using the \"components\" to define the direction of the vector, and the \"explained variance\" to define the squared-length of the 
vector:" ] @@ -177,7 +216,9 @@ "cell_type": "code", "execution_count": 6, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -209,7 +250,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "These vectors represent the *principal axes* of the data, and the length of the vector is an indication of how \"important\" that axis is in describing the distribution of the data—more precisely, it is a measure of the variance of the data when projected onto that axis.\n", "The projection of each data point onto the principal axes are the \"principal components\" of the data.\n", @@ -219,7 +263,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "![](figures/05.09-PCA-rotation.png)\n", "[figure source in Appendix](06.00-Figure-Code.ipynb#Principal-Components-Rotation)" @@ -227,7 +274,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "This transformation from data axes to principal axes is an *affine transformation*, which basically means it is composed of a translation, rotation, and uniform scaling.\n", "\n", @@ -236,7 +286,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### PCA as dimensionality reduction\n", "\n", @@ -249,7 +302,9 @@ "cell_type": "code", "execution_count": 7, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -271,7 +326,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The transformed data has been reduced to a single dimension.\n", "To understand the effect of this dimensionality reduction, we can perform the inverse transform of this reduced data and plot it along with the original data:" @@ -281,7 +339,9 @@ "cell_type": "code", "execution_count": 8, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -304,7 +364,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The light points are the original data, while the dark points are the projected version.\n", "This makes clear what a PCA dimensionality reduction means: the information along the least important principal axis or axes is removed, leaving only the component(s) of the data with the highest variance.\n", @@ -315,7 +378,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### PCA for visualization: Hand-written digits\n", "\n", @@ -329,7 +395,9 @@ "cell_type": "code", "execution_count": 9, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -351,7 +419,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Recall that the data consists of 8×8 pixel images, meaning that they are 64-dimensional.\n", "To gain some intuition into the relationships between these points, we can use PCA to project them to a more manageable number of dimensions, say two:" @@ -361,7 +432,9 @@ "cell_type": "code", "execution_count": 10, "metadata": { - "collapsed": false + "collapsed": 
false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -382,7 +455,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "We can now plot the first two principal components of each point to learn about the data:" ] @@ -391,7 +467,9 @@ "cell_type": "code", "execution_count": 11, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -416,7 +494,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Recall what these components mean: the full data is a 64-dimensional point cloud, and these points are the projection of each data point along the directions with the largest variance.\n", "Essentially, we have found the optimal stretch and rotation in 64-dimensional space that allows us to see the layout of the digits in two dimensions, and have done this in an unsupervised manner—that is, without reference to the labels." @@ -424,7 +505,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### What do the components mean?\n", "\n", @@ -450,7 +534,9 @@ { "cell_type": "markdown", "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "source": [ "![](figures/05.09-digits-pixel-components.png)\n", @@ -459,7 +545,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The upper row of panels shows the individual pixels, and the lower row shows the cumulative contribution of these pixels to the construction of the image.\n", "Using only eight of the pixel-basis components, we can only construct a small portion of the 64-pixel image.\n", @@ -468,7 +557,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "But the pixel-wise representation is not the only choice of basis. 
We can also use other basis functions, which each contain some pre-defined contribution from each pixel, and write something like\n", "\n", @@ -484,7 +576,9 @@ { "cell_type": "markdown", "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "source": [ "![](figures/05.09-digits-pca-components.png)\n", @@ -493,7 +587,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Unlike the pixel basis, the PCA basis allows us to recover the salient features of the input image with just a mean plus eight components!\n", "The amount of each pixel in each component is analogous to the orientation of the vector in our two-dimensional example.\n", @@ -502,7 +599,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Choosing the number of components\n", "\n", @@ -514,7 +614,9 @@ "cell_type": "code", "execution_count": 12, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -537,7 +639,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "This curve quantifies how much of the total, 64-dimensional variance is contained within the first $N$ components.\n", "For example, we see that with the digits the first 10 components contain approximately 75% of the variance, while you need around 50 components to describe close to 100% of the variance.\n", @@ -547,7 +652,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## PCA as Noise Filtering\n", "\n", @@ -563,7 +671,9 @@ "cell_type": "code", "execution_count": 13, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -591,7 +701,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Now let's add some random noise to create a noisy dataset, and re-plot it:" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -622,7 +737,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "It's clear by eye that the images are noisy, and contain spurious pixels.\n", "Let's train a PCA on the noisy data, requesting that the projection preserve 50% of the variance:" @@ -632,7 +750,9 @@ "cell_type": "code", "execution_count": 15, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -653,7 +773,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Here 50% of the variance amounts to 12 principal components.\n", "Now we compute these components, and then use the inverse of the transform to reconstruct the filtered digits:" @@ -663,7 +786,9 @@ "cell_type": "code", "execution_count": 16, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -685,14 +810,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "This signal preserving/noise filtering property makes PCA a very 
useful feature selection routine—for example, rather than training a classifier on very high-dimensional data, you might instead train the classifier on the lower-dimensional representation, which will automatically serve to filter out random noise in the inputs." ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Example: Eigenfaces\n", "\n", @@ -705,7 +836,9 @@ "cell_type": "code", "execution_count": 17, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -727,7 +860,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Let's take a look at the principal axes that span this dataset.\n", "Because this is a large dataset, we will use ``RandomizedPCA``—it contains a randomized method to approximate the first $N$ principal components much more quickly than the standard ``PCA`` estimator, and thus is very useful for high-dimensional data (here, a dimensionality of nearly 3,000).\n", @@ -738,7 +874,9 @@ "cell_type": "code", "execution_count": 18, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -761,7 +899,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "In this case, it can be interesting to visualize the images associated with the first several principal components (these components are technically known as \"eigenvectors,\"\n", "so these types of images are often called \"eigenfaces\").\n", @@ -772,7 +913,9 @@ "cell_type": "code", "execution_count": 19, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -796,7 +939,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The results are very interesting, and give us insight into how the images vary: for example, the first few eigenfaces (from the top left) seem to be associated with the angle of lighting on the face, and later principal vectors seem to be picking out certain features, such as eyes, noses, and lips.\n", "Let's take a look at the cumulative variance of these components to see how much of the data information the projection is preserving:" @@ -806,7 +952,9 @@ "cell_type": "code", "execution_count": 20, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -828,7 +976,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "We see that these 150 components account for just over 90% of the variance.\n", "That would lead us to believe that using these 150 components, we would recover most of the essential characteristics of the data.\n", @@ -839,7 +990,9 @@ "cell_type": "code", "execution_count": 21, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -853,7 +1006,9 @@ "cell_type": "code", "execution_count": 22, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -882,7 +1037,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The top row here shows the input images, while the bottom row 
shows the reconstruction of the images from just 150 of the ~3,000 initial features.\n", "This visualization makes clear why the PCA feature selection used in [In-Depth: Support Vector Machines](05.07-Support-Vector-Machines.ipynb) was so successful: although it reduces the dimensionality of the data by nearly a factor of 20, the projected images contain enough information that we might, by eye, recognize the individuals in the image.\n", @@ -891,7 +1049,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Principal Component Analysis Summary\n", "\n", @@ -910,10 +1071,15 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", - "< [In-Depth: Decision Trees and Random Forests](05.08-Random-Forests.ipynb) | [Contents](Index.ipynb) | [In-Depth: Manifold Learning](05.10-Manifold-Learning.ipynb) >" + "< [In-Depth: Decision Trees and Random Forests](05.08-Random-Forests.ipynb) | [Contents](Index.ipynb) | [In-Depth: Manifold Learning](05.10-Manifold-Learning.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/05.10-Manifold-Learning.ipynb b/notebooks/05.10-Manifold-Learning.ipynb index f1cc3f152..7ec547ba9 100644 --- a/notebooks/05.10-Manifold-Learning.ipynb +++ b/notebooks/05.10-Manifold-Learning.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [In Depth: Principal Component Analysis](05.09-Principal-Component-Analysis.ipynb) | [Contents](Index.ipynb) | [In Depth: k-Means Clustering](05.11-K-Means.ipynb) >" + "< [In Depth: Principal Component Analysis](05.09-Principal-Component-Analysis.ipynb) | [Contents](Index.ipynb) | [In Depth: k-Means Clustering](05.11-K-Means.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -1029,7 +1032,9 @@ "metadata": {}, "source": [ "\n", - "< [In Depth: Principal Component Analysis](05.09-Principal-Component-Analysis.ipynb) | [Contents](Index.ipynb) | [In Depth: k-Means Clustering](05.11-K-Means.ipynb) >" + "< [In Depth: Principal Component Analysis](05.09-Principal-Component-Analysis.ipynb) | [Contents](Index.ipynb) | [In Depth: k-Means Clustering](05.11-K-Means.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/05.11-K-Means.ipynb b/notebooks/05.11-K-Means.ipynb index 981f9b042..8907d80cf 100644 --- a/notebooks/05.11-K-Means.ipynb +++ b/notebooks/05.11-K-Means.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [In-Depth: Manifold Learning](05.10-Manifold-Learning.ipynb) | [Contents](Index.ipynb) | [In Depth: Gaussian Mixture Models](05.12-Gaussian-Mixtures.ipynb) >" + "< [In-Depth: Manifold Learning](05.10-Manifold-Learning.ipynb) | [Contents](Index.ipynb) | [In Depth: Gaussian Mixture Models](05.12-Gaussian-Mixtures.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -973,7 +976,9 @@ "metadata": {}, "source": [ "\n", - "< [In-Depth: Manifold Learning](05.10-Manifold-Learning.ipynb) | [Contents](Index.ipynb) | [In Depth: Gaussian Mixture Models](05.12-Gaussian-Mixtures.ipynb) >" + "< [In-Depth: Manifold Learning](05.10-Manifold-Learning.ipynb) | [Contents](Index.ipynb) | [In Depth: Gaussian Mixture Models](05.12-Gaussian-Mixtures.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/05.12-Gaussian-Mixtures.ipynb b/notebooks/05.12-Gaussian-Mixtures.ipynb index bd660315d..f5c4d7358 100644 --- a/notebooks/05.12-Gaussian-Mixtures.ipynb +++ b/notebooks/05.12-Gaussian-Mixtures.ipynb @@ -2,10 +2,14 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -13,18 +17,31 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", - "< [In Depth: k-Means Clustering](05.11-K-Means.ipynb) | [Contents](Index.ipynb) | [In-Depth: Kernel Density Estimation](05.13-Kernel-Density-Estimation.ipynb) >" + "< [In Depth: k-Means Clustering](05.11-K-Means.ipynb) | [Contents](Index.ipynb) | [In-Depth: Kernel Density Estimation](05.13-Kernel-Density-Estimation.ipynb) >\n", + "\n", + "\"Open\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# In Depth: Gaussian Mixture Models\n", - "\n", + "# In Depth: Gaussian Mixture Models" ] }, { "cell_type": "markdown", "metadata": { + "deletable": true, + "editable": true + }, + "source": [ "The *k*-means clustering model explored in the previous section is simple and relatively easy to understand, but its simplicity leads to practical challenges in its application.\n", "In particular, the non-probabilistic nature of *k*-means and its use of simple distance-from-cluster-center to assign cluster membership leads to poor performance for many real-world situations.\n", "In this section we will take a look at Gaussian mixture models (GMMs), which can be viewed as an extension of the ideas behind *k*-means, but can also be a powerful tool for estimation beyond simple clustering.\n", @@ -36,7 +53,9 @@ "cell_type": "code", "execution_count": 1, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -48,7 +67,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Motivating GMM: Weaknesses of k-Means\n", "\n", @@ -62,7 +84,9 @@ "cell_type": "code", "execution_count": 2, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -77,7 +101,9 @@ "cell_type": "code", "execution_count": 3, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -101,7 +127,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "From an intuitive standpoint, we might expect that the clustering assignment for some points is more certain than others: for example, there appears to be a very slight overlap between the two middle clusters, such that we might not have complete confidence in the cluster assignment of points between them.\n", "Unfortunately, the *k*-means model has no intrinsic measure of probability or uncertainty of cluster assignments (although it may be possible to use a bootstrap approach to estimate this uncertainty).\n", @@ -116,7 +145,9 @@ "cell_type": "code", "execution_count": 4, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -143,7 +174,9 @@ "cell_type": "code", "execution_count": 5, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -164,7 +197,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "An important observation for *k*-means is that these cluster models *must be circular*: *k*-means has no built-in way of accounting 
for oblong or elliptical clusters.\n", "So, for example, if we take the same data and transform it, the cluster assignments end up becoming muddled:" @@ -174,7 +210,9 @@ "cell_type": "code", "execution_count": 6, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -198,7 +236,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "By eye, we recognize that these transformed clusters are non-circular, and thus circular clusters would be a poor fit.\n", "Nevertheless, *k*-means is not flexible enough to account for this, and tries to force-fit the data into four circular clusters.\n", @@ -214,7 +255,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Generalizing E–M: Gaussian Mixture Models\n", "\n", @@ -226,7 +270,9 @@ "cell_type": "code", "execution_count": 7, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -249,7 +295,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "But because GMM contains a probabilistic model under the hood, it is also possible to find probabilistic cluster assignments—in Scikit-Learn this is done using the ``predict_proba`` method.\n", "This returns a matrix of size ``[n_samples, n_clusters]`` which measures the probability that any point belongs to the given cluster:" @@ -259,7 +308,9 @@ "cell_type": "code", "execution_count": 8, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -281,7 +332,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "We can visualize this uncertainty by, for example, making the size of each point proportional to the certainty of its prediction; looking at the following figure, we can see that it is precisely the points at the boundaries between clusters that reflect this uncertainty of cluster assignment:" ] @@ -290,7 +344,9 @@ "cell_type": "code", "execution_count": 9, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -311,7 +367,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Under the hood, a Gaussian mixture model is very similar to *k*-means: it uses an expectation–maximization approach which qualitatively does the following:\n", "\n", @@ -332,7 +391,9 @@ "cell_type": "code", "execution_count": 10, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -372,7 +433,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "With this in place, we can take a look at what the four-component GMM gives us for our initial data:" ] @@ -381,7 +445,9 @@ "cell_type": "code", "execution_count": 11, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -402,7 +468,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Similarly, we can use the GMM approach to fit our stretched dataset; allowing for a full covariance the model 
will fit even very oblong, stretched-out clusters:" ] @@ -411,7 +480,9 @@ "cell_type": "code", "execution_count": 12, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -432,14 +503,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "This makes clear that GMM addresses the two main practical issues with *k*-means encountered before." ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Choosing the covariance type\n", "\n", @@ -454,7 +531,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "![(Covariance Type)](figures/05.12-covariance-type.png)\n", "[figure source in Appendix](06.00-Figure-Code.ipynb#Covariance-Type)" @@ -462,7 +542,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## GMM as *Density Estimation*\n", "\n", @@ -476,7 +559,9 @@ "cell_type": "code", "execution_count": 13, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -498,7 +583,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "If we try to fit this with a two-component GMM viewed as a clustering model, the results are not particularly useful:" ] @@ -507,7 +595,9 @@ "cell_type": "code", "execution_count": 14, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -528,7 +618,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "But if we instead use many more components and ignore the cluster labels, we find a fit that is much closer to the input data:" ] @@ -537,7 +630,9 @@ "cell_type": "code", "execution_count": 15, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -558,7 +653,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Here the mixture of 16 Gaussians serves not to find separated clusters of data, but rather to model the overall *distribution* of the input data.\n", "This is a generative model of the distribution, meaning that the GMM gives us the recipe to generate new random data distributed similarly to our input.\n", @@ -569,7 +667,9 @@ "cell_type": "code", "execution_count": 16, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -590,14 +690,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "GMM is convenient as a flexible means of modeling an arbitrary multi-dimensional distribution of data." 
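As a minimal sketch of this generative usage (with one assumption flagged: in current Scikit-Learn, the ``GMM`` estimator used throughout this notebook has become ``GaussianMixture``), we can fit many components to the moon-shaped data, ignore the labels, and draw fresh samples:

```python
# A sketch assuming the modern API: sklearn.mixture.GaussianMixture replaces
# the older sklearn.mixture.GMM used when this notebook was written.
from sklearn.datasets import make_moons
from sklearn.mixture import GaussianMixture

Xmoon, _ = make_moons(200, noise=0.05, random_state=0)

# Sixteen components here model the overall distribution, not sixteen "clusters"
gmm16 = GaussianMixture(n_components=16, covariance_type='full',
                        random_state=0).fit(Xmoon)

Xnew, _ = gmm16.sample(400)    # 400 new points drawn from the generative model
print(Xnew.shape)              # (400, 2)
print(gmm16.score(Xmoon))      # average per-sample log-likelihood of the fit
```

The same fitted object exposes ``aic()`` and ``bic()`` methods, which is how the number of components is chosen next.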
] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### How many components?\n", "\n", @@ -613,7 +719,9 @@ "cell_type": "code", "execution_count": 17, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -640,7 +748,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The optimal number of clusters is the value that minimizes the AIC or BIC, depending on which approximation we wish to use. The AIC tells us that our choice of 16 components above was probably too many: around 8-12 components would have been a better choice.\n", "As is typical with this sort of problem, the BIC recommends a simpler model.\n", @@ -651,7 +762,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Example: GMM for Generating New Data\n", "\n", @@ -665,7 +779,9 @@ "cell_type": "code", "execution_count": 18, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -687,7 +803,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Next let's plot the first 100 of these to recall exactly what we're looking at:" ] @@ -696,7 +815,9 @@ "cell_type": "code", "execution_count": 19, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -723,7 +844,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "We have nearly 1,800 digits in 64 dimensions, and we can build a GMM on top of these to generate more.\n", "GMMs can have difficulty converging in such a high dimensional space, so we will start with an invertible dimensionality reduction algorithm on the data.\n", @@ -734,7 +858,9 @@ "cell_type": "code", "execution_count": 20, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -757,7 +883,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The result is 41 dimensions, a reduction of nearly 1/3 with almost no information loss.\n", "Given this projected data, let's use the AIC to get a gauge for the number of GMM components we should use:" @@ -767,7 +896,9 @@ "cell_type": "code", "execution_count": 21, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -791,7 +922,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "It appears that around 110 components minimizes the AIC; we will use this model.\n", "Let's quickly fit this to the data and confirm that it has converged:" @@ -801,7 +935,9 @@ "cell_type": "code", "execution_count": 22, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -820,7 +956,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Now we can draw samples of 100 new points within this 41-dimensional projected space, using the GMM as a generative model:" ] @@ -829,7 +968,9 @@ "cell_type": "code", "execution_count": 23, "metadata": { - 
"collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -850,7 +991,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Finally, we can use the inverse transform of the PCA object to construct the new digits:" ] @@ -859,7 +1003,9 @@ "cell_type": "code", "execution_count": 24, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -880,7 +1026,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The results for the most part look like plausible digits from the dataset!\n", "\n", @@ -890,10 +1039,15 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", - "< [In Depth: k-Means Clustering](05.11-K-Means.ipynb) | [Contents](Index.ipynb) | [In-Depth: Kernel Density Estimation](05.13-Kernel-Density-Estimation.ipynb) >" + "< [In Depth: k-Means Clustering](05.11-K-Means.ipynb) | [Contents](Index.ipynb) | [In-Depth: Kernel Density Estimation](05.13-Kernel-Density-Estimation.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/05.13-Kernel-Density-Estimation.ipynb b/notebooks/05.13-Kernel-Density-Estimation.ipynb index 1336ac0ec..5ddf8b59a 100644 --- a/notebooks/05.13-Kernel-Density-Estimation.ipynb +++ b/notebooks/05.13-Kernel-Density-Estimation.ipynb @@ -2,10 +2,14 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -13,20 +17,32 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", - "< [In Depth: Gaussian Mixture Models](05.12-Gaussian-Mixtures.ipynb) | [Contents](Index.ipynb) | [Application: A Face Detection Pipeline](05.14-Image-Features.ipynb) >" + "< [In Depth: Gaussian Mixture Models](05.12-Gaussian-Mixtures.ipynb) | [Contents](Index.ipynb) | [Application: A Face Detection Pipeline](05.14-Image-Features.ipynb) >\n", + "\n", + "\"Open\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# In-Depth: Kernel Density Estimation" ] }, { "cell_type": "markdown", "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "source": [ - "# In-Depth: Kernel Density Estimation\n", - "\n", "In the previous section we covered Gaussian mixture models (GMM), which are a kind of hybrid between a clustering estimator and a density estimator.\n", "Recall that a density estimator is an algorithm which takes a $D$-dimensional dataset and produces an estimate of the $D$-dimensional probability distribution which that data is drawn from.\n", "The GMM algorithm accomplishes this by representing the density as a weighted sum of Gaussian distributions.\n", @@ -40,7 +56,9 @@ "cell_type": "code", "execution_count": 1, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -52,7 +70,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Motivating KDE: Histograms\n", "\n", @@ -67,7 +88,9 @@ "cell_type": "code", "execution_count": 2, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -82,7 +105,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "We have previously seen that the standard count-based histogram can be created with the ``plt.hist()`` function.\n", "By specifying the ``normed`` parameter of the histogram, we end up with a normalized histogram where the height of the bins does not reflect counts, but instead reflects probability density:" @@ -92,7 +118,9 @@ "cell_type": "code", "execution_count": 3, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -112,7 +140,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Notice that for equal binning, this normalization simply changes the scale on the y-axis, leaving the relative heights essentially the same as in a histogram built from counts.\n", "This normalization is chosen so that the total area under the histogram is equal to 1, as we can confirm by looking at the output of the histogram function:" @@ -122,7 +153,9 @@ "cell_type": "code", "execution_count": 4, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -144,7 +177,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "One of the issues with using a histogram as a density estimator is that the choice of bin size and location can lead to 
representations that have qualitatively different features.\n", "For example, if we look at a version of this data with only 20 points, the choice of how to draw the bins can lead to an entirely different interpretation of the data!\n", @@ -155,7 +191,9 @@ "cell_type": "code", "execution_count": 5, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -167,7 +205,9 @@ "cell_type": "code", "execution_count": 6, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -195,7 +235,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "On the left, the histogram makes clear that this is a bimodal distribution.\n", "On the right, we see a unimodal distribution with a long tail.\n", @@ -210,7 +253,9 @@ "cell_type": "code", "execution_count": 7, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -249,7 +294,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The problem with our two binnings stems from the fact that the height of the block stack often reflects not the actual density of points nearby, but coincidences of how the bins align with the data points.\n", "This mis-alignment between points and their blocks is a potential cause of the poor histogram results seen here.\n", @@ -262,7 +310,9 @@ "cell_type": "code", "execution_count": 8, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -288,7 +338,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The result looks a bit messy, but is a much more robust reflection of the actual data characteristics than is the standard histogram.\n", "Still, the rough edges are not aesthetically pleasing, nor are they reflective of any true properties of the data.\n", @@ -300,7 +353,9 @@ "cell_type": "code", "execution_count": 9, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -327,7 +382,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "This smoothed-out plot, with a Gaussian distribution contributed at the location of each input point, gives a much more accurate idea of the shape of the data distribution, and one which has much less variance (i.e., changes much less in response to differences in sampling).\n", "\n", @@ -337,7 +395,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Kernel Density Estimation in Practice\n", "\n", @@ -356,7 +417,9 @@ "cell_type": "code", "execution_count": 10, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -397,14 +460,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The result here is normalized such that the area under the curve is equal to 1."
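As a quick check of that normalization claim, here is a minimal sketch of the ``KernelDensity`` pattern just described, on illustrative bimodal data (not the notebook's exact sample): ``score_samples`` returns the *log* density, and the exponentiated curve integrates to approximately 1.

```python
# A minimal sketch with illustrative data: fit KernelDensity, evaluate the
# log-density on a grid, and confirm the density integrates to ~1.
import numpy as np
from sklearn.neighbors import KernelDensity

rng = np.random.RandomState(42)
x = np.concatenate([rng.normal(-5, 1, 300), rng.normal(5, 1, 700)])

kde = KernelDensity(bandwidth=1.0, kernel='gaussian')
kde.fit(x[:, None])

x_d = np.linspace(-10, 10, 2000)
logprob = kde.score_samples(x_d[:, None])   # log of the probability density

print(np.trapz(np.exp(logprob), x_d))       # ~1.0, up to truncation of the tails
```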
] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Selecting the bandwidth via cross-validation\n", "\n", @@ -422,7 +491,9 @@ "cell_type": "code", "execution_count": 11, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -438,7 +509,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Now we can find the choice of bandwidth which maximizes the score (which in this case defaults to the log-likelihood):" ] @@ -447,7 +521,9 @@ "cell_type": "code", "execution_count": 12, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -467,14 +543,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The optimal bandwidth happens to be very close to what we used in the example plot earlier, where the bandwidth was 1.0 (i.e., the default width of ``scipy.stats.norm``)." ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Example: KDE on a Sphere\n", "\n", @@ -491,7 +573,9 @@ "cell_type": "code", "execution_count": 13, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -508,7 +592,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "With this data loaded, we can use the Basemap toolkit (mentioned previously in [Geographic Data with Basemap](04.13-Geographic-Data-With-Basemap.ipynb)) to plot the observed locations of these two species on the map of South America." ] @@ -517,7 +604,9 @@ "cell_type": "code", "execution_count": 14, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -553,7 +642,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Unfortunately, this doesn't give a very good idea of the density of the species, because points in the species range may overlap one another.\n", "You may not realize it by looking at this plot, but there are over 1,600 points shown here!\n", @@ -568,7 +660,9 @@ "cell_type": "code", "execution_count": 15, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -623,14 +717,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Compared to the simple scatter plot we initially used, this visualization paints a much clearer picture of the geographical distribution of observations of these two species." 
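The essential pattern behind that map, as a minimal sketch with hypothetical coordinates standing in for the species observations: the haversine metric expects (latitude, longitude) pairs in *radians* and requires the ball-tree backend.

```python
# A minimal sketch (hypothetical sightings): KDE on the sphere via the
# haversine metric; inputs are (lat, lon) pairs converted to radians.
import numpy as np
from sklearn.neighbors import KernelDensity

latlon = np.array([[-5.0, -60.0], [-6.0, -61.5], [-4.5, -59.0]])  # degrees
X = np.radians(latlon)

kde = KernelDensity(bandwidth=0.03, metric='haversine',
                    kernel='gaussian', algorithm='ball_tree')
kde.fit(X)

grid = np.radians([[-5.0, -60.0], [40.0, 100.0]])   # near the data, then far away
print(np.exp(kde.score_samples(grid)))              # high density, then ~0
```

Evaluating ``score_samples`` over a full latitude–longitude grid is what produces filled density maps like the one above.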
] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Example: Not-So-Naive Bayes\n", "\n", @@ -662,7 +762,9 @@ "cell_type": "code", "execution_count": 16, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -705,14 +807,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### The anatomy of a custom estimator" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Let's step through this code and discuss the essential features:\n", "\n", @@ -738,7 +846,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Next comes the class initialization method:\n", "\n", @@ -756,7 +867,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Next comes the ``fit()`` method, where we handle training data:\n", "\n", @@ -783,7 +897,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Finally, we have the logic for predicting labels on new data:\n", "```python\n", @@ -804,7 +921,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Using our custom estimator\n", "\n", @@ -816,7 +936,9 @@ "cell_type": "code", "execution_count": 17, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -834,7 +956,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Next we can plot the cross-validation score as a function of bandwidth:" ] @@ -843,7 +968,9 @@ "cell_type": "code", "execution_count": 18, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -876,7 +1003,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "We see that this not-so-naive Bayesian classifier reaches a cross-validation accuracy of just over 96%; this is compared to around 80% for the naive Bayesian classification:" ] @@ -885,7 +1015,9 @@ "cell_type": "code", "execution_count": 19, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -907,7 +1039,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "One benefit of such a generative classifier is interpretability of results: for each unknown sample, we not only get a probabilistic classification, but a *full model* of the distribution of points we are comparing it to!\n", "If desired, this offers an intuitive window into the reasons for a particular classification that algorithms like SVMs and random forests tend to obscure.\n", @@ -922,10 +1057,15 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", - "< [In Depth: Gaussian Mixture Models](05.12-Gaussian-Mixtures.ipynb) | [Contents](Index.ipynb) | [Application: A Face Detection Pipeline](05.14-Image-Features.ipynb) >" + "< [In Depth: Gaussian Mixture 
Models](05.12-Gaussian-Mixtures.ipynb) | [Contents](Index.ipynb) | [Application: A Face Detection Pipeline](05.14-Image-Features.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/05.14-Image-Features.ipynb b/notebooks/05.14-Image-Features.ipynb index b745b3079..47ddff7da 100644 --- a/notebooks/05.14-Image-Features.ipynb +++ b/notebooks/05.14-Image-Features.ipynb @@ -6,6 +6,7 @@ "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -16,7 +17,9 @@ "metadata": {}, "source": [ "\n", - "< [In-Depth: Kernel Density Estimation](05.13-Kernel-Density-Estimation.ipynb) | [Contents](Index.ipynb) | [Further Machine Learning Resources](05.15-Learning-More.ipynb) >" + "< [In-Depth: Kernel Density Estimation](05.13-Kernel-Density-Estimation.ipynb) | [Contents](Index.ipynb) | [Further Machine Learning Resources](05.15-Learning-More.ipynb) >\n", + "\n", + "\"Open\n" ] }, { @@ -662,7 +665,9 @@ "metadata": {}, "source": [ "\n", - "< [In-Depth: Kernel Density Estimation](05.13-Kernel-Density-Estimation.ipynb) | [Contents](Index.ipynb) | [Further Machine Learning Resources](05.15-Learning-More.ipynb) >" + "< [In-Depth: Kernel Density Estimation](05.13-Kernel-Density-Estimation.ipynb) | [Contents](Index.ipynb) | [Further Machine Learning Resources](05.15-Learning-More.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/05.15-Learning-More.ipynb b/notebooks/05.15-Learning-More.ipynb index 9bac85ebc..17d8cc77c 100644 --- a/notebooks/05.15-Learning-More.ipynb +++ b/notebooks/05.15-Learning-More.ipynb @@ -2,10 +2,14 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
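The "Not-So-Naive Bayes" cells above walk through the estimator piece by piece: `__init__` only stores parameters, `fit()` learns one `KernelDensity` model and one log-prior per class, and `predict_proba()` exponentiates and normalizes the class-conditional log-likelihoods plus log-priors. Consolidated in one place, a sketch of that design looks like the following; it mirrors the anatomy described in the cells, but the digits grid search at the end uses an assumed bandwidth range.

```python
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.neighbors import KernelDensity

class KDEClassifier(BaseEstimator, ClassifierMixin):
    """Bayesian generative classification based on KDE."""
    def __init__(self, bandwidth=1.0, kernel='gaussian'):
        # store the arguments unchanged so get_params/set_params work
        self.bandwidth = bandwidth
        self.kernel = kernel

    def fit(self, X, y):
        # one KDE model and one log-prior per class
        self.classes_ = np.sort(np.unique(y))
        training_sets = [X[y == yi] for yi in self.classes_]
        self.models_ = [KernelDensity(bandwidth=self.bandwidth,
                                      kernel=self.kernel).fit(Xi)
                        for Xi in training_sets]
        self.logpriors_ = [np.log(Xi.shape[0] / X.shape[0])
                           for Xi in training_sets]
        return self

    def predict_proba(self, X):
        # posterior is proportional to exp(log-likelihood + log-prior)
        logprobs = np.array([model.score_samples(X)
                             for model in self.models_]).T
        result = np.exp(logprobs + self.logpriors_)
        return result / result.sum(axis=1, keepdims=True)

    def predict(self, X):
        return self.classes_[np.argmax(self.predict_proba(X), axis=1)]

# Usage on the digits data, as in the cells above; the bandwidth
# grid here is an assumption for illustration.
from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV

digits = load_digits()
grid = GridSearchCV(KDEClassifier(),
                    {'bandwidth': 10 ** np.linspace(0, 2, 100)})
grid.fit(digits.data, digits.target)
print(grid.best_params_, grid.best_score_)  # just over 96%, per the text above
```

Because `ClassifierMixin` supplies an accuracy-based `score` method, the same `GridSearchCV` machinery used for bandwidth selection in density estimation works unchanged here.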
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -13,18 +17,31 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", - "< [Application: A Face Detection Pipeline](05.14-Image-Features.ipynb) | [Contents](Index.ipynb) | [Appendix: Figure Code](06.00-Figure-Code.ipynb) >" + "< [Application: A Face Detection Pipeline](05.14-Image-Features.ipynb) | [Contents](Index.ipynb) | [Appendix: Figure Code](06.00-Figure-Code.ipynb) >\n", + "\n", + "\"Open\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Further Machine Learning Resources\n", - "\n", + "# Further Machine Learning Resources" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ "This chapter has been a quick tour of machine learning in Python, primarily using the tools within the Scikit-Learn library.\n", "As long as the chapter is, it is still too short to cover many interesting and important algorithms, approaches, and discussions.\n", "Here I want to suggest some resources to learn more about machine learning for those who are interested." @@ -32,7 +49,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Machine Learning in Python\n", "\n", @@ -49,7 +69,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## General Machine Learning\n", "\n", @@ -67,10 +90,15 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", - "< [Application: A Face Detection Pipeline](05.14-Image-Features.ipynb) | [Contents](Index.ipynb) | [Appendix: Figure Code](06.00-Figure-Code.ipynb) >" + "< [Application: A Face Detection Pipeline](05.14-Image-Features.ipynb) | [Contents](Index.ipynb) | [Appendix: Figure Code](06.00-Figure-Code.ipynb) >\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/06.00-Figure-Code.ipynb b/notebooks/06.00-Figure-Code.ipynb index cfefaf4f5..73940a1c1 100644 --- a/notebooks/06.00-Figure-Code.ipynb +++ b/notebooks/06.00-Figure-Code.ipynb @@ -2,10 +2,14 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -13,18 +17,31 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", - "< [Further Machine Learning Resources](05.15-Learning-More.ipynb) | [Contents](Index.ipynb) |" + "< [Further Machine Learning Resources](05.15-Learning-More.ipynb) | [Contents](Index.ipynb) |\n", + "\n", + "\"Open\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Appendix: Figure Code\n", - "\n", + "# Appendix: Figure Code" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ "Many of the figures used throughout this text are created in-place by code that appears in print.\n", "In a few cases, however, the required code is long enough (or not immediately relevant enough) that we instead put it here for reference." ] @@ -33,7 +50,9 @@ "cell_type": "code", "execution_count": 1, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -47,7 +66,9 @@ "cell_type": "code", "execution_count": 2, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -58,7 +79,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Broadcasting\n", "\n", @@ -69,7 +93,9 @@ "cell_type": "code", "execution_count": 3, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -275,7 +301,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Aggregation and Grouping\n", "\n", @@ -284,7 +313,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Split-Apply-Combine" ] @@ -293,7 +325,9 @@ "cell_type": "code", "execution_count": 4, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -418,7 +452,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## What Is Machine Learning?" 
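Two of the appendix figures referenced above, NumPy broadcasting and pandas' split-apply-combine, depict operations that are easy to demonstrate directly. A minimal sketch (the toy array and frame are assumptions for illustration):

```python
import numpy as np
import pandas as pd

# Broadcasting: shapes (3,) and (3, 1) stretch to a common (3, 3)
a = np.arange(3)
b = np.arange(3)[:, np.newaxis]
print(a + b)

# Split-apply-combine: split rows by key, apply a sum, combine results
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data': range(6)})
print(df.groupby('key').sum())
```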
] @@ -427,7 +464,9 @@ "cell_type": "code", "execution_count": 5, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -442,7 +481,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Classification Example Figures\n", "\n", @@ -455,7 +497,9 @@ "cell_type": "code", "execution_count": 6, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -481,7 +525,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "#### Classification Example Figure 1" ] @@ -490,7 +537,9 @@ "cell_type": "code", "execution_count": 7, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -519,7 +568,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "#### Classification Example Figure 2" ] @@ -528,7 +580,9 @@ "cell_type": "code", "execution_count": 8, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -567,7 +621,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "#### Classification Example Figure 3" ] @@ -576,7 +633,9 @@ "cell_type": "code", "execution_count": 9, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -610,7 +669,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Regression Example Figures\n", "\n", @@ -623,7 +685,9 @@ "cell_type": "code", "execution_count": 10, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -648,7 +712,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "#### Regression Example Figure 1" ] @@ -657,7 +724,9 @@ "cell_type": "code", "execution_count": 11, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -686,7 +755,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "#### Regression Example Figure 2" ] @@ -695,7 +767,9 @@ "cell_type": "code", "execution_count": 12, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -750,7 +824,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "#### Regression Example Figure 3" ] @@ -759,7 +836,9 @@ "cell_type": "code", "execution_count": 13, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -799,7 +878,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "#### Regression Example Figure 4" ] @@ -808,7 +890,9 @@ "cell_type": "code", "execution_count": 14, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -843,7 +927,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": 
true, + "editable": true + }, "source": [ "### Clustering Example Figures\n", "\n", @@ -856,7 +943,9 @@ "cell_type": "code", "execution_count": 15, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -874,7 +963,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "#### Clustering Example Figure 1" ] @@ -883,7 +975,9 @@ "cell_type": "code", "execution_count": 16, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -910,7 +1004,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "#### Clustering Example Figure 2" ] @@ -919,7 +1016,9 @@ "cell_type": "code", "execution_count": 17, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -946,7 +1045,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Dimensionality Reduction Example Figures\n", "\n", @@ -957,7 +1059,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "#### Dimensionality Reduction Example Figure 1" ] @@ -966,7 +1071,9 @@ "cell_type": "code", "execution_count": 18, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -999,7 +1106,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "#### Dimensionality Reduction Example Figure 2" ] @@ -1008,7 +1118,9 @@ "cell_type": "code", "execution_count": 19, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1043,14 +1155,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Introducing Scikit-Learn" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Features and Labels Grid\n", "\n", @@ -1061,7 +1179,9 @@ "cell_type": "code", "execution_count": 20, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1104,14 +1224,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Hyperparameters and Model Validation" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Cross-Validation Figures" ] @@ -1120,7 +1246,9 @@ "cell_type": "code", "execution_count": 21, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -1138,7 +1266,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "#### 2-Fold Cross-Validation" ] @@ -1147,7 +1278,9 @@ "cell_type": "code", "execution_count": 22, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1172,7 +1305,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "#### 5-Fold Cross-Validation" ] @@ -1181,7 +1317,9 @@ 
"cell_type": "code", "execution_count": 23, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1206,7 +1344,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Overfitting and Underfitting" ] @@ -1215,7 +1356,9 @@ "cell_type": "code", "execution_count": 24, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -1235,7 +1378,9 @@ "cell_type": "code", "execution_count": 25, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -1250,7 +1395,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "#### Bias-Variance Tradeoff" ] @@ -1259,7 +1407,9 @@ "cell_type": "code", "execution_count": 26, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1297,7 +1447,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "#### Bias-Variance Tradeoff Metrics" ] @@ -1306,7 +1459,9 @@ "cell_type": "code", "execution_count": 27, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1351,7 +1506,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "#### Validation Curve" ] @@ -1360,7 +1518,9 @@ "cell_type": "code", "execution_count": 28, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1406,7 +1566,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "#### Learning Curve" ] @@ -1415,7 +1578,9 @@ "cell_type": "code", "execution_count": 29, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1460,7 +1625,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Gaussian Naive Bayes\n", "\n", @@ -1473,7 +1641,9 @@ "cell_type": "code", "execution_count": 30, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1522,7 +1692,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Linear Regression\n", "\n", @@ -1535,7 +1708,9 @@ "cell_type": "code", "execution_count": 31, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1610,7 +1785,9 @@ { "cell_type": "markdown", "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "source": [ "## Random Forests" @@ -1618,7 +1795,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Helper Code\n", "\n", @@ -1629,7 +1809,9 @@ "cell_type": "code", "execution_count": 32, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1729,7 +1911,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Decision Tree 
Example" ] @@ -1738,7 +1923,9 @@ "cell_type": "code", "execution_count": 33, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1793,7 +1980,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Decision Tree Levels" ] @@ -1802,7 +1992,9 @@ "cell_type": "code", "execution_count": 34, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1838,7 +2030,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Decision Tree Overfitting" ] @@ -1847,7 +2042,9 @@ "cell_type": "code", "execution_count": 35, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1874,14 +2071,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Principal Component Analysis" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Principal Components Rotation" ] @@ -1890,7 +2093,9 @@ "cell_type": "code", "execution_count": 36, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -1901,7 +2106,9 @@ "cell_type": "code", "execution_count": 37, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -1917,7 +2124,9 @@ "cell_type": "code", "execution_count": 38, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1963,7 +2172,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Digits Pixel Components" ] @@ -1972,7 +2184,9 @@ "cell_type": "code", "execution_count": 39, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -2024,7 +2238,9 @@ "cell_type": "code", "execution_count": 40, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -2052,7 +2268,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Digits PCA Components" ] @@ -2061,7 +2280,9 @@ "cell_type": "code", "execution_count": 41, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -2087,14 +2308,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Manifold Learning" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### LLE vs MDS Linkages" ] @@ -2103,7 +2330,9 @@ "cell_type": "code", "execution_count": 42, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -2133,7 +2362,9 @@ "cell_type": "code", "execution_count": 43, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -2153,7 +2384,9 @@ "cell_type": "code", "execution_count": 44, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": 
true, + "editable": true }, "outputs": [ { @@ -2199,7 +2432,9 @@ { "cell_type": "markdown", "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "source": [ "## K-Means" @@ -2208,7 +2443,9 @@ { "cell_type": "markdown", "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "source": [ "### Expectation-Maximization\n", @@ -2222,7 +2459,9 @@ "cell_type": "code", "execution_count": 45, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -2310,7 +2549,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Interactive K-Means\n", "\n", @@ -2322,7 +2564,9 @@ "cell_type": "code", "execution_count": 46, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -2409,14 +2653,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Gaussian Mixture Models" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Covariance Type\n", "\n", @@ -2427,7 +2677,9 @@ "cell_type": "code", "execution_count": 47, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -2486,10 +2738,15 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "\n", - "< [Further Machine Learning Resources](05.15-Learning-More.ipynb) | [Contents](Index.ipynb) |" + "< [Further Machine Learning Resources](05.15-Learning-More.ipynb) | [Contents](Index.ipynb) |\n", + "\n", + "\"Open\n" ] } ], diff --git a/notebooks/Index.ipynb b/notebooks/Index.ipynb index 32d26c71d..a368faa84 100644 --- a/notebooks/Index.ipynb +++ b/notebooks/Index.ipynb @@ -4,8 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Python Data Science Handbook\n", - "\n", + "# Python Data Science Handbook" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ "*Jake VanderPlas*\n", "\n", "![Book Cover](figures/PDSH-cover.png)" diff --git a/requirements.txt b/requirements.txt index cf1575e30..fe9cdd918 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,5 +12,4 @@ line_profiler memory_profiler numexpr pandas-datareader -basemap netcdf4 diff --git a/tools/add_book_info.py b/tools/add_book_info.py index 9788b9d21..ae9ca2be5 100644 --- a/tools/add_book_info.py +++ b/tools/add_book_info.py @@ -11,6 +11,7 @@ BOOK_INFO = BOOK_COMMENT + """ + *This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).* *The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*""" diff --git a/tools/add_navigation.py b/tools/add_navigation.py index febbd31ea..0fec360d0 100644 --- a/tools/add_navigation.py +++ b/tools/add_navigation.py @@ -19,6 +19,11 @@ def prev_this_next(it): NEXT_TEMPLATE = " [{title}]({url}) >" NAV_COMMENT = "\n" +COLAB_LINK = """ + +Open in Colab +""" + def iter_navbars(): for prev_nb, nb, next_nb in prev_this_next(iter_notebooks()): @@ -30,6 +35,9 @@ def iter_navbars(): if next_nb: navbar += NEXT_TEMPLATE.format(title=get_notebook_title(next_nb), url=next_nb) + + navbar += COLAB_LINK.format(notebook_filename=os.path.basename(nb)) + yield os.path.join(NOTEBOOK_DIR, nb), navbar diff --git a/website/.gitignore b/website/.gitignore new file mode 100644 index 000000000..1af125fab --- /dev/null +++ b/website/.gitignore @@ -0,0 +1,4 @@ +content/pages/*.md +output +content/figures +content/notebooks \ No newline at end of file diff --git a/website/Makefile b/website/Makefile new file mode 100644 index 000000000..b419e74ea --- /dev/null +++ b/website/Makefile @@ -0,0 +1,132 @@ +PY?=python3 +PELICAN?=pelican +PELICANOPTS= + +BASEDIR=$(CURDIR) +INPUTDIR=$(BASEDIR)/content +OUTPUTDIR=$(BASEDIR)/output +CONFFILE=$(BASEDIR)/pelicanconf.py +PUBLISHCONF=$(BASEDIR)/publishconf.py + +FTP_HOST=localhost +FTP_USER=anonymous +FTP_TARGET_DIR=/ + +SSH_HOST=localhost +SSH_PORT=22 +SSH_USER=root +SSH_TARGET_DIR=/var/www + +S3_BUCKET=my_s3_bucket + +CLOUDFILES_USERNAME=my_rackspace_username +CLOUDFILES_API_KEY=my_rackspace_api_key +CLOUDFILES_CONTAINER=my_cloudfiles_container + +DROPBOX_DIR=~/Dropbox/Public/ + +GITHUB_PAGES_REMOTE=git@github.com:jakevdp/PythonDataScienceHandbook.git +GITHUB_PAGES_BRANCH=gh-pages + +GIT_COMMIT_HASH = $(shell git rev-parse HEAD) + +DEBUG ?= 0 +ifeq ($(DEBUG), 1) + PELICANOPTS += -D +endif + +RELATIVE ?= 0 +ifeq ($(RELATIVE), 1) + PELICANOPTS += --relative-urls +endif + + +help: + @echo 'Makefile for a pelican Web site                                           ' + @echo '                                                                          ' + @echo 'Usage:                                                                    ' + @echo '   make html                           (re)generate the web site          ' + @echo '   make clean                          remove the generated files         ' + @echo '   make regenerate                     regenerate files upon modification ' + @echo '   make publish                        generate using production settings ' + @echo '   make serve [PORT=8000]              serve site at http://localhost:8000' + @echo '   make serve-global [SERVER=0.0.0.0]  serve (as root) to $(SERVER):80    ' + @echo '   make devserver [PORT=8000]          start/restart develop_server.sh    ' + @echo '   make stopserver                     stop local server                  ' + @echo '   make ssh_upload                     upload the web site via SSH        ' + @echo '   make rsync_upload                   upload the web site via rsync+ssh  ' + @echo '   make dropbox_upload                 upload the web site via Dropbox    ' + @echo '   make ftp_upload                     upload the web site via FTP        ' + @echo '   make s3_upload                      upload the web site via S3         ' + @echo '   make cf_upload                      upload the web site via Cloud Files' + @echo '   make publish-to-github              upload the web site via gh-pages   ' + @echo '                                                                          ' + @echo 'Set the DEBUG variable to 1 to enable debugging, e.g. make DEBUG=1 html   ' + @echo 'Set the RELATIVE variable to 1 to enable relative urls                    ' + @echo '                                                                          ' + +html: + $(PELICAN) $(INPUTDIR) -o $(OUTPUTDIR) -s $(CONFFILE) $(PELICANOPTS) + +clean: + [ !
-d $(OUTPUTDIR) ] || rm -rf $(OUTPUTDIR) + +regenerate: + $(PELICAN) -r $(INPUTDIR) -o $(OUTPUTDIR) -s $(CONFFILE) $(PELICANOPTS) + +serve: +ifdef PORT + cd $(OUTPUTDIR) && $(PY) -m pelican.server $(PORT) +else + cd $(OUTPUTDIR) && $(PY) -m pelican.server +endif + +serve-global: +ifdef SERVER + cd $(OUTPUTDIR) && $(PY) -m pelican.server 80 $(SERVER) +else + cd $(OUTPUTDIR) && $(PY) -m pelican.server 80 0.0.0.0 +endif + + +devserver: +ifdef PORT + $(BASEDIR)/develop_server.sh restart $(PORT) +else + $(BASEDIR)/develop_server.sh restart +endif + +stopserver: + $(BASEDIR)/develop_server.sh stop + @echo 'Stopped Pelican and SimpleHTTPServer processes running in background.' + +publish: + $(PELICAN) $(INPUTDIR) -o $(OUTPUTDIR) -s $(PUBLISHCONF) $(PELICANOPTS) + +ssh_upload: publish + scp -P $(SSH_PORT) -r $(OUTPUTDIR)/* $(SSH_USER)@$(SSH_HOST):$(SSH_TARGET_DIR) + +rsync_upload: publish + rsync -e "ssh -p $(SSH_PORT)" -P -rvzc --delete $(OUTPUTDIR)/ $(SSH_USER)@$(SSH_HOST):$(SSH_TARGET_DIR) --cvs-exclude + +dropbox_upload: publish + cp -r $(OUTPUTDIR)/* $(DROPBOX_DIR) + +ftp_upload: publish + lftp ftp://$(FTP_USER)@$(FTP_HOST) -e "mirror -R $(OUTPUTDIR) $(FTP_TARGET_DIR) ; quit" + +s3_upload: publish + s3cmd sync $(OUTPUTDIR)/ s3://$(S3_BUCKET) --acl-public --delete-removed --guess-mime-type --no-mime-magic --no-preserve + +cf_upload: publish + cd $(OUTPUTDIR) && swift -v -A https://auth.api.rackspacecloud.com/v1.0 -U $(CLOUDFILES_USERNAME) -K $(CLOUDFILES_API_KEY) upload -c $(CLOUDFILES_CONTAINER) . + +publish-to-github: publish + ghp-import -n -m "publish-to-github from $(GIT_COMMIT_HASH)" -b blog-build $(OUTPUTDIR) + git push $(GITHUB_PAGES_REMOTE) blog-build:$(GITHUB_PAGES_BRANCH) + +publish-to-github-force: publish + ghp-import -n -m "publish-to-github-force from $(GIT_COMMIT_HASH)" -b blog-build $(OUTPUTDIR) + git push -f $(GITHUB_PAGES_REMOTE) blog-build:$(GITHUB_PAGES_BRANCH) + +.PHONY: html help clean regenerate serve serve-global devserver stopserver publish ssh_upload rsync_upload dropbox_upload ftp_upload s3_upload cf_upload publish-to-github publish-to-github-force diff --git a/website/README.md b/website/README.md new file mode 100644 index 000000000..d7a5e45eb --- /dev/null +++ b/website/README.md @@ -0,0 +1,45 @@ +# Tools for creating http://jakevdp.github.io/PythonDataScienceHandbook/ +The website is generated using the [Pelican](http://docs.getpelican.com/) static site generator. 
+The themes here are adapted from those used for my blog: https://github.com/jakevdp/jakevdp.github.io-source + +## Building the Website + +Clone the repository & make sure submodules are included + +``` +$ git clone https://github.com/jakevdp/PythonDataScienceHandbook.git +$ git checkout origin/website +$ git submodule update --init --recursive +$ cd website +``` + +Install the required packages: + +``` +$ conda create -n pelican-blog python=3.5 jupyter notebook +$ source activate pelican-blog +$ pip install pelican Markdown ghp-import +$ mkdir plugins +$ git submodule add git://github.com/danielfrg/pelican-ipynb.git plugins/ipynb +$ git submodule add https://github.com/getpelican/pelican-plugins.git plugins/pelican-plugins +``` + +Copy the notebook content to the right location (this script also modifies some links for the HTML): +``` +$ python copy_notebooks.py +``` + +Build the html and serve locally: + +``` +$ make html +$ make serve +$ open http://localhost:8000 +``` + +Deploy to github pages + +``` +$ make publish-to-github +``` diff --git a/website/content/favicon.ico b/website/content/favicon.ico new file mode 100644 index 000000000..91c0253f5 Binary files /dev/null and b/website/content/favicon.ico differ diff --git a/website/copy_notebooks.py b/website/copy_notebooks.py new file mode 100644 index 000000000..b60299847 --- /dev/null +++ b/website/copy_notebooks.py @@ -0,0 +1,111 @@ +""" +This script copies all notebooks from the book into the website directory, and +creates pages which wrap them and link together. +""" +import os +import nbformat +import shutil + +PAGEFILE = """title: {title} +url: +save_as: {htmlfile} +Template: {template} + +{{% notebook notebooks/{notebook_file} cells[{cells}] %}} +""" + +INTRO_TEXT = """This website contains the full text of the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook) in the form of Jupyter notebooks. + +The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). + +If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)! 
+""" + + +def abspath_from_here(*args): + here = os.path.dirname(__file__) + path = os.path.join(here, *args) + return os.path.abspath(path) + +NB_SOURCE_DIR = abspath_from_here('..', 'notebooks') +NB_DEST_DIR = abspath_from_here('content', 'notebooks') +PAGE_DEST_DIR = abspath_from_here('content', 'pages') + + +def copy_notebooks(): + if not os.path.exists(NB_DEST_DIR): + os.makedirs(NB_DEST_DIR) + if not os.path.exists(PAGE_DEST_DIR): + os.makedirs(PAGE_DEST_DIR) + + nblist = sorted(nb for nb in os.listdir(NB_SOURCE_DIR) + if nb.endswith('.ipynb')) + name_map = {nb: nb.rsplit('.', 1)[0].lower() + '.html' + for nb in nblist} + + figsource = abspath_from_here('..', 'notebooks', 'figures') + figdest = abspath_from_here('content', 'figures') + + if os.path.exists(figdest): + shutil.rmtree(figdest) + shutil.copytree(figsource, figdest) + + figurelist = os.listdir(abspath_from_here('content', 'figures')) + figure_map = {os.path.join('figures', fig) : os.path.join('/PythonDataScienceHandbook/figures', fig) + for fig in figurelist} + + for nb in nblist: + base, ext = os.path.splitext(nb) + print('-', nb) + + content = nbformat.read(os.path.join(NB_SOURCE_DIR, nb), + as_version=4) + + if nb == 'Index.ipynb': + # content[0] is the title + # content[1] is the cover image + # content[2] is the license + cells = '1:' + template = 'page' + title = 'Python Data Science Handbook' + content.cells[2].source = INTRO_TEXT + else: + # content[0] is the book information + # content[1] is the navigation bar + # content[2] is the title + cells = '2:' + template = 'booksection' + title = content.cells[2].source + if not title.startswith('#') or len(title.splitlines()) > 1: + raise ValueError('title not found in third cell') + title = title.lstrip('#').strip() + + # put nav below title + content.cells.insert(0, content.cells.pop(2)) + + # Replace internal URLs and figure links in notebook + for cell in content.cells: + if cell.cell_type == 'markdown': + for nbname, htmlname in name_map.items(): + if nbname in cell.source: + cell.source = cell.source.replace(nbname, htmlname) + for figname, newfigname in figure_map.items(): + if figname in cell.source: + cell.source = cell.source.replace(figname, newfigname) + if cell.source.startswith(""): + # Undo replacement of notebook link in the colab badge + cell.source = nb.join(cell.source.rsplit(name_map[nb], 1)) + + nbformat.write(content, os.path.join(NB_DEST_DIR, nb)) + + pagefile = os.path.join(PAGE_DEST_DIR, base + '.md') + htmlfile = base.lower() + '.html' + with open(pagefile, 'w') as f: + f.write(PAGEFILE.format(title=title, + htmlfile=htmlfile, + notebook_file=nb, + template=template, + cells=cells)) + +if __name__ == '__main__': + copy_notebooks() diff --git a/website/fabfile.py b/website/fabfile.py new file mode 100644 index 000000000..79e9a93ce --- /dev/null +++ b/website/fabfile.py @@ -0,0 +1,92 @@ +from fabric.api import * +import fabric.contrib.project as project +import os +import shutil +import sys +import SocketServer + +from pelican.server import ComplexHTTPRequestHandler + +# Local path configuration (can be absolute or relative to fabfile) +env.deploy_path = 'output' +DEPLOY_PATH = env.deploy_path + +# Remote server configuration +production = 'root@localhost:22' +dest_path = '/var/www' + +# Rackspace Cloud Files configuration settings +env.cloudfiles_username = 'my_rackspace_username' +env.cloudfiles_api_key = 'my_rackspace_api_key' +env.cloudfiles_container = 'my_cloudfiles_container' + +# Github Pages configuration +env.github_pages_branch = 
"master" + +# Port for `serve` +PORT = 8000 + +def clean(): + """Remove generated files""" + if os.path.isdir(DEPLOY_PATH): + shutil.rmtree(DEPLOY_PATH) + os.makedirs(DEPLOY_PATH) + +def build(): + """Build local version of site""" + local('pelican -s pelicanconf.py') + +def rebuild(): + """`build` with the delete switch""" + local('pelican -d -s pelicanconf.py') + +def regenerate(): + """Automatically regenerate site upon file modification""" + local('pelican -r -s pelicanconf.py') + +def serve(): + """Serve site at http://localhost:8000/""" + os.chdir(env.deploy_path) + + class AddressReuseTCPServer(SocketServer.TCPServer): + allow_reuse_address = True + + server = AddressReuseTCPServer(('', PORT), ComplexHTTPRequestHandler) + + sys.stderr.write('Serving on port {0} ...\n'.format(PORT)) + server.serve_forever() + +def reserve(): + """`build`, then `serve`""" + build() + serve() + +def preview(): + """Build production version of site""" + local('pelican -s publishconf.py') + +def cf_upload(): + """Publish to Rackspace Cloud Files""" + rebuild() + with lcd(DEPLOY_PATH): + local('swift -v -A https://auth.api.rackspacecloud.com/v1.0 ' + '-U {cloudfiles_username} ' + '-K {cloudfiles_api_key} ' + 'upload -c {cloudfiles_container} .'.format(**env)) + +@hosts(production) +def publish(): + """Publish to production via rsync""" + local('pelican -s publishconf.py') + project.rsync_project( + remote_dir=dest_path, + exclude=".DS_Store", + local_dir=DEPLOY_PATH.rstrip('/') + '/', + delete=True, + extra_opts='-c', + ) + +def gh_pages(): + """Publish to GitHub Pages""" + rebuild() + local("ghp-import -b {github_pages_branch} {deploy_path} -p".format(**env)) diff --git a/website/pelicanconf.py b/website/pelicanconf.py new file mode 100644 index 000000000..4b744f242 --- /dev/null +++ b/website/pelicanconf.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- # +from __future__ import unicode_literals + +AUTHOR = 'Jake VanderPlas' +SITENAME = 'Python Data Science Handbook' +SITESUBTITLE = u'Essential Tools for Working with Data' +SITEURL = '' +SITESUBURL = 'PythonDataScienceHandbook/' +PATH = 'content' +TIMEZONE = 'America/Los_Angeles' +DEFAULT_LANG = 'en' + +# Feed generation is usually not desired when developing +FEED_ALL_ATOM = None +CATEGORY_FEED_ATOM = None +TRANSLATION_FEED_ATOM = None +AUTHOR_FEED_ATOM = None +AUTHOR_FEED_RSS = None + +# Set the article URL +ARTICLE_URL = 'blog/{date:%Y}/{date:%m}/{date:%d}/{slug}/' +ARTICLE_SAVE_AS = 'blog/{date:%Y}/{date:%m}/{date:%d}/{slug}/index.html' + +DEFAULT_PAGINATION = 10 + +# Uncomment following line if you want document-relative URLs when developing +#RELATIVE_URLS = True + +#MARKUP = ('md', 'ipynb') +#PLUGINS = ['ipynb.markup'] + +MARKUP = ['md'] +PLUGIN_PATHS = ['./plugins', './plugins/pelican-plugins'] +PLUGINS = [ + 'summary', # auto-summarizing articles + 'feed_summary', # use summaries for RSS, not full articles + 'ipynb.liquid', # for embedding notebooks + 'liquid_tags.img', # embedding images + 'liquid_tags.video', # embedding videos + 'liquid_tags.include_code', # including code blocks + 'liquid_tags.literal' +] +IGNORE_FILES = ['.ipynb_checkpoints'] + +# for liquid tags +CODE_DIR = 'downloads/code' +NOTEBOOK_DIR = 'downloads/notebooks' + +# THEME SETTINGS +THEME = './theme/' + +ABOUT_PAGE = '/pages/about.html' +TWITTER_USERNAME = 'jakevdp' +GITHUB_USERNAME = 'jakevdp' +STACKOVERFLOW_ADDRESS = 'http://stackoverflow.com/users/2937831/jakevdp' +AUTHOR_WEBSITE = 'http://vanderplas.com' +AUTHOR_BLOG = 'http://jakevdp.github.io' 
+AUTHOR_CV = "http://staff.washington.edu/jakevdp/media/pdfs/CV.pdf" +SHOW_ARCHIVES = True +SHOW_FEED = False # Need to address large feeds + +ENABLE_MATHJAX = True + +STATIC_PATHS = ['images', 'figures', 'videos', 'downloads', 'favicon.ico'] + +# Footer info + +LICENSE_URL = "https://github.com/jakevdp/jakevdp.github.io-source/blob/master/LICENSE" +LICENSE = "MIT" diff --git a/website/publishconf.py b/website/publishconf.py new file mode 100644 index 000000000..157ad0ccf --- /dev/null +++ b/website/publishconf.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- # +from __future__ import unicode_literals + +# This file is only used if you use `make publish` or +# explicitly specify it as your config file. + +import os +import sys +sys.path.append(os.curdir) +from pelicanconf import * + +SITEURL = 'http://jakevdp.github.io/PythonDataScienceHandbook' +RELATIVE_URLS = False + +SHOW_FEED = False +FEED_ALL_ATOM = 'feeds/all.atom.xml' +CATEGORY_FEED_ATOM = 'feeds/%s.atom.xml' +FEED_USE_SUMMARY = True # from the feed_summary plugin + +DELETE_OUTPUT_DIRECTORY = True + +DISQUS_SITENAME = "pythonicperambulations" +GOOGLE_ANALYTICS = "UA-34061646-1" diff --git a/website/theme/README.md b/website/theme/README.md new file mode 100644 index 000000000..f79086600 --- /dev/null +++ b/website/theme/README.md @@ -0,0 +1,4 @@ +# Pythonic Perambulations Theme + +This theme was adapted from that at https://github.com/danielfrg/danielfrg.github.io-source; the original is released under the Apache v2.0 license. +Adaptations are contained in this directory. \ No newline at end of file diff --git a/website/theme/static/css/icons.css b/website/theme/static/css/icons.css new file mode 100644 index 000000000..329e1ad75 --- /dev/null +++ b/website/theme/static/css/icons.css @@ -0,0 +1,60 @@ +/* Copied from https://github.com/porterjamesj/crowsfoot */ + +@font-face { + font-family: 'icons'; + src: url('../font/icons.eot?79801659'); + src: url('../font/icons.eot?79801659#iefix') format('embedded-opentype'), + url('../font/icons.woff?79801659') format('woff'), + url('../font/icons.ttf?79801659') format('truetype'), + url('../font/icons.svg?79801659#icons') format('svg'); + font-weight: normal; + font-style: normal; +} +/* Chrome hack: SVG is rendered more smooth in Windozze. 100% magic, uncomment if you need it. */ +/* Note, that will break hinting! 
In other OS-es font will be not as sharp as it could be */ +/* +@media screen and (-webkit-min-device-pixel-ratio:0) { + @font-face { + font-family: 'icons'; + src: url('../font/icons.svg?79801659#icons') format('svg'); + } +} +*/ + + [class^="icon-"]:before, [class*=" icon-"]:before { + font-family: "icons"; + font-style: normal; + font-weight: normal; + speak: none; + + display: inline-block; + text-decoration: inherit; + width: 1em; + margin-right: .2em; + text-align: center; + /* opacity: .8; */ + + /* For safety - reset parent styles, that can break glyph codes*/ + font-variant: normal; + text-transform: none; + + /* fix buttons height, for twitter bootstrap */ + line-height: 1em; + + /* Animation center compensation - margins should be symmetric */ + /* remove if not needed */ + margin-left: .2em; + + /* you can be more comfortable with increased icons size */ + /* font-size: 120%; */ + + /* Uncomment for 3D effect */ + /* text-shadow: 1px 1px 1px rgba(127, 127, 127, 0.3); */ +} + +.icon-stackoverflow:before { content: '\e032'; } /* '' */ +.icon-twitter:before { content: '\e801'; } /* '' */ +.icon-facebook:before { content: '\e802'; } /* '' */ +.icon-rss:before { content: '\e800'; } /* '' */ +.icon-mail-alt:before { content: '\f0e0'; } /* '' */ +.icon-github:before { content: '\f113'; } /* '' */ \ No newline at end of file diff --git a/website/theme/static/font/icons.eot b/website/theme/static/font/icons.eot new file mode 100644 index 000000000..7c6b12539 Binary files /dev/null and b/website/theme/static/font/icons.eot differ diff --git a/website/theme/static/font/icons.svg b/website/theme/static/font/icons.svg new file mode 100644 index 000000000..d97088804 --- /dev/null +++ b/website/theme/static/font/icons.svg @@ -0,0 +1,17 @@ + + + +Copyright (C) 2012 by original authors @ fontello.com + + + + + + + + + + + + + \ No newline at end of file diff --git a/website/theme/static/font/icons.ttf b/website/theme/static/font/icons.ttf new file mode 100644 index 000000000..121809925 Binary files /dev/null and b/website/theme/static/font/icons.ttf differ diff --git a/website/theme/static/font/icons.woff b/website/theme/static/font/icons.woff new file mode 100644 index 000000000..e693b496e Binary files /dev/null and b/website/theme/static/font/icons.woff differ diff --git a/website/theme/templates/_includes/analytics.html b/website/theme/templates/_includes/analytics.html new file mode 100644 index 000000000..ac8137766 --- /dev/null +++ b/website/theme/templates/_includes/analytics.html @@ -0,0 +1,30 @@ +{% if GOOGLE_UNIVERSAL_ANALYTICS %} + +{% elif GOOGLE_ANALYTICS %} + +{% endif %} diff --git a/website/theme/templates/_includes/disqus_thread.html b/website/theme/templates/_includes/disqus_thread.html new file mode 100644 index 000000000..a347fa408 --- /dev/null +++ b/website/theme/templates/_includes/disqus_thread.html @@ -0,0 +1,17 @@ +{% if DISQUS_SITENAME and SITEURL and article.status != "draft" %} +
+

Comments

+
+ +
+{% endif %} diff --git a/website/theme/templates/about.html b/website/theme/templates/about.html new file mode 100644 index 000000000..98d471047 --- /dev/null +++ b/website/theme/templates/about.html @@ -0,0 +1,43 @@ +{% extends "base.html" %} +{% block title %}{{ page.title }}{% endblock %} +{% block headerimg %}{% if page.headerimg %}{{ page.headerimg }}{% else %}{{ DEFAULT_HEADER_BG }}{% endif %}{% endblock %} + +{% block content %} + +
+
+
+

{{ page.title }}

+ {% if page.date %} + + {% endif %} +
+ +
+ {{ page.content }} +
+ +
+
+{% endblock %} diff --git a/website/theme/templates/archives.html b/website/theme/templates/archives.html new file mode 100644 index 000000000..24efb6fc1 --- /dev/null +++ b/website/theme/templates/archives.html @@ -0,0 +1,28 @@ +{% extends "base.html" %} +{% block title %}Archives{% endblock %} +{% block headerimg %}{{ DEFAULT_HEADER_BG }}{% endblock %} + +{% block content %} +
+
+

Archives and tags

+ +
+ {% for tag, articles in tags %} + {{ tag }} ({{ articles | length }}) + {% endfor %} +
+ + +
+
+{% endblock %} diff --git a/website/theme/templates/article.html b/website/theme/templates/article.html new file mode 100644 index 000000000..f38b8660a --- /dev/null +++ b/website/theme/templates/article.html @@ -0,0 +1,43 @@ +{% extends "base.html" %} +{% block title %}{{ article.title }}{% endblock %} +{% block headerimg %}{% if article.headerimg %}{{ article.headerimg }}{% else %}{{ DEFAULT_HEADER_BG }}{% endif %}{% endblock %} + +{% block extra_head %} +{% if 'angular' in article.include %} + +{% endif %} +{% if 'jquery' in article.include %} + +{% endif %} +{% endblock %} + +{% block content %} +
+
+
+

{{ article.title }}

+ +
+ +
+ {{ article.content }} +
+ +
+
+ {% for tag in article.tags %} + {{ tag }} + {% endfor %} +
+
+
+ + {% include '_includes/disqus_thread.html' %} + +
+ + + +{% endblock %} diff --git a/website/theme/templates/base.html b/website/theme/templates/base.html new file mode 100644 index 000000000..f7eb484dd --- /dev/null +++ b/website/theme/templates/base.html @@ -0,0 +1,94 @@ + + + + + + + + + {% block title %}{% endblock %} | {{ SITENAME }} + + + + + + + + + + + + + + + {% if ENABLE_MATHJAX %} + + + {% endif %} + + {% block extra_head %}{%endblock%} + + + + + +
+ {% block content %}{% endblock %} +
+ + {% include '_includes/analytics.html' %} + + diff --git a/website/theme/templates/booksection.html b/website/theme/templates/booksection.html new file mode 100644 index 000000000..6c012761f --- /dev/null +++ b/website/theme/templates/booksection.html @@ -0,0 +1,34 @@ +{% extends "base.html" %} +{% block title %}{{ page.title }}{% endblock %} +{% block headerimg %}{% if page.headerimg %}{{ page.headerimg }}{% else %}{{ DEFAULT_HEADER_BG }}{% endif %}{% endblock %} + +{% block content %} + +
+

+ +This is an excerpt from the Python Data Science Handbook by Jake VanderPlas; Jupyter notebooks are available on GitHub. +

+

+The text is released under the CC-BY-NC-ND license, and code is released under the MIT license. If you find this content useful, please consider supporting the work by buying the book! +

+
+ + +
+ +
+
+

{{ page.title }}

+ {% if page.date %} + + {% endif %} +
+ +
+ {{ page.content }} +
+ +
+
+{% endblock %} diff --git a/website/theme/templates/index.html b/website/theme/templates/index.html new file mode 100644 index 000000000..fed18ce36 --- /dev/null +++ b/website/theme/templates/index.html @@ -0,0 +1,53 @@ +{% extends "base.html" %} +{% block title %}Home{% endblock %} +{% block headerimg %}{{ DEFAULT_HEADER_BG }}{% endblock %} + +{% block content %} +
+ + {% for article in articles_page.object_list %} +
+
+

{{ article.title }}

+ +
+ +
+ {{ article.summary }} +
+ +
+
+ Read more → +
+ +
+ {% for tag in article.tags %} + {{ tag }} + {% endfor %} +
+ +
+ +
+
+ {% endfor %} + + + +
+ + + +{% endblock %} diff --git a/website/theme/templates/ipynb.css b/website/theme/templates/ipynb.css new file mode 100644 index 000000000..217627202 --- /dev/null +++ b/website/theme/templates/ipynb.css @@ -0,0 +1,47 @@ +{ + max-width: 700px; +} + +.text_cell .prompt { + display: none; +} + +div.cell { + padding: 0; +} + +div.text_cell_render { + padding: 0; +} + +div.prompt { + font-size: 13px; +} + +div.input_prompt { + padding: .7em 0.2em; +} + +div.output_prompt { + padding: .4em .2em; +} + +div.input_area { + margin: .2em 0.4em; + max-width: 580px; +} + +table.dataframe { + font-family: Arial, sans-serif; + font-size: 13px; + line-height: 20px; +} + +table.dataframe th, td { + padding: 4px; + text-align: left; +} + +pre code { + background-color: inherit; +} diff --git a/website/theme/templates/main.css b/website/theme/templates/main.css new file mode 100644 index 000000000..293ec2a60 --- /dev/null +++ b/website/theme/templates/main.css @@ -0,0 +1,300 @@ +body { + margin: 0; + padding: 0; + font: 15px 'Source Sans Pro', sans-serif; + line-height: 1.6em; + color: #222; + text-rendering: optimizeLegibility; + -webkit-font-smoothing: antialiased; +} +a { + color: #007EE5; + text-decoration: none; +} +a:hover { + color: #007EE5; + text-decoration: none; +} +header.main-header { + background: none repeat scroll 0% 0% #205F29; + margin-bottom: 0px; +} +header.main-header a { + color: #fff; +} +header.main-header .container { + max-width: 1000px; +} +header.main-header .container nav a:hover { + background-color: #5C881C; +} +article { + margin: 0; +} +article header.about { + margin-bottom: 0px; + padding-bottom: 0px; +} +article header { + margin-bottom: 20px; + padding-bottom: 20px; +} +article header h1 { + margin-bottom: 2px; + font-weight: 700; + color: #000; +} +article header time { + color: #9E9E9E; + font-size: 0.85em; + float: right; +} +article header time.left { + color: #9E9E9E; + font-size: 0.85em; + float: left; +} +article div.social-links ul { + padding: 0px; +} +article div.social-links li { + display: inline; + font-size: 20px; +} +article div.social-links li a { + color: #000; + padding: 10px; +} +article div.social-links li a:hover { + color: #666; + text-decoration: none; +} +article p { + font-size: 16px; + margin-bottom: 20px; + line-height: 1.6em; +} +article p.note { + background: #f5f5f5; + border: 1px solid #ddd; + padding: 0.533em 0.733em; +} +article p.update { + background-color: #FEEFB3; + border: 1px solid #e6e68a; + padding: 0.533em 0.733em; +} +article p.alert { + background-color: #ffe2e2; + border: 1px solid #ffb2b2; + padding: 0.533em 0.733em; +} +article ul, +article ol { + margin-top: 0px; + margin-bottom: 25px; +} +article li { + font-size: 16px; + line-height: 1.6em; +} +article a:hover { + text-decoration: underline; +} +article blockquote { + border-left: 2px solid #c7c7cc; + color: #666; + margin: 30px 0; + padding: 0 0 0 25px; +} +article img { + max-width: 100%; +} +article code { + color: #333; + background-color: #EEE; + border-radius: 0; + font-size: 13px; +} +article .meta { + font-size: 11px; +} +article .meta a:hover { + text-decoration: none; +} +article .meta div { + margin-bottom: 20px; + display: block; +} +article .meta a.tag { + margin: 0 10px 10px 0; + padding: 1px 12px; + display: inline-block; + font-size: 14px; + color: rgba(0, 0, 0, 0.8); + background: rgba(0, 0, 0, 0.05); +} +article .meta a.tag:hover { + background: rgba(0, 0, 0, 0.15); +} +article .meta a.read_more, +article .meta a.comments_btn { + font-size: 14px; + 
font-weight: 800; + padding: 10px 20px; + color: #205F29; + background: #FFF; + border: 1px solid #205F29; +} +article .meta a.read_more:hover, +article .meta a.comments_btn:hover { + color: #FFF; + background: #5C881C; +} +.index { + max-width: 700px; +} +.index article header h2 { + font-size: 36px; + margin-bottom: 2px; + font-weight: 700; +} +.index article header h2 a { + color: #000; +} +.index article header h2 a:hover { + color: #007EE5; + text-decoration: none; +} +.index .separator { + padding: 40px 0 0 0; + margin: 0 0 40px 0; + height: 10px; + border-bottom: solid 1px #CCC; +} +.index .pagination { + display: block; + margin-bottom: 100px; +} +.index .pagination .left { + text-align: right; +} +.index .pagination .right { + text-align: left; +} +.index .pagination a { + display: inline-block; + border: 2px solid #5C881C; + margin: 0 5px; + padding: 8px 20px; + font-weight: bold; + color: #5C881C; +} +.index .pagination a:hover { + color: #FFF; + background: #5C881C; +} +.post { + max-width: 700px; +} +.post h2:before { + content: "# "; + font-weight: bold; + color: #DDD; +} +.post h3:before { + content: "## "; + font-weight: bold; + color: #DDD; +} +.post h4:before { + content: "### "; + font-weight: bold; + color: #DDD; +} +.post article .meta { + margin: 50px 0 100px; +} +.list { + max-width: 700px; +} +.list ul.double-list { + margin: 0 auto 60px; + padding: 0; + list-style-type: none; +} +.list ul.double-list li { + padding: 5px 0; +} +.list ul.double-list li h2 { + font-size: 1em; + display: inline; + font-weight: normal; +} +.list ul.double-list li span { + font-family: sans-serif; + text-transform: uppercase; + text-align: right; + float: right; + padding-top: 3px; + font-size: 12px; + color: #999; +} +.full-width-content { + padding-top: 10px; + padding-left: 0px; + padding-right: 0px; + margin-left: -20px; + margin-right: -20px; +} +.col-xs-1, +.col-sm-1, +.col-md-1, +.col-lg-1, +.col-xs-2, +.col-sm-2, +.col-md-2, +.col-lg-2, +.col-xs-3, +.col-sm-3, +.col-md-3, +.col-lg-3, +.col-xs-4, +.col-sm-4, +.col-md-4, +.col-lg-4, +.col-xs-5, +.col-sm-5, +.col-md-5, +.col-lg-5, +.col-xs-6, +.col-sm-6, +.col-md-6, +.col-lg-6, +.col-xs-7, +.col-sm-7, +.col-md-7, +.col-lg-7, +.col-xs-8, +.col-sm-8, +.col-md-8, +.col-lg-8, +.col-xs-9, +.col-sm-9, +.col-md-9, +.col-lg-9, +.col-xs-10, +.col-sm-10, +.col-md-10, +.col-lg-10, +.col-xs-11, +.col-sm-11, +.col-md-11, +.col-lg-11, +.col-xs-12, +.col-sm-12, +.col-md-12, +.col-lg-12 { + padding-right: 0px; + padding-left: 0px; +} diff --git a/website/theme/templates/main.less b/website/theme/templates/main.less new file mode 100644 index 000000000..a11ef6db5 --- /dev/null +++ b/website/theme/templates/main.less @@ -0,0 +1,316 @@ +// out: ./main.css, compress: true + +@text-color: #222; +@link-color: #007EE5; + +body { + margin: 0; + padding: 0; + font: 15px 'Source Sans Pro', sans-serif; + line-height: 1.6em; + color: @text-color; + text-rendering: optimizeLegibility; + -webkit-font-smoothing: antialiased; +} + +a { + color: @link-color; + text-decoration: none; +} + +a:hover { + color: @link-color; + text-decoration: none; +} + +header.main-header { + background: none repeat scroll 0% 0% #205F29; + margin-bottom: 0px; + + a { + color: #fff; + } + + .container { + max-width: 1000px; + + nav { + a:hover { + background-color: #5C881C; + } + } + } +} + +article { + margin: 0; + + header.about { + margin-bottom: 0px; + padding-bottom: 0px; + } + + header { + margin-bottom: 20px; + padding-bottom: 20px; + + h1 { + margin-bottom: 2px; + font-weight: 
700; + color: #000; + } + + time { + color: #9E9E9E; + font-size: 0.85em; + float: right; + } + + time.left { + color: #9E9E9E; + font-size: 0.85em; + float: left; + } + } + + div.social-links { + ul { + padding: 0px; + } + li { + display: inline; + font-size: 20px; + a { + color: #000; + padding: 10px; + } + a:hover { + color: #666; + text-decoration: none; + } + } + } + + p { + font-size: 16px; + margin-bottom: 20px; + line-height: 1.6em; + } + + p.note { + background: #f5f5f5; + border: 1px solid #ddd; + padding: 0.533em 0.733em; + } + + p.update { + background-color: #FEEFB3; + border: 1px solid #e6e68a; + padding: 0.533em 0.733em; + } + + p.alert { + background-color: #ffe2e2; + border: 1px solid #ffb2b2; + padding: 0.533em 0.733em; + } + + ul, ol { + margin-top: 0px; + margin-bottom: 25px; + } + + li { + font-size: 16px; + line-height: 1.6em; + } + + a:hover { + text-decoration: underline; + } + + blockquote { + border-left: 2px solid #c7c7cc; + color: #666; + margin: 30px 0; + padding: 0 0 0 25px; + } + + img { + max-width: 100%; + } + + code { + color: #333; + background-color: #EEE; + border-radius: 0; + font-size: 13px; + } + + .meta { + font-size: 11px; + + a:hover { + text-decoration: none; + } + + div { + margin-bottom: 20px; + display: block; + } + + a.tag { + margin: 0 10px 10px 0; + padding: 1px 12px; + display: inline-block; + font-size: 14px; + color: rgba(0,0,0,0.8);; + background: rgba(0,0,0,0.05); + } + + a.tag:hover { + background: rgba(0,0,0,0.15); + } + + a.read_more, a.comments_btn { + font-size: 14px; + font-weight: 800; + padding: 10px 20px; + color: #205F29; + background: #FFF; + border: 1px solid #205F29; + } + + a.read_more:hover, a.comments_btn:hover { + color: #FFF; + background: #5C881C; + } + } +} + +.index { + max-width: 700px; + + article { + header { + h2 { + font-size: 36px; + margin-bottom: 2px; + font-weight: 700; + + a { + color: #000; + } + a:hover { + color: @link-color; + text-decoration: none; + } + } + } + } + + .separator { + padding: 40px 0 0 0; + margin: 0 0 40px 0; + height: 10px; + border-bottom: solid 1px #CCC; + } + + .pagination { + display: block; + margin-bottom: 100px; + + .left { + text-align: right; + } + + .right { + text-align: left; + } + + a { + display: inline-block; + border: 2px solid #5C881C; + margin: 0 5px; + padding: 8px 20px; + font-weight: bold; + color: #5C881C; + } + + a:hover { + color: #FFF; + background: #5C881C; + } + } +} + +.post { + max-width: 700px; + + h2:before { + content: "# "; + font-weight: bold; + color: #DDD; + } + + h3:before { + content: "## "; + font-weight: bold; + color: #DDD; + } + + h4:before { + content: "### "; + font-weight: bold; + color: #DDD; + } + + article { + .meta { + margin: 50px 0 100px; + } + } +} + +.list { + max-width: 700px; + + ul.double-list { + margin: 0 auto 60px; + padding: 0; + list-style-type: none; + + li { + padding: 5px 0; + + h2 { + font-size: 1em; + display: inline; + font-weight: normal; + } + + span { + font-family: sans-serif; + text-transform: uppercase; + text-align: right; + float: right; + padding-top: 3px; + font-size: 12px; + color: #999; + } + } + } +} + +.full-width-content { + padding-top: 10px; + padding-left: 0px; + padding-right: 0px; + margin-left: -20px; + margin-right: -20px; +} + +.col-xs-1, .col-sm-1, .col-md-1, .col-lg-1, .col-xs-2, .col-sm-2, .col-md-2, .col-lg-2, .col-xs-3, .col-sm-3, .col-md-3, .col-lg-3, .col-xs-4, .col-sm-4, .col-md-4, .col-lg-4, .col-xs-5, .col-sm-5, .col-md-5, .col-lg-5, .col-xs-6, .col-sm-6, .col-md-6, .col-lg-6, 
.col-xs-7, .col-sm-7, .col-md-7, .col-lg-7, .col-xs-8, .col-sm-8, .col-md-8, .col-lg-8, .col-xs-9, .col-sm-9, .col-md-9, .col-lg-9, .col-xs-10, .col-sm-10, .col-md-10, .col-lg-10, .col-xs-11, .col-sm-11, .col-md-11, .col-lg-11, .col-xs-12, .col-sm-12, .col-md-12, .col-lg-12 { + padding-right: 0px; + padding-left: 0px; +} diff --git a/website/theme/templates/page.html b/website/theme/templates/page.html new file mode 100644 index 000000000..97b1e59df --- /dev/null +++ b/website/theme/templates/page.html @@ -0,0 +1,22 @@ +{% extends "base.html" %} +{% block title %}{{ page.title }}{% endblock %} +{% block headerimg %}{% if page.headerimg %}{{ page.headerimg }}{% else %}{{ DEFAULT_HEADER_BG }}{% endif %}{% endblock %} + +{% block content %} +
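+{# Standalone pages reuse the article styling from main.less; a .post wrapper
+   (assumed here from the stylesheet's .post rules) caps the content width and
+   applies the article header/body styles. #}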
+<div class="post">
+  <article>
+    <header>
+      <h1>{{ page.title }}</h1>
+      {% if page.date %}
+      <time datetime="{{ page.date.isoformat() }}">{{ page.locale_date }}</time>
+      {% endif %}
+    </header>
+
+    <div>
+      {{ page.content }}
+    </div>
+  </article>
+</div>
+{% endblock %} diff --git a/website/theme/templates/pygments.css b/website/theme/templates/pygments.css new file mode 100644 index 000000000..98db4dd5f --- /dev/null +++ b/website/theme/templates/pygments.css @@ -0,0 +1,61 @@ +.highlight .hll { background-color: #ffffcc } +.highlight .c { color: #60a0b0; font-style: italic } /* Comment */ +.highlight .err { border: 1px solid #FF0000 } /* Error */ +.highlight .k { color: #007020; font-weight: bold } /* Keyword */ +.highlight .o { color: #666666 } /* Operator */ +.highlight .cm { color: #60a0b0; font-style: italic } /* Comment.Multiline */ +.highlight .cp { color: #007020 } /* Comment.Preproc */ +.highlight .c1 { color: #60a0b0; font-style: italic } /* Comment.Single */ +.highlight .cs { color: #60a0b0; background-color: #fff0f0 } /* Comment.Special */ +.highlight .gd { color: #A00000 } /* Generic.Deleted */ +.highlight .ge { font-style: italic } /* Generic.Emph */ +.highlight .gr { color: #FF0000 } /* Generic.Error */ +.highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ +.highlight .gi { color: #00A000 } /* Generic.Inserted */ +.highlight .go { color: #808080 } /* Generic.Output */ +.highlight .gp { color: #c65d09; font-weight: bold } /* Generic.Prompt */ +.highlight .gs { font-weight: bold } /* Generic.Strong */ +.highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ +.highlight .gt { color: #0040D0 } /* Generic.Traceback */ +.highlight .kc { color: #007020; font-weight: bold } /* Keyword.Constant */ +.highlight .kd { color: #007020; font-weight: bold } /* Keyword.Declaration */ +.highlight .kn { color: #007020; font-weight: bold } /* Keyword.Namespace */ +.highlight .kp { color: #007020 } /* Keyword.Pseudo */ +.highlight .kr { color: #007020; font-weight: bold } /* Keyword.Reserved */ +.highlight .kt { color: #902000 } /* Keyword.Type */ +.highlight .m { color: #40a070 } /* Literal.Number */ +.highlight .s { color: #4070a0 } /* Literal.String */ +.highlight .na { color: #4070a0 } /* Name.Attribute */ +.highlight .nb { color: #007020 } /* Name.Builtin */ +.highlight .nc { color: #0e84b5; font-weight: bold } /* Name.Class */ +.highlight .no { color: #60add5 } /* Name.Constant */ +.highlight .nd { color: #555555; font-weight: bold } /* Name.Decorator */ +.highlight .ni { color: #d55537; font-weight: bold } /* Name.Entity */ +.highlight .ne { color: #007020 } /* Name.Exception */ +.highlight .nf { color: #06287e } /* Name.Function */ +.highlight .nl { color: #002070; font-weight: bold } /* Name.Label */ +.highlight .nn { color: #0e84b5; font-weight: bold } /* Name.Namespace */ +.highlight .nt { color: #062873; font-weight: bold } /* Name.Tag */ +.highlight .nv { color: #bb60d5 } /* Name.Variable */ +.highlight .ow { color: #007020; font-weight: bold } /* Operator.Word */ +.highlight .w { color: #bbbbbb } /* Text.Whitespace */ +.highlight .mf { color: #40a070 } /* Literal.Number.Float */ +.highlight .mh { color: #40a070 } /* Literal.Number.Hex */ +.highlight .mi { color: #40a070 } /* Literal.Number.Integer */ +.highlight .mo { color: #40a070 } /* Literal.Number.Oct */ +.highlight .sb { color: #4070a0 } /* Literal.String.Backtick */ +.highlight .sc { color: #4070a0 } /* Literal.String.Char */ +.highlight .sd { color: #4070a0; font-style: italic } /* Literal.String.Doc */ +.highlight .s2 { color: #4070a0 } /* Literal.String.Double */ +.highlight .se { color: #4070a0; font-weight: bold } /* Literal.String.Escape */ +.highlight .sh { color: #4070a0 } /* Literal.String.Heredoc */ +.highlight .si { 
color: #70a0d0; font-style: italic } /* Literal.String.Interpol */
+.highlight .sx { color: #c65d09 } /* Literal.String.Other */
+.highlight .sr { color: #235388 } /* Literal.String.Regex */
+.highlight .s1 { color: #4070a0 } /* Literal.String.Single */
+.highlight .ss { color: #517918 } /* Literal.String.Symbol */
+.highlight .bp { color: #007020 } /* Name.Builtin.Pseudo */
+.highlight .vc { color: #bb60d5 } /* Name.Variable.Class */
+.highlight .vg { color: #bb60d5 } /* Name.Variable.Global */
+.highlight .vi { color: #bb60d5 } /* Name.Variable.Instance */
+.highlight .il { color: #40a070 } /* Literal.Number.Integer.Long */
\ No newline at end of file
diff --git a/website/theme/templates/tag.html b/website/theme/templates/tag.html
new file mode 100644
index 000000000..1e83db83e
--- /dev/null
+++ b/website/theme/templates/tag.html
@@ -0,0 +1,22 @@
+{% extends "base.html" %}
+{% block title %}Archives{% endblock %}
+{% block headerimg %}{{ DEFAULT_HEADER_BG }}{% endblock %}
+
+{% block content %}
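+{# Tag archive: Pelican supplies the current `tag` and the `articles` that
+   carry it; the .list / .double-list classes below are assumed from main.less. #}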
+<div class="list">
+  <header>
+    <h1>Tag: {{ tag }}</h1>
+  </header>
+
+  <ul class="double-list">
+  {% for article in articles %}
+    <li>
+      <h2><a href="{{ SITEURL }}/{{ article.url }}">{{ article.title }}</a></h2>
+      <span>{{ article.locale_date }}</span>
+    </li>
+  {% endfor %}
+  </ul>
+</div>
+{% endblock %}