diff --git a/notebooks/18-Missing-Values.ipynb b/notebooks/18-Missing-Values.ipynb new file mode 100644 index 0000000..0e9f950 --- /dev/null +++ b/notebooks/18-Missing-Values.ipynb @@ -0,0 +1,711 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
first_namelast_namescoreyear
TripoliNoureddinSadawi4222015
BirminghamJosefBaker3222009
LondonMikeBrooks2332013
IslamabadAliShah2232016
New YorkLukeBrown4012014
\n", + "
" + ], + "text/plain": [ + " first_name last_name score year\n", + "Tripoli Noureddin Sadawi 422 2015\n", + "Birmingham Josef Baker 322 2009\n", + "London Mike Brooks 233 2013\n", + "Islamabad Ali Shah 223 2016\n", + "New York Luke Brown 401 2014" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# here we create a simple dataframe\n", + "raw_data = {'first_name': ['Noureddin', 'Josef', 'Mike', 'Ali', 'Luke'],\n", + " 'last_name': ['Sadawi', 'Baker', 'Brooks', 'Shah', 'Brown'],\n", + " 'year': [2015,2009,2013,2016,2014],\n", + " 'score': [422,322,233,223,401]}\n", + "df = pd.DataFrame(raw_data, index = ['Tripoli', 'Birmingham', 'London', 'Islamabad', 'New York'])\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# build a frame with missing values: keep the text columns as they are and\n", + "# blank out (NaN) every numeric value that is not below 240.\n", + "# Only the numeric columns are compared, because comparing the name strings\n", + "# with an integer raises TypeError on current pandas.\n", + "num_cols = df.select_dtypes(include='number').columns\n", + "m = df.copy()\n", + "m[num_cols] = df[num_cols].where(df[num_cols] < 240)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
first_namelast_namescoreyear
TripoliNoureddinSadawi1.01.0
BirminghamJosefBaker1.01.0
LondonMikeBrooks233.01.0
IslamabadAliShah223.01.0
New YorkLukeBrown1.01.0
\n", + "
" + ], + "text/plain": [ + " first_name last_name score year\n", + "Tripoli Noureddin Sadawi 1.0 1.0\n", + "Birmingham Josef Baker 1.0 1.0\n", + "London Mike Brooks 233.0 1.0\n", + "Islamabad Ali Shah 223.0 1.0\n", + "New York Luke Brown 1.0 1.0" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# m is a dataframe that contains missing values (the NaNs)\n", + "m" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# dropna must be called (note the parentheses); with the default arguments it\n", + "# returns a new frame without any row that holds at least one NaN\n", + "m.dropna()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
first_namelast_namescoreyear
TripoliNoureddinSadawiNaNNaN
BirminghamJosefBakerNaNNaN
LondonMikeBrooks233.0NaN
IslamabadAliShah223.0NaN
New YorkLukeBrownNaNNaN
\n", + "
" + ], + "text/plain": [ + " first_name last_name score year\n", + "Tripoli Noureddin Sadawi NaN NaN\n", + "Birmingham Josef Baker NaN NaN\n", + "London Mike Brooks 233.0 NaN\n", + "Islamabad Ali Shah 223.0 NaN\n", + "New York Luke Brown NaN NaN" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "m.dropna(how='all')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
first_namelast_namescore
TripoliNoureddinSadawiNaN
BirminghamJosefBakerNaN
LondonMikeBrooks233.0
IslamabadAliShah223.0
New YorkLukeBrownNaN
\n", + "
" + ], + "text/plain": [ + " first_name last_name score\n", + "Tripoli Noureddin Sadawi NaN\n", + "Birmingham Josef Baker NaN\n", + "London Mike Brooks 233.0\n", + "Islamabad Ali Shah 223.0\n", + "New York Luke Brown NaN" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "m.dropna(how='all',axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
first_namelast_name
TripoliNoureddinSadawi
BirminghamJosefBaker
LondonMikeBrooks
IslamabadAliShah
New YorkLukeBrown
\n", + "
" + ], + "text/plain": [ + " first_name last_name\n", + "Tripoli Noureddin Sadawi\n", + "Birmingham Josef Baker\n", + "London Mike Brooks\n", + "Islamabad Ali Shah\n", + "New York Luke Brown" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# keep only the columns that have at least 3 non-NaN values;\n", + "# 'how' is left out because pandas does not allow 'how' and 'thresh' together\n", + "m.dropna(axis=1,thresh=3)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
first_namelast_namescoreyear
TripoliNoureddinSadawi1.01.0
BirminghamJosefBaker1.01.0
LondonMikeBrooks233.01.0
IslamabadAliShah223.01.0
New YorkLukeBrown1.01.0
\n", + "
" + ], + "text/plain": [ + " first_name last_name score year\n", + "Tripoli Noureddin Sadawi 1.0 1.0\n", + "Birmingham Josef Baker 1.0 1.0\n", + "London Mike Brooks 233.0 1.0\n", + "Islamabad Ali Shah 223.0 1.0\n", + "New York Luke Brown 1.0 1.0" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "m.fillna(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "m.fillna(1,inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
first_namelast_namescoreyear
TripoliNoureddinSadawi1.01.0
BirminghamJosefBaker1.01.0
LondonMikeBrooks233.01.0
IslamabadAliShah223.01.0
New YorkLukeBrown1.01.0
\n", + "
" + ], + "text/plain": [ + " first_name last_name score year\n", + "Tripoli Noureddin Sadawi 1.0 1.0\n", + "Birmingham Josef Baker 1.0 1.0\n", + "London Mike Brooks 233.0 1.0\n", + "Islamabad Ali Shah 223.0 1.0\n", + "New York Luke Brown 1.0 1.0" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "m" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/notebooks/19-Sorting.ipynb b/notebooks/19-Sorting.ipynb new file mode 100644 index 0000000..56ef0e5 --- /dev/null +++ b/notebooks/19-Sorting.ipynb @@ -0,0 +1,600 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
col1col2col3col4
r1-0.536807-0.404254-0.159271-0.775606
r2-0.9401910.081161-0.2691350.081723
r30.062271-0.340631-1.7683700.303790
r40.471436-2.4761700.4572630.547299
r50.239387-0.2107410.2276960.294547
\n", + "
" + ], + "text/plain": [ + " col1 col2 col3 col4\n", + "r1 -0.536807 -0.404254 -0.159271 -0.775606\n", + "r2 -0.940191 0.081161 -0.269135 0.081723\n", + "r3 0.062271 -0.340631 -1.768370 0.303790\n", + "r4 0.471436 -2.476170 0.457263 0.547299\n", + "r5 0.239387 -0.210741 0.227696 0.294547" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# here we create a simple dataframe with random values\n", + "raw_data = {\n", + " 'col1' : np.random.normal(size=5),\n", + " 'col2' : np.random.normal(size=5),\n", + " 'col3' : np.random.normal(size=5),\n", + " 'col4' : np.random.normal(size=5)\n", + "}\n", + "my_df = pd.DataFrame(raw_data, index = ['r1', 'r2', 'r3', 'r4', 'r5'])\n", + "my_df" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
col1col2col3col4
r2-0.9401910.081161-0.2691350.081723
r1-0.536807-0.404254-0.159271-0.775606
r30.062271-0.340631-1.7683700.303790
r50.239387-0.2107410.2276960.294547
r40.471436-2.4761700.4572630.547299
\n", + "
" + ], + "text/plain": [ + " col1 col2 col3 col4\n", + "r2 -0.940191 0.081161 -0.269135 0.081723\n", + "r1 -0.536807 -0.404254 -0.159271 -0.775606\n", + "r3 0.062271 -0.340631 -1.768370 0.303790\n", + "r5 0.239387 -0.210741 0.227696 0.294547\n", + "r4 0.471436 -2.476170 0.457263 0.547299" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "my_df.sort_values(by='col1')" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
col1col2col3col4
r2-0.9401910.081161-0.2691350.081723
r1-0.536807-0.404254-0.159271-0.775606
r30.062271-0.340631-1.7683700.303790
r50.239387-0.2107410.2276960.294547
r40.471436-2.4761700.4572630.547299
\n", + "
" + ], + "text/plain": [ + " col1 col2 col3 col4\n", + "r2 -0.940191 0.081161 -0.269135 0.081723\n", + "r1 -0.536807 -0.404254 -0.159271 -0.775606\n", + "r3 0.062271 -0.340631 -1.768370 0.303790\n", + "r5 0.239387 -0.210741 0.227696 0.294547\n", + "r4 0.471436 -2.476170 0.457263 0.547299" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "my_df.sort_values(by=['col1','col4'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
first_namelast_namescoreyear
TripoliNoureddinSadawi4222015
BirminghamJosefBaker3222009
LondonMikeBrooks2332013
IslamabadAliShah2232016
New YorkLukeBrown4012014
\n", + "
" + ], + "text/plain": [ + " first_name last_name score year\n", + "Tripoli Noureddin Sadawi 422 2015\n", + "Birmingham Josef Baker 322 2009\n", + "London Mike Brooks 233 2013\n", + "Islamabad Ali Shah 223 2016\n", + "New York Luke Brown 401 2014" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "raw_data = {'first_name': ['Noureddin', 'Josef', 'Mike', 'Ali', 'Luke'],\n", + " 'last_name': ['Sadawi', 'Baker', 'Brooks', 'Shah', 'Brown'],\n", + " 'year': [2015,2009,2013,2016,2014],\n", + " 'score': [422,322,233,223,401]}\n", + "df = pd.DataFrame(raw_data, index = ['Tripoli', 'Birmingham', 'London', 'Islamabad', 'New York'])\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
first_namelast_namescoreyear
IslamabadAliShah2232016
TripoliNoureddinSadawi4222015
New YorkLukeBrown4012014
LondonMikeBrooks2332013
BirminghamJosefBaker3222009
\n", + "
" + ], + "text/plain": [ + " first_name last_name score year\n", + "Islamabad Ali Shah 223 2016\n", + "Tripoli Noureddin Sadawi 422 2015\n", + "New York Luke Brown 401 2014\n", + "London Mike Brooks 233 2013\n", + "Birmingham Josef Baker 322 2009" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.sort_values(by='last_name',ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
first_namelast_namescoreyear
BirminghamJosefBaker3222009
LondonMikeBrooks2332013
New YorkLukeBrown4012014
TripoliNoureddinSadawi4222015
IslamabadAliShah2232016
\n", + "
" + ], + "text/plain": [ + " first_name last_name score year\n", + "Birmingham Josef Baker 322 2009\n", + "London Mike Brooks 233 2013\n", + "New York Luke Brown 401 2014\n", + "Tripoli Noureddin Sadawi 422 2015\n", + "Islamabad Ali Shah 223 2016" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.sort_values(by=['last_name','score'])" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}