From 214c8901d9a9fe49e5744a669f6da233326cf3c4 Mon Sep 17 00:00:00 2001 From: "H.C. Chen" Date: Sun, 8 Jul 2018 05:50:41 +0800 Subject: [PATCH 01/13] aa --- Untitled.ipynb | 63 ++ .../03.02-Data-Indexing-and-Selection.ipynb | 671 ++++++++++++++---- notebooks/03.03-Operations-in-Pandas.ipynb | 406 ++++++++--- notebooks/03.04-Missing-Values.ipynb | 137 ++-- 4 files changed, 961 insertions(+), 316 deletions(-) create mode 100644 Untitled.ipynb diff --git a/Untitled.ipynb b/Untitled.ipynb new file mode 100644 index 000000000..791c5b373 --- /dev/null +++ b/Untitled.ipynb @@ -0,0 +1,63 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "reDef unknown\n", + "reDef \\\n" + ] + } + ], + "source": [ + "import peforth\n", + "peforth.dictate(r'''\n", + "\\ Now we redefine the 'unknown' command that was do-nothing by default\n", + ": unknown ( token -- thing Y|N) // Try to find the unknown token in __main__\n", + " py> getattr(sys.modules['__main__'],pop(),\"Ûnknôwn\") \n", + " py> str(tos())==\"Ûnknôwn\" if drop false else true then ;\n", + " /// here after, when FORTH come accross an unknown token, instead of alerting \n", + " /// it try to find the token in python __main__ module name space.\n", + "\n", + "\\ Redefine \\ command to print the comment line\n", + "code \\ print(nexttoken('\\n')) end-code \n", + "''');\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/03.02-Data-Indexing-and-Selection.ipynb b/notebooks/03.02-Data-Indexing-and-Selection.ipynb index 7707be226..416af5063 100644 --- a/notebooks/03.02-Data-Indexing-and-Selection.ipynb +++ b/notebooks/03.02-Data-Indexing-and-Selection.ipynb @@ -59,10 +59,37 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": false - }, + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "reDef unknown\n", + "reDef \\\n" + ] + } + ], + "source": [ + "import peforth\n", + "peforth.dictate(r'''\n", + "\\ Now we redefine the 'unknown' command that was do-nothing by default\n", + ": unknown ( token -- thing Y|N) // Try to find the unknown token in __main__\n", + " py> getattr(sys.modules['__main__'],pop(),\"Ûnknôwn\") \n", + " py> str(tos())==\"Ûnknôwn\" if drop false else true then ;\n", + " /// here after, when FORTH come accross an unknown token, instead of alerting \n", + " /// it try to find the token in python __main__ module name space.\n", + "\n", + "\\ Redefine \\ command to print the comment line\n", + "code \\ print(nexttoken('\\n')) end-code \n", + "''');\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, "outputs": [ { "data": { @@ -74,7 +101,7 @@ "dtype: float64" ] }, - "execution_count": 1, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -89,9 +116,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -118,9 +143,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -140,9 +163,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -162,9 +183,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -192,9 +211,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -242,9 +259,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -268,9 +283,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -293,9 +306,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -318,9 +329,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -361,9 +370,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -387,9 +394,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -410,9 +415,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -445,9 +448,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -467,9 +468,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -498,9 +497,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -520,9 +517,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -574,15 +569,26 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": false - }, + "execution_count": 8, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "\n", " \n", " \n", @@ -630,15 +636,14 @@ "Texas 695662 26448193" ] }, - "execution_count": 18, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "area = pd.Series({'California': 423967, 'Texas': 695662,\n", - " 'New York': 141297, 'Florida': 170312,\n", - " 'Illinois': 149995})\n", + "area = pd.Series({'New York': 141297, 'Florida': 170312,\n", + " 'Illinois': 149995, 'California': 423967, 'Texas': 695662})\n", "pop = pd.Series({'California': 38332521, 'Texas': 26448193,\n", " 'New York': 19651127, 'Florida': 19552860,\n", " 'Illinois': 12882135})\n", @@ -656,9 +661,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -690,9 +693,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -724,9 +725,7 @@ { "cell_type": "code", "execution_count": 21, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -748,16 +747,14 @@ "metadata": {}, "source": [ "Though this is a useful shorthand, keep in mind that it does not work for all cases!\n", - "For example, if the column names are not strings, or if the column names conflict with methods of the ``DataFrame``, this attribute-style access is not possible.\n", + "For example, if the column names are not strings, or if the column names conflict with methods of the ``DataFrame``, this **attribute-style** access is not possible.\n", "For example, the ``DataFrame`` has a ``pop()`` method, so ``data.pop`` will point to this rather than the ``\"pop\"`` column:" ] }, { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -780,20 +777,31 @@ "source": [ "In particular, you should avoid the temptation to try column assignment via attribute (i.e., use ``data['pop'] = z`` rather than ``data.pop = z``).\n", "\n", - "Like with the ``Series`` objects discussed earlier, this dictionary-style syntax can also be used to modify the object, in this case adding a new column:" + "Like with the ``Series`` objects discussed earlier, this **dictionary-style** syntax can also be used to modify the object, in this case adding a new column:" ] }, { "cell_type": "code", - "execution_count": 23, - "metadata": { - "collapsed": false - }, + "execution_count": 9, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -847,7 +855,7 @@ "Texas 695662 26448193 38.018740" ] }, - "execution_count": 23, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -876,11 +884,22 @@ }, { "cell_type": "code", - "execution_count": 24, - "metadata": { - "collapsed": false - }, + "execution_count": 21, + "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 4.23967000e+05 3.83325210e+07 9.04139261e+01]\n", + " [ 1.70312000e+05 1.95528600e+07 1.14806121e+02]\n", + " [ 1.49995000e+05 1.28821350e+07 8.58837628e+01]\n", + " [ 1.41297000e+05 1.96511270e+07 1.39076746e+02]\n", + " [ 6.95662000e+05 2.64481930e+07 3.80187404e+01]]\n", + "data :> values type tib. \\ ==> ()\n", + ".values 就是個簡單的 ndarray\n" + ] + }, { "data": { "text/plain": [ @@ -891,12 +910,14 @@ " [ 6.95662000e+05, 2.64481930e+07, 3.80187404e+01]])" ] }, - "execution_count": 24, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "%f data :> values . cr\n", + "%f data :> values type tib. \\ .values 就是個簡單的 ndarray\n", "data.values" ] }, @@ -910,15 +931,34 @@ }, { "cell_type": "code", - "execution_count": 25, - "metadata": { - "collapsed": false - }, + "execution_count": 20, + "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ".T 之後,應該還是個 df\n", + "data :> T type tib. \\ ==> ()\n" + ] + }, { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -966,13 +1006,15 @@ "density 9.041393e+01 1.148061e+02 8.588376e+01 1.390767e+02 3.801874e+01" ] }, - "execution_count": 25, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "data.T" + "%f \\ .T 之後,應該還是個 df \n", + "%f data :> T type tib.\n", + "data.T " ] }, { @@ -986,9 +1028,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1015,9 +1055,7 @@ { "cell_type": "code", "execution_count": 27, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1052,15 +1090,34 @@ }, { "cell_type": "code", - "execution_count": 28, - "metadata": { - "collapsed": false - }, + "execution_count": 24, + "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "data :> iloc[:3,:2] type tib. \\ ==> ()\n", + "這樣擷取的還是個 data frame\n" + ] + }, { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -1096,13 +1153,14 @@ "Illinois 149995 12882135" ] }, - "execution_count": 28, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "data.iloc[:3, :2]" + "%f data :> iloc[:3,:2] type tib. \\ 這樣擷取的還是個 data frame \n", + "data.iloc[:3, :2]\n" ] }, { @@ -1115,9 +1173,7 @@ { "cell_type": "code", "execution_count": 29, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1178,15 +1234,40 @@ }, { "cell_type": "code", - "execution_count": 30, - "metadata": { - "collapsed": false - }, + "execution_count": 49, + "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: DeprecationWarning: \n", + ".ix is deprecated. Please use\n", + ".loc for label based indexing or\n", + ".iloc for positional indexing\n", + "\n", + "See the documentation here:\n", + "http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix\n", + " \"\"\"Entry point for launching an IPython kernel.\n" + ] + }, { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -1222,7 +1303,7 @@ "Illinois 149995 12882135" ] }, - "execution_count": 30, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } @@ -1231,11 +1312,85 @@ "data.ix[:3, :'pop']" ] }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "data.ix[:3, :'pop'] 改寫成 data.iloc[:3].loc[:,:'pop'] 就是同樣的意思\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
areapop
California42396738332521
Florida17031219552860
Illinois14999512882135
\n", + "
" + ], + "text/plain": [ + " area pop\n", + "California 423967 38332521\n", + "Florida 170312 19552860\n", + "Illinois 149995 12882135" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%f \\ data.ix[:3, :'pop'] 改寫成 data.iloc[:3].loc[:,:'pop'] 就是同樣的意思\n", + "data.iloc[:3].loc[:,:'pop']" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Keep in mind that for integer indices, the ``ix`` indexer is subject to the same potential sources of confusion as discussed for integer-indexed ``Series`` objects.\n", + "Keep in mind that for integer indices, the ``ix`` indexer is subject to the same potential sources of confusion as discussed for integer-indexed ``Series`` objects. 我有看到警告說 .ix will be deprecated soon 的確,混和的結果又把兩種 style 的模糊性帶回來了!\n", "\n", "Any of the familiar NumPy-style data access patterns can be used within these indexers.\n", "For example, in the ``loc`` indexer we can combine masking and fancy indexing as in the following:" @@ -1243,15 +1398,35 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 71, "metadata": { - "collapsed": false + "scrolled": true }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "column indexing 一定要放在逗點之後\n" + ] + }, { "data": { "text/html": [ "
\n", + "\n", "\n", " \n", " \n", @@ -1281,15 +1456,161 @@ "New York 19651127 139.076746" ] }, - "execution_count": 31, + "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "%f \\ column indexing 一定要放在逗點之後\n", "data.loc[data.density > 100, ['pop', 'density']]" ] }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "column indexing 一定要放在逗點之後,\n", + ". . . 或者要用 fancy indexing 的形式—放在 list 裡面\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
popdensity
Florida19552860114.806121
New York19651127139.076746
\n", + "
" + ], + "text/plain": [ + " pop density\n", + "Florida 19552860 114.806121\n", + "New York 19651127 139.076746" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%f \\ column indexing 一定要放在逗點之後,\n", + "%f \\ . . . 或者要用 fancy indexing 的形式—放在 list 裡面\n", + "data[['pop','density']][data.density > 100]" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "我發現這個問題!下面這個尾巴 .loc[:'亂寫也行'] 等於是 [:] 取所有的 rows !!\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
areapopdensity
Florida17031219552860114.806121
New York14129719651127139.076746
\n", + "
" + ], + "text/plain": [ + " area pop density\n", + "Florida 170312 19552860 114.806121\n", + "New York 141297 19651127 139.076746" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%f \\ 我發現這個問題!下面這個尾巴 .loc[:'亂寫也行'] 等於是 [:] 取所有的 rows !!\n", + "data.loc[data.density > 100].loc[:'area']" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1300,9 +1621,7 @@ { "cell_type": "code", "execution_count": 32, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1390,15 +1709,26 @@ }, { "cell_type": "code", - "execution_count": 33, - "metadata": { - "collapsed": false - }, + "execution_count": 25, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "\n", " \n", " \n", @@ -1431,15 +1761,68 @@ "Illinois 149995 12882135 85.883763" ] }, - "execution_count": 33, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "%f \\ slicing refers to rows\n", "data['Florida':'Illinois']" ] }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "indexing refers to columns 這句話沒講清楚,\n", + "California 90.413926\n", + "Florida 114.806121\n", + "Illinois 85.883763\n", + "New York 139.076746\n", + "Texas 38.018740\n", + "Name: density, dtype: float64\n", + "以上當認得時,先被 df 拿去當作 column name 處理了,所以 OK\n", + "注意!*當作 column-wise 處理* 不是很奇怪嗎?特別是與上面的 slicing 對照起來看。\n", + "所以作者要強調它,反正就是這樣!\n", + "\n", + "Failed in (compiling=False): 'lalalala'\n", + "Body:\n", + "push(pop()['lalalala'])\n", + "以上當 key 不認得時,讓 python 接著處裡就出錯了!\n" + ] + }, + { + "data": { + "text/plain": [ + "California 90.413926\n", + "Florida 114.806121\n", + "Illinois 85.883763\n", + "New York 139.076746\n", + "Texas 38.018740\n", + "Name: density, dtype: float64" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%f \\ indexing refers to columns 這句話沒講清楚,\n", + "%f data :> ['density'] . cr \\ 以上當認得時,先被 df 拿去當作 column name 處理了,所以 OK\n", + "%f \\ 注意!*當作 column-wise 處理* 不是很奇怪嗎?特別是與上面的 slicing 對照起來看。\n", + "%f \\ 所以作者要強調它,反正就是這樣!\n", + "%f data :> ['lalalala'] . cr \n", + "%f \\ 以上當 key 不認得時,讓 python 接著處裡就出錯了!\n", + "data['density']" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1449,15 +1832,28 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 26, "metadata": { - "collapsed": false + "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -1490,7 +1886,7 @@ "Illinois 149995 12882135 85.883763" ] }, - "execution_count": 34, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -1508,15 +1904,28 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 39, "metadata": { - "collapsed": false + "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -1549,7 +1958,7 @@ "New York 141297 19651127 139.076746" ] }, - "execution_count": 35, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -1591,9 +2000,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.1" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/notebooks/03.03-Operations-in-Pandas.ipynb b/notebooks/03.03-Operations-in-Pandas.ipynb index ac4b1eb37..6a0806463 100644 --- a/notebooks/03.03-Operations-in-Pandas.ipynb +++ b/notebooks/03.03-Operations-in-Pandas.ipynb @@ -1,5 +1,34 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "reDef unknown\n", + "reDef \\\n" + ] + } + ], + "source": [ + "import peforth\n", + "peforth.dictate(r'''\n", + "\\ Now we redefine the 'unknown' command that was do-nothing by default\n", + ": unknown ( token -- thing Y|N) // Try to find the unknown token in __main__\n", + " py> getattr(sys.modules['__main__'],pop(),\"Ûnknôwn\") \n", + " py> str(tos())==\"Ûnknôwn\" if drop false else true then ;\n", + " /// here after, when FORTH come accross an unknown token, instead of alerting \n", + " /// it try to find the token in python __main__ module name space.\n", + "\n", + "\\ Redefine \\ command to print the comment line\n", + "code \\ print(nexttoken('\\n')) end-code \n", + "''');\n" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -33,7 +62,8 @@ "One of the essential pieces of NumPy is the ability to perform quick element-wise operations, both with basic arithmetic (addition, subtraction, multiplication, etc.) and with more sophisticated operations (trigonometric functions, exponential and logarithmic functions, etc.).\n", "Pandas inherits much of this functionality from NumPy, and the ufuncs that we introduced in [Computation on NumPy Arrays: Universal Functions](02.03-Computation-on-arrays-ufuncs.ipynb) are key to this.\n", "\n", - "Pandas includes a couple useful twists, however: for unary operations like negation and trigonometric functions, these ufuncs will *preserve index and column labels* in the output, and for binary operations such as addition and multiplication, Pandas will automatically *align indices* when passing the objects to the ufunc.\n", + "Pandas includes a couple useful twists, however: for unary operations like negation and trigonometric functions, these ufuncs will *preserve index and column labels* in the output, and for binary operations such as addition and multiplication, Pandas will automatically *align indices* when passing the objects to the ufunc. 不知所云,往下看吧。。。\n", + "\n", "This means that keeping the context of data and combining data from different sources–both potentially error-prone tasks with raw NumPy arrays–become essentially foolproof ones with Pandas.\n", "We will additionally see that there are well-defined operations between one-dimensional ``Series`` structures and two-dimensional ``DataFrame`` structures." ] @@ -51,9 +81,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", @@ -62,10 +90,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false - }, + "execution_count": 5, + "metadata": {}, "outputs": [ { "data": { @@ -74,31 +100,42 @@ "1 3\n", "2 7\n", "3 4\n", - "dtype: int64" + "dtype: int32" ] }, - "execution_count": 2, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "rng = np.random.RandomState(42)\n", - "ser = pd.Series(rng.randint(0, 10, 4))\n", + "rng = np.random.RandomState(42) named # 42 是 seed, 取得產生器 rng.\n", + "ser = pd.Series(rng.randint(0, 10, 4)) # ser 就是個 fancy array\n", "ser" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, + "execution_count": 9, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -142,7 +179,7 @@ "2 7 2 5 4" ] }, - "execution_count": 3, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -162,11 +199,20 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, + "execution_count": 7, + "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 6\n", + "1 3\n", + "2 7\n", + "3 4\n", + "dtype: int32\n" + ] + }, { "data": { "text/plain": [ @@ -177,12 +223,13 @@ "dtype: float64" ] }, - "execution_count": 4, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "%f ser . cr\n", "np.exp(ser)" ] }, @@ -195,15 +242,36 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, + "execution_count": 10, + "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " A B C D\n", + "0 6 9 2 6\n", + "1 7 4 3 7\n", + "2 7 2 5 4\n" + ] + }, { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -247,12 +315,13 @@ "2 -0.707107 1.000000e+00 -0.707107 1.224647e-16" ] }, - "execution_count": 5, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "%f df . cr\n", "np.sin(df * np.pi / 4)" ] }, @@ -284,10 +353,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": false - }, + "execution_count": 21, + "metadata": {}, "outputs": [], "source": [ "area = pd.Series({'Alaska': 1723337, 'Texas': 695662,\n", @@ -305,10 +372,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": false - }, + "execution_count": 22, + "metadata": {}, "outputs": [ { "data": { @@ -320,7 +385,7 @@ "dtype: float64" ] }, - "execution_count": 7, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -338,10 +403,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false - }, + "execution_count": 23, + "metadata": {}, "outputs": [ { "data": { @@ -349,7 +412,7 @@ "Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')" ] }, - "execution_count": 8, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -358,6 +421,41 @@ "area.index | population.index" ] }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "我都還不知道 set 運算可以這樣做\n", + "{1, 2, 3, 4, 5}\n", + "peforth 的 OR 就是 python 的 bitwise | 運算\n", + "如過這樣 list(s1) | list(s2) 就是 TypeError: unsupported operand types\n" + ] + }, + { + "data": { + "text/plain": [ + "{1, 2, 3, 4, 5}" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%f \\ 我都還不知道 set 運算可以這樣做\n", + "s1 = {1,2,4}\n", + "s2 = {1,3,5}\n", + "%f s1 s2 OR . cr \\ peforth 的 OR 就是 python 的 bitwise | 運算\n", + "%f \\ 如過這樣 list(s1) | list(s2) 就是 TypeError: unsupported operand types\n", + "s1 | s2" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -368,10 +466,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false - }, + "execution_count": 25, + "metadata": {}, "outputs": [ { "data": { @@ -383,7 +479,7 @@ "dtype: float64" ] }, - "execution_count": 9, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -404,10 +500,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": false - }, + "execution_count": 26, + "metadata": {}, "outputs": [ { "data": { @@ -419,7 +513,7 @@ "dtype: float64" ] }, - "execution_count": 10, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -439,15 +533,26 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": false - }, + "execution_count": 27, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -477,7 +582,7 @@ "1 5 1" ] }, - "execution_count": 11, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -490,15 +595,26 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": false - }, + "execution_count": 28, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -538,7 +654,7 @@ "2 9 2 6" ] }, - "execution_count": 12, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -551,15 +667,26 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": false - }, + "execution_count": 29, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -599,7 +726,7 @@ "2 NaN NaN NaN" ] }, - "execution_count": 13, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -619,15 +746,26 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "collapsed": false - }, + "execution_count": 30, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -667,7 +805,7 @@ "2 6.5 13.5 10.5" ] }, - "execution_count": 14, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -707,10 +845,8 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "collapsed": false - }, + "execution_count": 31, + "metadata": {}, "outputs": [ { "data": { @@ -720,7 +856,7 @@ " [6, 1, 3, 8]])" ] }, - "execution_count": 15, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -732,10 +868,8 @@ }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "collapsed": false - }, + "execution_count": 32, + "metadata": {}, "outputs": [ { "data": { @@ -745,7 +879,7 @@ " [ 3, -7, 1, 4]])" ] }, - "execution_count": 16, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -765,15 +899,26 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "collapsed": false - }, + "execution_count": 33, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -817,7 +962,7 @@ "2 3 -7 1 4" ] }, - "execution_count": 17, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -836,15 +981,41 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": false - }, + "execution_count": 38, + "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Q R S T\n", + "0 3 8 2 4\n", + "1 2 6 4 8\n", + "2 6 1 3 8\n", + "\n", + "0 8\n", + "1 6\n", + "2 1\n", + "Name: R, dtype: int32\n" + ] + }, { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -888,12 +1059,14 @@ "2 5 0 2 7" ] }, - "execution_count": 18, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "%f df . cr cr\n", + "%f df :> ['R'] . cr\n", "df.subtract(df['R'], axis=0)" ] }, @@ -906,20 +1079,18 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": { - "collapsed": false - }, + "execution_count": 39, + "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Q 3\n", "S 2\n", - "Name: 0, dtype: int64" + "Name: 0, dtype: int32" ] }, - "execution_count": 19, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -931,15 +1102,38 @@ }, { "cell_type": "code", - "execution_count": 20, - "metadata": { - "collapsed": false - }, + "execution_count": 44, + "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Q R S T\n", + "0 3 8 2 4\n", + "1 2 6 4 8\n", + "2 6 1 3 8\n", + "\n", + "只針對 Q S 兩欄做 broadcasting, 所以 R T 欄都變成 NaN\n" + ] + }, { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -983,13 +1177,15 @@ "2 3.0 NaN 1.0 NaN" ] }, - "execution_count": 20, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df - halfrow" + "%f df . cr cr\n", + "%f \\ 只針對 Q S 兩欄做 broadcasting, 所以 R T 欄都變成 NaN\n", + "df - halfrow\n" ] }, { @@ -1025,9 +1221,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.0" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/notebooks/03.04-Missing-Values.ipynb b/notebooks/03.04-Missing-Values.ipynb index 6cbef56f9..acc0e853b 100644 --- a/notebooks/03.04-Missing-Values.ipynb +++ b/notebooks/03.04-Missing-Values.ipynb @@ -1,5 +1,34 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "reDef unknown\n", + "reDef \\\n" + ] + } + ], + "source": [ + "import peforth\n", + "peforth.dictate(r'''\n", + "\\ Now we redefine the 'unknown' command that was do-nothing by default\n", + ": unknown ( token -- thing Y|N) // Try to find the unknown token in __main__\n", + " py> getattr(sys.modules['__main__'],pop(),\"Ûnknôwn\") \n", + " py> str(tos())==\"Ûnknôwn\" if drop false else true then ;\n", + " /// here after, when FORTH come accross an unknown token, instead of alerting \n", + " /// it try to find the token in python __main__ module name space.\n", + "\n", + "\\ Redefine \\ command to print the comment line\n", + "code \\ print(nexttoken('\\n')) end-code \n", + "''');\n" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -101,9 +130,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -132,9 +159,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -166,9 +191,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "ename": "TypeError", @@ -206,9 +229,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -238,9 +259,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -260,9 +279,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -289,9 +306,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -318,9 +333,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -356,9 +369,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -390,9 +401,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -415,9 +424,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -499,9 +506,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -532,9 +537,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -574,9 +577,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -606,9 +607,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -678,9 +677,7 @@ { "cell_type": "code", "execution_count": 18, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -730,9 +727,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -792,9 +787,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -856,9 +849,7 @@ { "cell_type": "code", "execution_count": 21, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -922,9 +913,7 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -989,9 +978,7 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1024,9 +1011,7 @@ { "cell_type": "code", "execution_count": 24, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1058,9 +1043,7 @@ { "cell_type": "code", "execution_count": 25, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1093,9 +1076,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1130,9 +1111,7 @@ { "cell_type": "code", "execution_count": 27, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1193,9 +1172,7 @@ { "cell_type": "code", "execution_count": 28, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1286,9 +1263,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.0" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } From 17cd6be68a555cd50a1ead9f72a261ece2695e52 Mon Sep 17 00:00:00 2001 From: "H.C.Chen" Date: Sun, 8 Jul 2018 05:53:07 +0800 Subject: [PATCH 02/13] bb --- Untitled.ipynb | 63 ++ .../03.02-Data-Indexing-and-Selection.ipynb | 671 ++++++++++++++---- notebooks/03.03-Operations-in-Pandas.ipynb | 109 ++- 3 files changed, 653 insertions(+), 190 deletions(-) create mode 100644 Untitled.ipynb diff --git a/Untitled.ipynb b/Untitled.ipynb new file mode 100644 index 000000000..791c5b373 --- /dev/null +++ b/Untitled.ipynb @@ -0,0 +1,63 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "reDef unknown\n", + "reDef \\\n" + ] + } + ], + "source": [ + "import peforth\n", + "peforth.dictate(r'''\n", + "\\ Now we redefine the 'unknown' command that was do-nothing by default\n", + ": unknown ( token -- thing Y|N) // Try to find the unknown token in __main__\n", + " py> getattr(sys.modules['__main__'],pop(),\"Ûnknôwn\") \n", + " py> str(tos())==\"Ûnknôwn\" if drop false else true then ;\n", + " /// here after, when FORTH come accross an unknown token, instead of alerting \n", + " /// it try to find the token in python __main__ module name space.\n", + "\n", + "\\ Redefine \\ command to print the comment line\n", + "code \\ print(nexttoken('\\n')) end-code \n", + "''');\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/03.02-Data-Indexing-and-Selection.ipynb b/notebooks/03.02-Data-Indexing-and-Selection.ipynb index 7707be226..416af5063 100644 --- a/notebooks/03.02-Data-Indexing-and-Selection.ipynb +++ b/notebooks/03.02-Data-Indexing-and-Selection.ipynb @@ -59,10 +59,37 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": false - }, + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "reDef unknown\n", + "reDef \\\n" + ] + } + ], + "source": [ + "import peforth\n", + "peforth.dictate(r'''\n", + "\\ Now we redefine the 'unknown' command that was do-nothing by default\n", + ": unknown ( token -- thing Y|N) // Try to find the unknown token in __main__\n", + " py> getattr(sys.modules['__main__'],pop(),\"Ûnknôwn\") \n", + " py> str(tos())==\"Ûnknôwn\" if drop false else true then ;\n", + " /// here after, when FORTH come accross an unknown token, instead of alerting \n", + " /// it try to find the token in python __main__ module name space.\n", + "\n", + "\\ Redefine \\ command to print the comment line\n", + "code \\ print(nexttoken('\\n')) end-code \n", + "''');\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, "outputs": [ { "data": { @@ -74,7 +101,7 @@ "dtype: float64" ] }, - "execution_count": 1, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -89,9 +116,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -118,9 +143,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -140,9 +163,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -162,9 +183,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -192,9 +211,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -242,9 +259,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -268,9 +283,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -293,9 +306,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -318,9 +329,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -361,9 +370,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -387,9 +394,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -410,9 +415,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -445,9 +448,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -467,9 +468,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -498,9 +497,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -520,9 +517,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -574,15 +569,26 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": false - }, + "execution_count": 8, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -630,15 +636,14 @@ "Texas 695662 26448193" ] }, - "execution_count": 18, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "area = pd.Series({'California': 423967, 'Texas': 695662,\n", - " 'New York': 141297, 'Florida': 170312,\n", - " 'Illinois': 149995})\n", + "area = pd.Series({'New York': 141297, 'Florida': 170312,\n", + " 'Illinois': 149995, 'California': 423967, 'Texas': 695662})\n", "pop = pd.Series({'California': 38332521, 'Texas': 26448193,\n", " 'New York': 19651127, 'Florida': 19552860,\n", " 'Illinois': 12882135})\n", @@ -656,9 +661,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -690,9 +693,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -724,9 +725,7 @@ { "cell_type": "code", "execution_count": 21, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -748,16 +747,14 @@ "metadata": {}, "source": [ "Though this is a useful shorthand, keep in mind that it does not work for all cases!\n", - "For example, if the column names are not strings, or if the column names conflict with methods of the ``DataFrame``, this attribute-style access is not possible.\n", + "For example, if the column names are not strings, or if the column names conflict with methods of the ``DataFrame``, this **attribute-style** access is not possible.\n", "For example, the ``DataFrame`` has a ``pop()`` method, so ``data.pop`` will point to this rather than the ``\"pop\"`` column:" ] }, { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -780,20 +777,31 @@ "source": [ "In particular, you should avoid the temptation to try column assignment via attribute (i.e., use ``data['pop'] = z`` rather than ``data.pop = z``).\n", "\n", - "Like with the ``Series`` objects discussed earlier, this dictionary-style syntax can also be used to modify the object, in this case adding a new column:" + "Like with the ``Series`` objects discussed earlier, this **dictionary-style** syntax can also be used to modify the object, in this case adding a new column:" ] }, { "cell_type": "code", - "execution_count": 23, - "metadata": { - "collapsed": false - }, + "execution_count": 9, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -847,7 +855,7 @@ "Texas 695662 26448193 38.018740" ] }, - "execution_count": 23, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -876,11 +884,22 @@ }, { "cell_type": "code", - "execution_count": 24, - "metadata": { - "collapsed": false - }, + "execution_count": 21, + "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 4.23967000e+05 3.83325210e+07 9.04139261e+01]\n", + " [ 1.70312000e+05 1.95528600e+07 1.14806121e+02]\n", + " [ 1.49995000e+05 1.28821350e+07 8.58837628e+01]\n", + " [ 1.41297000e+05 1.96511270e+07 1.39076746e+02]\n", + " [ 6.95662000e+05 2.64481930e+07 3.80187404e+01]]\n", + "data :> values type tib. \\ ==> ()\n", + ".values 就是個簡單的 ndarray\n" + ] + }, { "data": { "text/plain": [ @@ -891,12 +910,14 @@ " [ 6.95662000e+05, 2.64481930e+07, 3.80187404e+01]])" ] }, - "execution_count": 24, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "%f data :> values . cr\n", + "%f data :> values type tib. \\ .values 就是個簡單的 ndarray\n", "data.values" ] }, @@ -910,15 +931,34 @@ }, { "cell_type": "code", - "execution_count": 25, - "metadata": { - "collapsed": false - }, + "execution_count": 20, + "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ".T 之後,應該還是個 df\n", + "data :> T type tib. \\ ==> ()\n" + ] + }, { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -966,13 +1006,15 @@ "density 9.041393e+01 1.148061e+02 8.588376e+01 1.390767e+02 3.801874e+01" ] }, - "execution_count": 25, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "data.T" + "%f \\ .T 之後,應該還是個 df \n", + "%f data :> T type tib.\n", + "data.T " ] }, { @@ -986,9 +1028,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1015,9 +1055,7 @@ { "cell_type": "code", "execution_count": 27, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1052,15 +1090,34 @@ }, { "cell_type": "code", - "execution_count": 28, - "metadata": { - "collapsed": false - }, + "execution_count": 24, + "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "data :> iloc[:3,:2] type tib. \\ ==> ()\n", + "這樣擷取的還是個 data frame\n" + ] + }, { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -1096,13 +1153,14 @@ "Illinois 149995 12882135" ] }, - "execution_count": 28, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "data.iloc[:3, :2]" + "%f data :> iloc[:3,:2] type tib. \\ 這樣擷取的還是個 data frame \n", + "data.iloc[:3, :2]\n" ] }, { @@ -1115,9 +1173,7 @@ { "cell_type": "code", "execution_count": 29, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1178,15 +1234,40 @@ }, { "cell_type": "code", - "execution_count": 30, - "metadata": { - "collapsed": false - }, + "execution_count": 49, + "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: DeprecationWarning: \n", + ".ix is deprecated. Please use\n", + ".loc for label based indexing or\n", + ".iloc for positional indexing\n", + "\n", + "See the documentation here:\n", + "http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix\n", + " \"\"\"Entry point for launching an IPython kernel.\n" + ] + }, { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -1222,7 +1303,7 @@ "Illinois 149995 12882135" ] }, - "execution_count": 30, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } @@ -1231,11 +1312,85 @@ "data.ix[:3, :'pop']" ] }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "data.ix[:3, :'pop'] 改寫成 data.iloc[:3].loc[:,:'pop'] 就是同樣的意思\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
areapop
California42396738332521
Florida17031219552860
Illinois14999512882135
\n", + "
" + ], + "text/plain": [ + " area pop\n", + "California 423967 38332521\n", + "Florida 170312 19552860\n", + "Illinois 149995 12882135" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%f \\ data.ix[:3, :'pop'] 改寫成 data.iloc[:3].loc[:,:'pop'] 就是同樣的意思\n", + "data.iloc[:3].loc[:,:'pop']" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Keep in mind that for integer indices, the ``ix`` indexer is subject to the same potential sources of confusion as discussed for integer-indexed ``Series`` objects.\n", + "Keep in mind that for integer indices, the ``ix`` indexer is subject to the same potential sources of confusion as discussed for integer-indexed ``Series`` objects. 我有看到警告說 .ix will be deprecated soon 的確,混和的結果又把兩種 style 的模糊性帶回來了!\n", "\n", "Any of the familiar NumPy-style data access patterns can be used within these indexers.\n", "For example, in the ``loc`` indexer we can combine masking and fancy indexing as in the following:" @@ -1243,15 +1398,35 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 71, "metadata": { - "collapsed": false + "scrolled": true }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "column indexing 一定要放在逗點之後\n" + ] + }, { "data": { "text/html": [ "
\n", + "\n", "\n", " \n", " \n", @@ -1281,15 +1456,161 @@ "New York 19651127 139.076746" ] }, - "execution_count": 31, + "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "%f \\ column indexing 一定要放在逗點之後\n", "data.loc[data.density > 100, ['pop', 'density']]" ] }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "column indexing 一定要放在逗點之後,\n", + ". . . 或者要用 fancy indexing 的形式—放在 list 裡面\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
popdensity
Florida19552860114.806121
New York19651127139.076746
\n", + "
" + ], + "text/plain": [ + " pop density\n", + "Florida 19552860 114.806121\n", + "New York 19651127 139.076746" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%f \\ column indexing 一定要放在逗點之後,\n", + "%f \\ . . . 或者要用 fancy indexing 的形式—放在 list 裡面\n", + "data[['pop','density']][data.density > 100]" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "我發現這個問題!下面這個尾巴 .loc[:'亂寫也行'] 等於是 [:] 取所有的 rows !!\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
areapopdensity
Florida17031219552860114.806121
New York14129719651127139.076746
\n", + "
" + ], + "text/plain": [ + " area pop density\n", + "Florida 170312 19552860 114.806121\n", + "New York 141297 19651127 139.076746" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%f \\ 我發現這個問題!下面這個尾巴 .loc[:'亂寫也行'] 等於是 [:] 取所有的 rows !!\n", + "data.loc[data.density > 100].loc[:'area']" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1300,9 +1621,7 @@ { "cell_type": "code", "execution_count": 32, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1390,15 +1709,26 @@ }, { "cell_type": "code", - "execution_count": 33, - "metadata": { - "collapsed": false - }, + "execution_count": 25, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "\n", " \n", " \n", @@ -1431,15 +1761,68 @@ "Illinois 149995 12882135 85.883763" ] }, - "execution_count": 33, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "%f \\ slicing refers to rows\n", "data['Florida':'Illinois']" ] }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "indexing refers to columns 這句話沒講清楚,\n", + "California 90.413926\n", + "Florida 114.806121\n", + "Illinois 85.883763\n", + "New York 139.076746\n", + "Texas 38.018740\n", + "Name: density, dtype: float64\n", + "以上當認得時,先被 df 拿去當作 column name 處理了,所以 OK\n", + "注意!*當作 column-wise 處理* 不是很奇怪嗎?特別是與上面的 slicing 對照起來看。\n", + "所以作者要強調它,反正就是這樣!\n", + "\n", + "Failed in (compiling=False): 'lalalala'\n", + "Body:\n", + "push(pop()['lalalala'])\n", + "以上當 key 不認得時,讓 python 接著處裡就出錯了!\n" + ] + }, + { + "data": { + "text/plain": [ + "California 90.413926\n", + "Florida 114.806121\n", + "Illinois 85.883763\n", + "New York 139.076746\n", + "Texas 38.018740\n", + "Name: density, dtype: float64" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%f \\ indexing refers to columns 這句話沒講清楚,\n", + "%f data :> ['density'] . cr \\ 以上當認得時,先被 df 拿去當作 column name 處理了,所以 OK\n", + "%f \\ 注意!*當作 column-wise 處理* 不是很奇怪嗎?特別是與上面的 slicing 對照起來看。\n", + "%f \\ 所以作者要強調它,反正就是這樣!\n", + "%f data :> ['lalalala'] . cr \n", + "%f \\ 以上當 key 不認得時,讓 python 接著處裡就出錯了!\n", + "data['density']" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1449,15 +1832,28 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 26, "metadata": { - "collapsed": false + "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -1490,7 +1886,7 @@ "Illinois 149995 12882135 85.883763" ] }, - "execution_count": 34, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -1508,15 +1904,28 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 39, "metadata": { - "collapsed": false + "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -1549,7 +1958,7 @@ "New York 141297 19651127 139.076746" ] }, - "execution_count": 35, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -1591,9 +2000,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.1" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/notebooks/03.03-Operations-in-Pandas.ipynb b/notebooks/03.03-Operations-in-Pandas.ipynb index ac4b1eb37..2a315c77a 100644 --- a/notebooks/03.03-Operations-in-Pandas.ipynb +++ b/notebooks/03.03-Operations-in-Pandas.ipynb @@ -1,5 +1,34 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "reDef unknown\n", + "reDef \\\n" + ] + } + ], + "source": [ + "import peforth\n", + "peforth.dictate(r'''\n", + "\\ Now we redefine the 'unknown' command that was do-nothing by default\n", + ": unknown ( token -- thing Y|N) // Try to find the unknown token in __main__\n", + " py> getattr(sys.modules['__main__'],pop(),\"Ûnknôwn\") \n", + " py> str(tos())==\"Ûnknôwn\" if drop false else true then ;\n", + " /// here after, when FORTH come accross an unknown token, instead of alerting \n", + " /// it try to find the token in python __main__ module name space.\n", + "\n", + "\\ Redefine \\ command to print the comment line\n", + "code \\ print(nexttoken('\\n')) end-code \n", + "''');\n" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -63,9 +92,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -91,9 +118,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -163,9 +188,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -196,9 +219,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -285,9 +306,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "area = pd.Series({'Alaska': 1723337, 'Texas': 695662,\n", @@ -306,9 +325,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -339,9 +356,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -369,9 +384,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -405,9 +418,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -440,9 +451,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -491,9 +500,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -552,9 +559,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -620,9 +625,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -708,9 +711,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -733,9 +734,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -766,9 +765,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -837,9 +834,7 @@ { "cell_type": "code", "execution_count": 18, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -907,9 +902,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -932,9 +925,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1025,9 +1016,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.1" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } From f1ece4d265cc9a0c1c2dc3c48828c207b6435ae0 Mon Sep 17 00:00:00 2001 From: "H.C.Chen" Date: Sun, 8 Jul 2018 05:56:16 +0800 Subject: [PATCH 03/13] Revert "bb" This reverts commit 17cd6be68a555cd50a1ead9f72a261ece2695e52. --- Untitled.ipynb | 63 -- .../03.02-Data-Indexing-and-Selection.ipynb | 671 ++++-------------- notebooks/03.03-Operations-in-Pandas.ipynb | 109 +-- 3 files changed, 190 insertions(+), 653 deletions(-) delete mode 100644 Untitled.ipynb diff --git a/Untitled.ipynb b/Untitled.ipynb deleted file mode 100644 index 791c5b373..000000000 --- a/Untitled.ipynb +++ /dev/null @@ -1,63 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "reDef unknown\n", - "reDef \\\n" - ] - } - ], - "source": [ - "import peforth\n", - "peforth.dictate(r'''\n", - "\\ Now we redefine the 'unknown' command that was do-nothing by default\n", - ": unknown ( token -- thing Y|N) // Try to find the unknown token in __main__\n", - " py> getattr(sys.modules['__main__'],pop(),\"Ûnknôwn\") \n", - " py> str(tos())==\"Ûnknôwn\" if drop false else true then ;\n", - " /// here after, when FORTH come accross an unknown token, instead of alerting \n", - " /// it try to find the token in python __main__ module name space.\n", - "\n", - "\\ Redefine \\ command to print the comment line\n", - "code \\ print(nexttoken('\\n')) end-code \n", - "''');\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.1" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/03.02-Data-Indexing-and-Selection.ipynb b/notebooks/03.02-Data-Indexing-and-Selection.ipynb index 416af5063..7707be226 100644 --- a/notebooks/03.02-Data-Indexing-and-Selection.ipynb +++ b/notebooks/03.02-Data-Indexing-and-Selection.ipynb @@ -59,37 +59,10 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "reDef unknown\n", - "reDef \\\n" - ] - } - ], - "source": [ - "import peforth\n", - "peforth.dictate(r'''\n", - "\\ Now we redefine the 'unknown' command that was do-nothing by default\n", - ": unknown ( token -- thing Y|N) // Try to find the unknown token in __main__\n", - " py> getattr(sys.modules['__main__'],pop(),\"Ûnknôwn\") \n", - " py> str(tos())==\"Ûnknôwn\" if drop false else true then ;\n", - " /// here after, when FORTH come accross an unknown token, instead of alerting \n", - " /// it try to find the token in python __main__ module name space.\n", - "\n", - "\\ Redefine \\ command to print the comment line\n", - "code \\ print(nexttoken('\\n')) end-code \n", - "''');\n" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, + "execution_count": 1, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -101,7 +74,7 @@ "dtype: float64" ] }, - "execution_count": 6, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } @@ -116,7 +89,9 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -143,7 +118,9 @@ { "cell_type": "code", "execution_count": 3, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -163,7 +140,9 @@ { "cell_type": "code", "execution_count": 4, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -183,7 +162,9 @@ { "cell_type": "code", "execution_count": 5, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -211,7 +192,9 @@ { "cell_type": "code", "execution_count": 6, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -259,7 +242,9 @@ { "cell_type": "code", "execution_count": 7, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -283,7 +268,9 @@ { "cell_type": "code", "execution_count": 8, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -306,7 +293,9 @@ { "cell_type": "code", "execution_count": 9, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -329,7 +318,9 @@ { "cell_type": "code", "execution_count": 10, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -370,7 +361,9 @@ { "cell_type": "code", "execution_count": 11, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -394,7 +387,9 @@ { "cell_type": "code", "execution_count": 12, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -415,7 +410,9 @@ { "cell_type": "code", "execution_count": 13, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -448,7 +445,9 @@ { "cell_type": "code", "execution_count": 14, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -468,7 +467,9 @@ { "cell_type": "code", "execution_count": 15, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -497,7 +498,9 @@ { "cell_type": "code", "execution_count": 16, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -517,7 +520,9 @@ { "cell_type": "code", "execution_count": 17, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -569,26 +574,15 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": {}, + "execution_count": 18, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", " \n", @@ -636,14 +630,15 @@ "Texas 695662 26448193" ] }, - "execution_count": 8, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "area = pd.Series({'New York': 141297, 'Florida': 170312,\n", - " 'Illinois': 149995, 'California': 423967, 'Texas': 695662})\n", + "area = pd.Series({'California': 423967, 'Texas': 695662,\n", + " 'New York': 141297, 'Florida': 170312,\n", + " 'Illinois': 149995})\n", "pop = pd.Series({'California': 38332521, 'Texas': 26448193,\n", " 'New York': 19651127, 'Florida': 19552860,\n", " 'Illinois': 12882135})\n", @@ -661,7 +656,9 @@ { "cell_type": "code", "execution_count": 19, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -693,7 +690,9 @@ { "cell_type": "code", "execution_count": 20, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -725,7 +724,9 @@ { "cell_type": "code", "execution_count": 21, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -747,14 +748,16 @@ "metadata": {}, "source": [ "Though this is a useful shorthand, keep in mind that it does not work for all cases!\n", - "For example, if the column names are not strings, or if the column names conflict with methods of the ``DataFrame``, this **attribute-style** access is not possible.\n", + "For example, if the column names are not strings, or if the column names conflict with methods of the ``DataFrame``, this attribute-style access is not possible.\n", "For example, the ``DataFrame`` has a ``pop()`` method, so ``data.pop`` will point to this rather than the ``\"pop\"`` column:" ] }, { "cell_type": "code", "execution_count": 22, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -777,31 +780,20 @@ "source": [ "In particular, you should avoid the temptation to try column assignment via attribute (i.e., use ``data['pop'] = z`` rather than ``data.pop = z``).\n", "\n", - "Like with the ``Series`` objects discussed earlier, this **dictionary-style** syntax can also be used to modify the object, in this case adding a new column:" + "Like with the ``Series`` objects discussed earlier, this dictionary-style syntax can also be used to modify the object, in this case adding a new column:" ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": {}, + "execution_count": 23, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", " \n", @@ -855,7 +847,7 @@ "Texas 695662 26448193 38.018740" ] }, - "execution_count": 9, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -884,22 +876,11 @@ }, { "cell_type": "code", - "execution_count": 21, - "metadata": {}, + "execution_count": 24, + "metadata": { + "collapsed": false + }, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[ 4.23967000e+05 3.83325210e+07 9.04139261e+01]\n", - " [ 1.70312000e+05 1.95528600e+07 1.14806121e+02]\n", - " [ 1.49995000e+05 1.28821350e+07 8.58837628e+01]\n", - " [ 1.41297000e+05 1.96511270e+07 1.39076746e+02]\n", - " [ 6.95662000e+05 2.64481930e+07 3.80187404e+01]]\n", - "data :> values type tib. \\ ==> ()\n", - ".values 就是個簡單的 ndarray\n" - ] - }, { "data": { "text/plain": [ @@ -910,14 +891,12 @@ " [ 6.95662000e+05, 2.64481930e+07, 3.80187404e+01]])" ] }, - "execution_count": 21, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "%f data :> values . cr\n", - "%f data :> values type tib. \\ .values 就是個簡單的 ndarray\n", "data.values" ] }, @@ -931,34 +910,15 @@ }, { "cell_type": "code", - "execution_count": 20, - "metadata": {}, + "execution_count": 25, + "metadata": { + "collapsed": false + }, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - ".T 之後,應該還是個 df\n", - "data :> T type tib. \\ ==> ()\n" - ] - }, { "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", " \n", @@ -1006,15 +966,13 @@ "density 9.041393e+01 1.148061e+02 8.588376e+01 1.390767e+02 3.801874e+01" ] }, - "execution_count": 20, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "%f \\ .T 之後,應該還是個 df \n", - "%f data :> T type tib.\n", - "data.T " + "data.T" ] }, { @@ -1028,7 +986,9 @@ { "cell_type": "code", "execution_count": 26, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -1055,7 +1015,9 @@ { "cell_type": "code", "execution_count": 27, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -1090,34 +1052,15 @@ }, { "cell_type": "code", - "execution_count": 24, - "metadata": {}, + "execution_count": 28, + "metadata": { + "collapsed": false + }, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "data :> iloc[:3,:2] type tib. \\ ==> ()\n", - "這樣擷取的還是個 data frame\n" - ] - }, { "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", " \n", @@ -1153,14 +1096,13 @@ "Illinois 149995 12882135" ] }, - "execution_count": 24, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "%f data :> iloc[:3,:2] type tib. \\ 這樣擷取的還是個 data frame \n", - "data.iloc[:3, :2]\n" + "data.iloc[:3, :2]" ] }, { @@ -1173,7 +1115,9 @@ { "cell_type": "code", "execution_count": 29, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -1234,40 +1178,15 @@ }, { "cell_type": "code", - "execution_count": 49, - "metadata": {}, + "execution_count": 30, + "metadata": { + "collapsed": false + }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: DeprecationWarning: \n", - ".ix is deprecated. Please use\n", - ".loc for label based indexing or\n", - ".iloc for positional indexing\n", - "\n", - "See the documentation here:\n", - "http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix\n", - " \"\"\"Entry point for launching an IPython kernel.\n" - ] - }, { "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", " \n", @@ -1303,7 +1222,7 @@ "Illinois 149995 12882135" ] }, - "execution_count": 49, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -1312,85 +1231,11 @@ "data.ix[:3, :'pop']" ] }, - { - "cell_type": "code", - "execution_count": 70, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "data.ix[:3, :'pop'] 改寫成 data.iloc[:3].loc[:,:'pop'] 就是同樣的意思\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
areapop
California42396738332521
Florida17031219552860
Illinois14999512882135
\n", - "
" - ], - "text/plain": [ - " area pop\n", - "California 423967 38332521\n", - "Florida 170312 19552860\n", - "Illinois 149995 12882135" - ] - }, - "execution_count": 70, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%f \\ data.ix[:3, :'pop'] 改寫成 data.iloc[:3].loc[:,:'pop'] 就是同樣的意思\n", - "data.iloc[:3].loc[:,:'pop']" - ] - }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Keep in mind that for integer indices, the ``ix`` indexer is subject to the same potential sources of confusion as discussed for integer-indexed ``Series`` objects. 我有看到警告說 .ix will be deprecated soon 的確,混和的結果又把兩種 style 的模糊性帶回來了!\n", + "Keep in mind that for integer indices, the ``ix`` indexer is subject to the same potential sources of confusion as discussed for integer-indexed ``Series`` objects.\n", "\n", "Any of the familiar NumPy-style data access patterns can be used within these indexers.\n", "For example, in the ``loc`` indexer we can combine masking and fancy indexing as in the following:" @@ -1398,35 +1243,15 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 31, "metadata": { - "scrolled": true + "collapsed": false }, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "column indexing 一定要放在逗點之後\n" - ] - }, { "data": { "text/html": [ "
\n", - "\n", "\n", " \n", " \n", @@ -1456,161 +1281,15 @@ "New York 19651127 139.076746" ] }, - "execution_count": 71, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "%f \\ column indexing 一定要放在逗點之後\n", "data.loc[data.density > 100, ['pop', 'density']]" ] }, - { - "cell_type": "code", - "execution_count": 73, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "column indexing 一定要放在逗點之後,\n", - ". . . 或者要用 fancy indexing 的形式—放在 list 裡面\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
popdensity
Florida19552860114.806121
New York19651127139.076746
\n", - "
" - ], - "text/plain": [ - " pop density\n", - "Florida 19552860 114.806121\n", - "New York 19651127 139.076746" - ] - }, - "execution_count": 73, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%f \\ column indexing 一定要放在逗點之後,\n", - "%f \\ . . . 或者要用 fancy indexing 的形式—放在 list 裡面\n", - "data[['pop','density']][data.density > 100]" - ] - }, - { - "cell_type": "code", - "execution_count": 80, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "我發現這個問題!下面這個尾巴 .loc[:'亂寫也行'] 等於是 [:] 取所有的 rows !!\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
areapopdensity
Florida17031219552860114.806121
New York14129719651127139.076746
\n", - "
" - ], - "text/plain": [ - " area pop density\n", - "Florida 170312 19552860 114.806121\n", - "New York 141297 19651127 139.076746" - ] - }, - "execution_count": 80, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%f \\ 我發現這個問題!下面這個尾巴 .loc[:'亂寫也行'] 等於是 [:] 取所有的 rows !!\n", - "data.loc[data.density > 100].loc[:'area']" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -1621,7 +1300,9 @@ { "cell_type": "code", "execution_count": 32, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -1709,26 +1390,15 @@ }, { "cell_type": "code", - "execution_count": 25, - "metadata": {}, + "execution_count": 33, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { "text/html": [ "
\n", - "\n", "\n", " \n", " \n", @@ -1761,68 +1431,15 @@ "Illinois 149995 12882135 85.883763" ] }, - "execution_count": 25, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "%f \\ slicing refers to rows\n", "data['Florida':'Illinois']" ] }, - { - "cell_type": "code", - "execution_count": 98, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "indexing refers to columns 這句話沒講清楚,\n", - "California 90.413926\n", - "Florida 114.806121\n", - "Illinois 85.883763\n", - "New York 139.076746\n", - "Texas 38.018740\n", - "Name: density, dtype: float64\n", - "以上當認得時,先被 df 拿去當作 column name 處理了,所以 OK\n", - "注意!*當作 column-wise 處理* 不是很奇怪嗎?特別是與上面的 slicing 對照起來看。\n", - "所以作者要強調它,反正就是這樣!\n", - "\n", - "Failed in (compiling=False): 'lalalala'\n", - "Body:\n", - "push(pop()['lalalala'])\n", - "以上當 key 不認得時,讓 python 接著處裡就出錯了!\n" - ] - }, - { - "data": { - "text/plain": [ - "California 90.413926\n", - "Florida 114.806121\n", - "Illinois 85.883763\n", - "New York 139.076746\n", - "Texas 38.018740\n", - "Name: density, dtype: float64" - ] - }, - "execution_count": 98, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%f \\ indexing refers to columns 這句話沒講清楚,\n", - "%f data :> ['density'] . cr \\ 以上當認得時,先被 df 拿去當作 column name 處理了,所以 OK\n", - "%f \\ 注意!*當作 column-wise 處理* 不是很奇怪嗎?特別是與上面的 slicing 對照起來看。\n", - "%f \\ 所以作者要強調它,反正就是這樣!\n", - "%f data :> ['lalalala'] . cr \n", - "%f \\ 以上當 key 不認得時,讓 python 接著處裡就出錯了!\n", - "data['density']" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -1832,28 +1449,15 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 34, "metadata": { - "scrolled": true + "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", " \n", @@ -1886,7 +1490,7 @@ "Illinois 149995 12882135 85.883763" ] }, - "execution_count": 26, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -1904,28 +1508,15 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 35, "metadata": { - "scrolled": true + "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", " \n", @@ -1958,7 +1549,7 @@ "New York 141297 19651127 139.076746" ] }, - "execution_count": 39, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } @@ -2000,9 +1591,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.1" + "version": "3.5.1" } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 0 } diff --git a/notebooks/03.03-Operations-in-Pandas.ipynb b/notebooks/03.03-Operations-in-Pandas.ipynb index 2a315c77a..ac4b1eb37 100644 --- a/notebooks/03.03-Operations-in-Pandas.ipynb +++ b/notebooks/03.03-Operations-in-Pandas.ipynb @@ -1,34 +1,5 @@ { "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "reDef unknown\n", - "reDef \\\n" - ] - } - ], - "source": [ - "import peforth\n", - "peforth.dictate(r'''\n", - "\\ Now we redefine the 'unknown' command that was do-nothing by default\n", - ": unknown ( token -- thing Y|N) // Try to find the unknown token in __main__\n", - " py> getattr(sys.modules['__main__'],pop(),\"Ûnknôwn\") \n", - " py> str(tos())==\"Ûnknôwn\" if drop false else true then ;\n", - " /// here after, when FORTH come accross an unknown token, instead of alerting \n", - " /// it try to find the token in python __main__ module name space.\n", - "\n", - "\\ Redefine \\ command to print the comment line\n", - "code \\ print(nexttoken('\\n')) end-code \n", - "''');\n" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -92,7 +63,9 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -118,7 +91,9 @@ { "cell_type": "code", "execution_count": 3, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -188,7 +163,9 @@ { "cell_type": "code", "execution_count": 4, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -219,7 +196,9 @@ { "cell_type": "code", "execution_count": 5, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -306,7 +285,9 @@ { "cell_type": "code", "execution_count": 6, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "area = pd.Series({'Alaska': 1723337, 'Texas': 695662,\n", @@ -325,7 +306,9 @@ { "cell_type": "code", "execution_count": 7, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -356,7 +339,9 @@ { "cell_type": "code", "execution_count": 8, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -384,7 +369,9 @@ { "cell_type": "code", "execution_count": 9, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -418,7 +405,9 @@ { "cell_type": "code", "execution_count": 10, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -451,7 +440,9 @@ { "cell_type": "code", "execution_count": 11, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -500,7 +491,9 @@ { "cell_type": "code", "execution_count": 12, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -559,7 +552,9 @@ { "cell_type": "code", "execution_count": 13, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -625,7 +620,9 @@ { "cell_type": "code", "execution_count": 14, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -711,7 +708,9 @@ { "cell_type": "code", "execution_count": 15, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -734,7 +733,9 @@ { "cell_type": "code", "execution_count": 16, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -765,7 +766,9 @@ { "cell_type": "code", "execution_count": 17, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -834,7 +837,9 @@ { "cell_type": "code", "execution_count": 18, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -902,7 +907,9 @@ { "cell_type": "code", "execution_count": 19, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -925,7 +932,9 @@ { "cell_type": "code", "execution_count": 20, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -1016,9 +1025,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.1" + "version": "3.5.1" } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 0 } From acf61a6219dfb4fdedb0d140c2ce211bdb6f6608 Mon Sep 17 00:00:00 2001 From: "H.C.Chen" Date: Sun, 8 Jul 2018 06:12:46 +0800 Subject: [PATCH 04/13] cc --- .../03.02-Data-Indexing-and-Selection.ipynb | 671 ++++++++++++++---- notebooks/03.03-Operations-in-Pandas.ipynb | 406 ++++++++--- notebooks/03.04-Missing-Values.ipynb | 137 ++-- 3 files changed, 898 insertions(+), 316 deletions(-) diff --git a/notebooks/03.02-Data-Indexing-and-Selection.ipynb b/notebooks/03.02-Data-Indexing-and-Selection.ipynb index 7707be226..416af5063 100644 --- a/notebooks/03.02-Data-Indexing-and-Selection.ipynb +++ b/notebooks/03.02-Data-Indexing-and-Selection.ipynb @@ -59,10 +59,37 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": false - }, + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "reDef unknown\n", + "reDef \\\n" + ] + } + ], + "source": [ + "import peforth\n", + "peforth.dictate(r'''\n", + "\\ Now we redefine the 'unknown' command that was do-nothing by default\n", + ": unknown ( token -- thing Y|N) // Try to find the unknown token in __main__\n", + " py> getattr(sys.modules['__main__'],pop(),\"Ûnknôwn\") \n", + " py> str(tos())==\"Ûnknôwn\" if drop false else true then ;\n", + " /// here after, when FORTH come accross an unknown token, instead of alerting \n", + " /// it try to find the token in python __main__ module name space.\n", + "\n", + "\\ Redefine \\ command to print the comment line\n", + "code \\ print(nexttoken('\\n')) end-code \n", + "''');\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, "outputs": [ { "data": { @@ -74,7 +101,7 @@ "dtype: float64" ] }, - "execution_count": 1, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -89,9 +116,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -118,9 +143,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -140,9 +163,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -162,9 +183,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -192,9 +211,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -242,9 +259,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -268,9 +283,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -293,9 +306,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -318,9 +329,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -361,9 +370,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -387,9 +394,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -410,9 +415,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -445,9 +448,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -467,9 +468,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -498,9 +497,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -520,9 +517,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -574,15 +569,26 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": false - }, + "execution_count": 8, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -630,15 +636,14 @@ "Texas 695662 26448193" ] }, - "execution_count": 18, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "area = pd.Series({'California': 423967, 'Texas': 695662,\n", - " 'New York': 141297, 'Florida': 170312,\n", - " 'Illinois': 149995})\n", + "area = pd.Series({'New York': 141297, 'Florida': 170312,\n", + " 'Illinois': 149995, 'California': 423967, 'Texas': 695662})\n", "pop = pd.Series({'California': 38332521, 'Texas': 26448193,\n", " 'New York': 19651127, 'Florida': 19552860,\n", " 'Illinois': 12882135})\n", @@ -656,9 +661,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -690,9 +693,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -724,9 +725,7 @@ { "cell_type": "code", "execution_count": 21, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -748,16 +747,14 @@ "metadata": {}, "source": [ "Though this is a useful shorthand, keep in mind that it does not work for all cases!\n", - "For example, if the column names are not strings, or if the column names conflict with methods of the ``DataFrame``, this attribute-style access is not possible.\n", + "For example, if the column names are not strings, or if the column names conflict with methods of the ``DataFrame``, this **attribute-style** access is not possible.\n", "For example, the ``DataFrame`` has a ``pop()`` method, so ``data.pop`` will point to this rather than the ``\"pop\"`` column:" ] }, { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -780,20 +777,31 @@ "source": [ "In particular, you should avoid the temptation to try column assignment via attribute (i.e., use ``data['pop'] = z`` rather than ``data.pop = z``).\n", "\n", - "Like with the ``Series`` objects discussed earlier, this dictionary-style syntax can also be used to modify the object, in this case adding a new column:" + "Like with the ``Series`` objects discussed earlier, this **dictionary-style** syntax can also be used to modify the object, in this case adding a new column:" ] }, { "cell_type": "code", - "execution_count": 23, - "metadata": { - "collapsed": false - }, + "execution_count": 9, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -847,7 +855,7 @@ "Texas 695662 26448193 38.018740" ] }, - "execution_count": 23, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -876,11 +884,22 @@ }, { "cell_type": "code", - "execution_count": 24, - "metadata": { - "collapsed": false - }, + "execution_count": 21, + "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 4.23967000e+05 3.83325210e+07 9.04139261e+01]\n", + " [ 1.70312000e+05 1.95528600e+07 1.14806121e+02]\n", + " [ 1.49995000e+05 1.28821350e+07 8.58837628e+01]\n", + " [ 1.41297000e+05 1.96511270e+07 1.39076746e+02]\n", + " [ 6.95662000e+05 2.64481930e+07 3.80187404e+01]]\n", + "data :> values type tib. \\ ==> ()\n", + ".values 就是個簡單的 ndarray\n" + ] + }, { "data": { "text/plain": [ @@ -891,12 +910,14 @@ " [ 6.95662000e+05, 2.64481930e+07, 3.80187404e+01]])" ] }, - "execution_count": 24, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "%f data :> values . cr\n", + "%f data :> values type tib. \\ .values 就是個簡單的 ndarray\n", "data.values" ] }, @@ -910,15 +931,34 @@ }, { "cell_type": "code", - "execution_count": 25, - "metadata": { - "collapsed": false - }, + "execution_count": 20, + "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ".T 之後,應該還是個 df\n", + "data :> T type tib. \\ ==> ()\n" + ] + }, { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -966,13 +1006,15 @@ "density 9.041393e+01 1.148061e+02 8.588376e+01 1.390767e+02 3.801874e+01" ] }, - "execution_count": 25, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "data.T" + "%f \\ .T 之後,應該還是個 df \n", + "%f data :> T type tib.\n", + "data.T " ] }, { @@ -986,9 +1028,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1015,9 +1055,7 @@ { "cell_type": "code", "execution_count": 27, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1052,15 +1090,34 @@ }, { "cell_type": "code", - "execution_count": 28, - "metadata": { - "collapsed": false - }, + "execution_count": 24, + "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "data :> iloc[:3,:2] type tib. \\ ==> ()\n", + "這樣擷取的還是個 data frame\n" + ] + }, { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -1096,13 +1153,14 @@ "Illinois 149995 12882135" ] }, - "execution_count": 28, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "data.iloc[:3, :2]" + "%f data :> iloc[:3,:2] type tib. \\ 這樣擷取的還是個 data frame \n", + "data.iloc[:3, :2]\n" ] }, { @@ -1115,9 +1173,7 @@ { "cell_type": "code", "execution_count": 29, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1178,15 +1234,40 @@ }, { "cell_type": "code", - "execution_count": 30, - "metadata": { - "collapsed": false - }, + "execution_count": 49, + "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: DeprecationWarning: \n", + ".ix is deprecated. Please use\n", + ".loc for label based indexing or\n", + ".iloc for positional indexing\n", + "\n", + "See the documentation here:\n", + "http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix\n", + " \"\"\"Entry point for launching an IPython kernel.\n" + ] + }, { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -1222,7 +1303,7 @@ "Illinois 149995 12882135" ] }, - "execution_count": 30, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } @@ -1231,11 +1312,85 @@ "data.ix[:3, :'pop']" ] }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "data.ix[:3, :'pop'] 改寫成 data.iloc[:3].loc[:,:'pop'] 就是同樣的意思\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
areapop
California42396738332521
Florida17031219552860
Illinois14999512882135
\n", + "
" + ], + "text/plain": [ + " area pop\n", + "California 423967 38332521\n", + "Florida 170312 19552860\n", + "Illinois 149995 12882135" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%f \\ data.ix[:3, :'pop'] 改寫成 data.iloc[:3].loc[:,:'pop'] 就是同樣的意思\n", + "data.iloc[:3].loc[:,:'pop']" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Keep in mind that for integer indices, the ``ix`` indexer is subject to the same potential sources of confusion as discussed for integer-indexed ``Series`` objects.\n", + "Keep in mind that for integer indices, the ``ix`` indexer is subject to the same potential sources of confusion as discussed for integer-indexed ``Series`` objects. 我有看到警告說 .ix will be deprecated soon 的確,混和的結果又把兩種 style 的模糊性帶回來了!\n", "\n", "Any of the familiar NumPy-style data access patterns can be used within these indexers.\n", "For example, in the ``loc`` indexer we can combine masking and fancy indexing as in the following:" @@ -1243,15 +1398,35 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 71, "metadata": { - "collapsed": false + "scrolled": true }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "column indexing 一定要放在逗點之後\n" + ] + }, { "data": { "text/html": [ "
\n", + "\n", "\n", " \n", " \n", @@ -1281,15 +1456,161 @@ "New York 19651127 139.076746" ] }, - "execution_count": 31, + "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "%f \\ column indexing 一定要放在逗點之後\n", "data.loc[data.density > 100, ['pop', 'density']]" ] }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "column indexing 一定要放在逗點之後,\n", + ". . . 或者要用 fancy indexing 的形式—放在 list 裡面\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
popdensity
Florida19552860114.806121
New York19651127139.076746
\n", + "
" + ], + "text/plain": [ + " pop density\n", + "Florida 19552860 114.806121\n", + "New York 19651127 139.076746" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%f \\ column indexing 一定要放在逗點之後,\n", + "%f \\ . . . 或者要用 fancy indexing 的形式—放在 list 裡面\n", + "data[['pop','density']][data.density > 100]" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "我發現這個問題!下面這個尾巴 .loc[:'亂寫也行'] 等於是 [:] 取所有的 rows !!\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
areapopdensity
Florida17031219552860114.806121
New York14129719651127139.076746
\n", + "
" + ], + "text/plain": [ + " area pop density\n", + "Florida 170312 19552860 114.806121\n", + "New York 141297 19651127 139.076746" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%f \\ 我發現這個問題!下面這個尾巴 .loc[:'亂寫也行'] 等於是 [:] 取所有的 rows !!\n", + "data.loc[data.density > 100].loc[:'area']" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1300,9 +1621,7 @@ { "cell_type": "code", "execution_count": 32, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1390,15 +1709,26 @@ }, { "cell_type": "code", - "execution_count": 33, - "metadata": { - "collapsed": false - }, + "execution_count": 25, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "\n", " \n", " \n", @@ -1431,15 +1761,68 @@ "Illinois 149995 12882135 85.883763" ] }, - "execution_count": 33, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "%f \\ slicing refers to rows\n", "data['Florida':'Illinois']" ] }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "indexing refers to columns 這句話沒講清楚,\n", + "California 90.413926\n", + "Florida 114.806121\n", + "Illinois 85.883763\n", + "New York 139.076746\n", + "Texas 38.018740\n", + "Name: density, dtype: float64\n", + "以上當認得時,先被 df 拿去當作 column name 處理了,所以 OK\n", + "注意!*當作 column-wise 處理* 不是很奇怪嗎?特別是與上面的 slicing 對照起來看。\n", + "所以作者要強調它,反正就是這樣!\n", + "\n", + "Failed in (compiling=False): 'lalalala'\n", + "Body:\n", + "push(pop()['lalalala'])\n", + "以上當 key 不認得時,讓 python 接著處裡就出錯了!\n" + ] + }, + { + "data": { + "text/plain": [ + "California 90.413926\n", + "Florida 114.806121\n", + "Illinois 85.883763\n", + "New York 139.076746\n", + "Texas 38.018740\n", + "Name: density, dtype: float64" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%f \\ indexing refers to columns 這句話沒講清楚,\n", + "%f data :> ['density'] . cr \\ 以上當認得時,先被 df 拿去當作 column name 處理了,所以 OK\n", + "%f \\ 注意!*當作 column-wise 處理* 不是很奇怪嗎?特別是與上面的 slicing 對照起來看。\n", + "%f \\ 所以作者要強調它,反正就是這樣!\n", + "%f data :> ['lalalala'] . cr \n", + "%f \\ 以上當 key 不認得時,讓 python 接著處裡就出錯了!\n", + "data['density']" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1449,15 +1832,28 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 26, "metadata": { - "collapsed": false + "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -1490,7 +1886,7 @@ "Illinois 149995 12882135 85.883763" ] }, - "execution_count": 34, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -1508,15 +1904,28 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 39, "metadata": { - "collapsed": false + "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -1549,7 +1958,7 @@ "New York 141297 19651127 139.076746" ] }, - "execution_count": 35, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -1591,9 +2000,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.1" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/notebooks/03.03-Operations-in-Pandas.ipynb b/notebooks/03.03-Operations-in-Pandas.ipynb index ac4b1eb37..6a0806463 100644 --- a/notebooks/03.03-Operations-in-Pandas.ipynb +++ b/notebooks/03.03-Operations-in-Pandas.ipynb @@ -1,5 +1,34 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "reDef unknown\n", + "reDef \\\n" + ] + } + ], + "source": [ + "import peforth\n", + "peforth.dictate(r'''\n", + "\\ Now we redefine the 'unknown' command that was do-nothing by default\n", + ": unknown ( token -- thing Y|N) // Try to find the unknown token in __main__\n", + " py> getattr(sys.modules['__main__'],pop(),\"Ûnknôwn\") \n", + " py> str(tos())==\"Ûnknôwn\" if drop false else true then ;\n", + " /// here after, when FORTH come accross an unknown token, instead of alerting \n", + " /// it try to find the token in python __main__ module name space.\n", + "\n", + "\\ Redefine \\ command to print the comment line\n", + "code \\ print(nexttoken('\\n')) end-code \n", + "''');\n" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -33,7 +62,8 @@ "One of the essential pieces of NumPy is the ability to perform quick element-wise operations, both with basic arithmetic (addition, subtraction, multiplication, etc.) and with more sophisticated operations (trigonometric functions, exponential and logarithmic functions, etc.).\n", "Pandas inherits much of this functionality from NumPy, and the ufuncs that we introduced in [Computation on NumPy Arrays: Universal Functions](02.03-Computation-on-arrays-ufuncs.ipynb) are key to this.\n", "\n", - "Pandas includes a couple useful twists, however: for unary operations like negation and trigonometric functions, these ufuncs will *preserve index and column labels* in the output, and for binary operations such as addition and multiplication, Pandas will automatically *align indices* when passing the objects to the ufunc.\n", + "Pandas includes a couple useful twists, however: for unary operations like negation and trigonometric functions, these ufuncs will *preserve index and column labels* in the output, and for binary operations such as addition and multiplication, Pandas will automatically *align indices* when passing the objects to the ufunc. 不知所云,往下看吧。。。\n", + "\n", "This means that keeping the context of data and combining data from different sources–both potentially error-prone tasks with raw NumPy arrays–become essentially foolproof ones with Pandas.\n", "We will additionally see that there are well-defined operations between one-dimensional ``Series`` structures and two-dimensional ``DataFrame`` structures." ] @@ -51,9 +81,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", @@ -62,10 +90,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false - }, + "execution_count": 5, + "metadata": {}, "outputs": [ { "data": { @@ -74,31 +100,42 @@ "1 3\n", "2 7\n", "3 4\n", - "dtype: int64" + "dtype: int32" ] }, - "execution_count": 2, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "rng = np.random.RandomState(42)\n", - "ser = pd.Series(rng.randint(0, 10, 4))\n", + "rng = np.random.RandomState(42) named # 42 是 seed, 取得產生器 rng.\n", + "ser = pd.Series(rng.randint(0, 10, 4)) # ser 就是個 fancy array\n", "ser" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, + "execution_count": 9, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -142,7 +179,7 @@ "2 7 2 5 4" ] }, - "execution_count": 3, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -162,11 +199,20 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, + "execution_count": 7, + "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 6\n", + "1 3\n", + "2 7\n", + "3 4\n", + "dtype: int32\n" + ] + }, { "data": { "text/plain": [ @@ -177,12 +223,13 @@ "dtype: float64" ] }, - "execution_count": 4, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "%f ser . cr\n", "np.exp(ser)" ] }, @@ -195,15 +242,36 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, + "execution_count": 10, + "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " A B C D\n", + "0 6 9 2 6\n", + "1 7 4 3 7\n", + "2 7 2 5 4\n" + ] + }, { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -247,12 +315,13 @@ "2 -0.707107 1.000000e+00 -0.707107 1.224647e-16" ] }, - "execution_count": 5, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "%f df . cr\n", "np.sin(df * np.pi / 4)" ] }, @@ -284,10 +353,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": false - }, + "execution_count": 21, + "metadata": {}, "outputs": [], "source": [ "area = pd.Series({'Alaska': 1723337, 'Texas': 695662,\n", @@ -305,10 +372,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": false - }, + "execution_count": 22, + "metadata": {}, "outputs": [ { "data": { @@ -320,7 +385,7 @@ "dtype: float64" ] }, - "execution_count": 7, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -338,10 +403,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false - }, + "execution_count": 23, + "metadata": {}, "outputs": [ { "data": { @@ -349,7 +412,7 @@ "Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')" ] }, - "execution_count": 8, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -358,6 +421,41 @@ "area.index | population.index" ] }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "我都還不知道 set 運算可以這樣做\n", + "{1, 2, 3, 4, 5}\n", + "peforth 的 OR 就是 python 的 bitwise | 運算\n", + "如過這樣 list(s1) | list(s2) 就是 TypeError: unsupported operand types\n" + ] + }, + { + "data": { + "text/plain": [ + "{1, 2, 3, 4, 5}" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%f \\ 我都還不知道 set 運算可以這樣做\n", + "s1 = {1,2,4}\n", + "s2 = {1,3,5}\n", + "%f s1 s2 OR . cr \\ peforth 的 OR 就是 python 的 bitwise | 運算\n", + "%f \\ 如過這樣 list(s1) | list(s2) 就是 TypeError: unsupported operand types\n", + "s1 | s2" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -368,10 +466,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false - }, + "execution_count": 25, + "metadata": {}, "outputs": [ { "data": { @@ -383,7 +479,7 @@ "dtype: float64" ] }, - "execution_count": 9, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -404,10 +500,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": false - }, + "execution_count": 26, + "metadata": {}, "outputs": [ { "data": { @@ -419,7 +513,7 @@ "dtype: float64" ] }, - "execution_count": 10, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -439,15 +533,26 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": false - }, + "execution_count": 27, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -477,7 +582,7 @@ "1 5 1" ] }, - "execution_count": 11, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -490,15 +595,26 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": false - }, + "execution_count": 28, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -538,7 +654,7 @@ "2 9 2 6" ] }, - "execution_count": 12, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -551,15 +667,26 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": false - }, + "execution_count": 29, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -599,7 +726,7 @@ "2 NaN NaN NaN" ] }, - "execution_count": 13, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -619,15 +746,26 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "collapsed": false - }, + "execution_count": 30, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -667,7 +805,7 @@ "2 6.5 13.5 10.5" ] }, - "execution_count": 14, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -707,10 +845,8 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "collapsed": false - }, + "execution_count": 31, + "metadata": {}, "outputs": [ { "data": { @@ -720,7 +856,7 @@ " [6, 1, 3, 8]])" ] }, - "execution_count": 15, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -732,10 +868,8 @@ }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "collapsed": false - }, + "execution_count": 32, + "metadata": {}, "outputs": [ { "data": { @@ -745,7 +879,7 @@ " [ 3, -7, 1, 4]])" ] }, - "execution_count": 16, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -765,15 +899,26 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "collapsed": false - }, + "execution_count": 33, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -817,7 +962,7 @@ "2 3 -7 1 4" ] }, - "execution_count": 17, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -836,15 +981,41 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": false - }, + "execution_count": 38, + "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Q R S T\n", + "0 3 8 2 4\n", + "1 2 6 4 8\n", + "2 6 1 3 8\n", + "\n", + "0 8\n", + "1 6\n", + "2 1\n", + "Name: R, dtype: int32\n" + ] + }, { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -888,12 +1059,14 @@ "2 5 0 2 7" ] }, - "execution_count": 18, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "%f df . cr cr\n", + "%f df :> ['R'] . cr\n", "df.subtract(df['R'], axis=0)" ] }, @@ -906,20 +1079,18 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": { - "collapsed": false - }, + "execution_count": 39, + "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Q 3\n", "S 2\n", - "Name: 0, dtype: int64" + "Name: 0, dtype: int32" ] }, - "execution_count": 19, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -931,15 +1102,38 @@ }, { "cell_type": "code", - "execution_count": 20, - "metadata": { - "collapsed": false - }, + "execution_count": 44, + "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Q R S T\n", + "0 3 8 2 4\n", + "1 2 6 4 8\n", + "2 6 1 3 8\n", + "\n", + "只針對 Q S 兩欄做 broadcasting, 所以 R T 欄都變成 NaN\n" + ] + }, { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -983,13 +1177,15 @@ "2 3.0 NaN 1.0 NaN" ] }, - "execution_count": 20, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df - halfrow" + "%f df . cr cr\n", + "%f \\ 只針對 Q S 兩欄做 broadcasting, 所以 R T 欄都變成 NaN\n", + "df - halfrow\n" ] }, { @@ -1025,9 +1221,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.0" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/notebooks/03.04-Missing-Values.ipynb b/notebooks/03.04-Missing-Values.ipynb index 6cbef56f9..acc0e853b 100644 --- a/notebooks/03.04-Missing-Values.ipynb +++ b/notebooks/03.04-Missing-Values.ipynb @@ -1,5 +1,34 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "reDef unknown\n", + "reDef \\\n" + ] + } + ], + "source": [ + "import peforth\n", + "peforth.dictate(r'''\n", + "\\ Now we redefine the 'unknown' command that was do-nothing by default\n", + ": unknown ( token -- thing Y|N) // Try to find the unknown token in __main__\n", + " py> getattr(sys.modules['__main__'],pop(),\"Ûnknôwn\") \n", + " py> str(tos())==\"Ûnknôwn\" if drop false else true then ;\n", + " /// here after, when FORTH come accross an unknown token, instead of alerting \n", + " /// it try to find the token in python __main__ module name space.\n", + "\n", + "\\ Redefine \\ command to print the comment line\n", + "code \\ print(nexttoken('\\n')) end-code \n", + "''');\n" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -101,9 +130,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -132,9 +159,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -166,9 +191,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "ename": "TypeError", @@ -206,9 +229,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -238,9 +259,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -260,9 +279,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -289,9 +306,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -318,9 +333,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -356,9 +369,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -390,9 +401,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -415,9 +424,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -499,9 +506,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -532,9 +537,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -574,9 +577,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -606,9 +607,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -678,9 +677,7 @@ { "cell_type": "code", "execution_count": 18, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -730,9 +727,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -792,9 +787,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -856,9 +849,7 @@ { "cell_type": "code", "execution_count": 21, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -922,9 +913,7 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -989,9 +978,7 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1024,9 +1011,7 @@ { "cell_type": "code", "execution_count": 24, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1058,9 +1043,7 @@ { "cell_type": "code", "execution_count": 25, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1093,9 +1076,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1130,9 +1111,7 @@ { "cell_type": "code", "execution_count": 27, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1193,9 +1172,7 @@ { "cell_type": "code", "execution_count": 28, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1286,9 +1263,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.0" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } From 2305dbdccf953271a338065d6a785855b917c666 Mon Sep 17 00:00:00 2001 From: "H.C.Chen" Date: Sun, 8 Jul 2018 16:32:30 +0800 Subject: [PATCH 05/13] dd --- notebooks/03.04-Missing-Values.ipynb | 234 +++- notebooks/03.05-Hierarchical-Indexing.ipynb | 1126 ++++++++++--------- 2 files changed, 818 insertions(+), 542 deletions(-) diff --git a/notebooks/03.04-Missing-Values.ipynb b/notebooks/03.04-Missing-Values.ipynb index acc0e853b..1e1cbbf32 100644 --- a/notebooks/03.04-Missing-Values.ipynb +++ b/notebooks/03.04-Missing-Values.ipynb @@ -117,7 +117,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": { "collapsed": true }, @@ -394,7 +394,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "For types that don't have an available sentinel value, Pandas automatically type-casts when NA values are present.\n", + "For types that don't have an available sentinel value, Pandas automatically type-casts when NA values are present. \n", + "我看過這種情形,還正在奇怪為何該資料無法用 integer? 原來是其中有漏的被整個提升為 float 以便適應有漏洞的地方!\n", "For example, if we set a value in an integer array to ``np.nan``, it will automatically be upcast to a floating-point type to accommodate the NA:" ] }, @@ -451,6 +452,8 @@ "Notice that in addition to casting the integer array to floating point, Pandas automatically converts the ``None`` to a ``NaN`` value.\n", "(Be aware that there is a proposal to add a native integer NA to Pandas in the future; as of this writing, it has not been included).\n", "\n", + "我覺得把各種 integer 提升 upcast 到 int64 就有機會用最小負數來當 NA.\n", + "\n", "While this type of magic may feel a bit hackish compared to the more unified approach to NA values in domain-specific languages like R, the Pandas sentinel/casting approach works quite well in practice and in my experience only rarely causes issues.\n", "\n", "The following table lists the upcasting conventions in Pandas when NA values are introduced:\n", @@ -606,13 +609,26 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -652,7 +668,7 @@ "2 NaN 4.0 6" ] }, - "execution_count": 17, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -676,13 +692,26 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -708,7 +737,7 @@ "1 2.0 3.0 5" ] }, - "execution_count": 18, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -717,6 +746,118 @@ "df.dropna()" ] }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
012
12.03.05
\n", + "
" + ], + "text/plain": [ + " 0 1 2\n", + "1 2.0 3.0 5" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dropna(axis='rows')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
012
12.03.05
\n", + "
" + ], + "text/plain": [ + " 0 1 2\n", + "1 2.0 3.0 5" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dropna(axis=0)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -726,13 +867,26 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "\n", " \n", " \n", @@ -764,7 +918,7 @@ "2 6" ] }, - "execution_count": 19, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -773,6 +927,68 @@ "df.dropna(axis='columns')" ] }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
2
02
15
26
\n", + "
" + ], + "text/plain": [ + " 2\n", + "0 2\n", + "1 5\n", + "2 6" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dropna(axis=1)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1263,7 +1479,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.0" + "version": "3.6.1" } }, "nbformat": 4, diff --git a/notebooks/03.05-Hierarchical-Indexing.ipynb b/notebooks/03.05-Hierarchical-Indexing.ipynb index 43e3475c4..4a6298733 100644 --- a/notebooks/03.05-Hierarchical-Indexing.ipynb +++ b/notebooks/03.05-Hierarchical-Indexing.ipynb @@ -1,11 +1,37 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "reDef unknown\n", + "reDef \\\n" + ] + } + ], + "source": [ + "import peforth\n", + "peforth.dictate(r'''\n", + "\\ Now we redefine the 'unknown' command that was do-nothing by default\n", + ": unknown ( token -- thing Y|N) // Try to find the unknown token in __main__\n", + " py> getattr(sys.modules['__main__'],pop(),\"Ûnknôwn\") \n", + " py> str(tos())==\"Ûnknôwn\" if drop false else true then ;\n", + " /// here after, when FORTH come accross an unknown token, instead of alerting \n", + " /// it try to find the token in python __main__ module name space.\n", + "\n", + "\\ Redefine \\ command to print the comment line\n", + "code \\ print(nexttoken('\\n')) end-code \n", + "''');\n" + ] + }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "\n", "\n", @@ -16,10 +42,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "\n", "< [Handling Missing Data](03.04-Missing-Values.ipynb) | [Contents](Index.ipynb) | [Combining Datasets: Concat and Append](03.06-Concat-And-Append.ipynb) >" @@ -34,10 +57,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Up to this point we've been focused primarily on one-dimensional and two-dimensional data, stored in Pandas ``Series`` and ``DataFrame`` objects, respectively.\n", "Often it is useful to go beyond this and store higher-dimensional data–that is, data indexed by more than one or two keys.\n", @@ -51,11 +71,9 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -65,10 +83,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## A Multiply Indexed Series\n", "\n", @@ -78,10 +93,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "### The bad way\n", "\n", @@ -91,12 +103,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 3, + "metadata": {}, "outputs": [ { "data": { @@ -110,7 +118,7 @@ "dtype: int64" ] }, - "execution_count": 2, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -128,21 +136,16 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "With this indexing scheme, you can straightforwardly index or slice the series based on this multiple index:" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": { - "collapsed": false, - "deletable": true, - "editable": true + "scrolled": true }, "outputs": [ { @@ -155,7 +158,7 @@ "dtype: int64" ] }, - "execution_count": 3, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -166,10 +169,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "But the convenience ends there. For example, if you need to select all values from 2010, you'll need to do some messy (and potentially slow) munging to make it happen:" ] @@ -177,11 +177,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -203,20 +199,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "This produces the desired result, but is not as clean (or as efficient for large datasets) as the slicing syntax we've grown to love in Pandas." ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "### The Better Way: Pandas MultiIndex\n", "Fortunately, Pandas provides a better way.\n", @@ -227,11 +217,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -252,10 +238,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Notice that the ``MultiIndex`` contains multiple *levels* of indexing–in this case, the state names and the years, as well as multiple *labels* for each data point which encode these levels.\n", "\n", @@ -265,11 +248,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -295,10 +274,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Here the first two columns of the ``Series`` representation show the multiple index values, while the third column shows the data.\n", "Notice that some entries are missing in the first column: in this multi-index representation, any blank entry indicates the same value as the line above it." @@ -306,10 +282,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Now to access all data for which the second index is 2010, we can simply use the Pandas slicing notation:" ] @@ -317,11 +290,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -341,12 +310,55 @@ "pop[:, 2010]" ] }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2000 33871648\n", + "2010 37253956\n", + "dtype: int64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pop['California']" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "California 2000 33871648\n", + " 2010 37253956\n", + "New York 2000 18976457\n", + " 2010 19378102\n", + "dtype: int64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pop['California':'New York']" + ] + }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "The result is a singly indexed array with just the keys we're interested in.\n", "This syntax is much more convenient (and the operation is much more efficient!) than the home-spun tuple-based multi-indexing solution that we started with.\n", @@ -355,10 +367,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "### MultiIndex as extra dimension\n", "\n", @@ -366,14 +375,28 @@ "In fact, Pandas is built with this equivalence in mind. The ``unstack()`` method will quickly convert a multiply indexed ``Series`` into a conventionally indexed ``DataFrame``:" ] }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "it is a pd Series\n" + ] + } + ], + "source": [ + "%f pop type . cr \\ it is a pd Series" + ] + }, { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -426,10 +449,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Naturally, the ``stack()`` method provides the opposite operation:" ] @@ -437,11 +457,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -466,10 +482,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Seeing this, you might wonder why would we would bother with hierarchical indexing at all.\n", "The reason is simple: just as we were able to use multi-indexing to represent two-dimensional data within a one-dimensional ``Series``, we can also use it to represent data of three or more dimensions in a ``Series`` or ``DataFrame``.\n", @@ -478,17 +491,26 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 14, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "\n", " \n", " \n", @@ -530,23 +552,23 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
2010251455616879014687908714
\n", "
" ], "text/plain": [ - " total under18\n", - "California 2000 33871648 9267089\n", - " 2010 37253956 9284094\n", - "New York 2000 18976457 4687374\n", - " 2010 19378102 4318033\n", - "Texas 2000 20851820 5906301\n", - " 2010 25145561 6879014" + " total under18\n", + "California 2000 33871648 9267089\n", + " 2010 37253956 9284094\n", + "New York 2000 18976457 4687374\n", + " 2010 19378102 4318033\n", + "Texas 2000 20851820 5906301\n", + " 2010 25145561 687908714" ] }, - "execution_count": 10, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -555,16 +577,13 @@ "pop_df = pd.DataFrame({'total': pop,\n", " 'under18': [9267089, 9284094,\n", " 4687374, 4318033,\n", - " 5906301, 6879014]})\n", + " 5906301, 687908714]})\n", "pop_df" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "In addition, all the ufuncs and other functionality discussed in [Operating on Data in Pandas](03.03-Operations-in-Pandas.ipynb) work with hierarchical indices as well.\n", "Here we compute the fraction of people under 18 by year, given the above data:" @@ -573,11 +592,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -630,20 +645,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "This allows us to easily and quickly manipulate and explore even high-dimensional data." ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## Methods of MultiIndex Creation\n", "\n", @@ -653,11 +662,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -721,10 +726,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "The work of creating the ``MultiIndex`` is done in the background.\n", "\n", @@ -734,11 +736,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -769,20 +767,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Nevertheless, it is sometimes useful to explicitly create a ``MultiIndex``; we'll see a couple of these methods here." ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "### Explicit MultiIndex constructors\n", "\n", @@ -792,12 +784,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 15, + "metadata": {}, "outputs": [ { "data": { @@ -806,7 +794,7 @@ " labels=[[0, 0, 1, 1], [0, 1, 0, 1]])" ] }, - "execution_count": 14, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -817,10 +805,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "You can construct it from a list of tuples giving the multiple index values of each point:" ] @@ -828,11 +813,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -852,10 +833,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "You can even construct it from a Cartesian product of single indices:" ] @@ -863,11 +841,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -887,10 +861,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Similarly, you can construct the ``MultiIndex`` directly using its internal encoding by passing ``levels`` (a list of lists containing available index values for each level) and ``labels`` (a list of lists that reference these labels):" ] @@ -898,11 +869,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -923,20 +890,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Any of these objects can be passed as the ``index`` argument when creating a ``Series`` or ``Dataframe``, or be passed to the ``reindex`` method of an existing ``Series`` or ``DataFrame``." ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "### MultiIndex level names\n", "\n", @@ -947,11 +908,7 @@ { "cell_type": "code", "execution_count": 18, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -978,20 +935,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "With more involved datasets, this can be a useful way to keep track of the meaning of various index values." ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "### MultiIndex for columns\n", "\n", @@ -1001,17 +952,26 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 25, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "\n", " \n", " \n", @@ -1046,40 +1006,40 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
2013131.038.732.036.735.024.037.246.035.859.035.9
244.037.750.035.029.036.733.035.937.036.645.038.1
2014130.037.439.037.861.050.036.932.036.527.038.7
247.037.848.037.351.036.540.037.044.038.135.038.2
\n", @@ -1089,13 +1049,13 @@ "subject Bob Guido Sue \n", "type HR Temp HR Temp HR Temp\n", "year visit \n", - "2013 1 31.0 38.7 32.0 36.7 35.0 37.2\n", - " 2 44.0 37.7 50.0 35.0 29.0 36.7\n", - "2014 1 30.0 37.4 39.0 37.8 61.0 36.9\n", - " 2 47.0 37.8 48.0 37.3 51.0 36.5" + "2013 1 24.0 37.2 46.0 35.8 59.0 35.9\n", + " 2 33.0 35.9 37.0 36.6 45.0 38.1\n", + "2014 1 50.0 36.9 32.0 36.5 27.0 38.7\n", + " 2 40.0 37.0 44.0 38.1 35.0 38.2" ] }, - "execution_count": 19, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -1119,10 +1079,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Here we see where the multi-indexing for both rows and columns can come in *very* handy.\n", "This is fundamentally four-dimensional data, where the dimensions are the subject, the measurement type, the year, and the visit number.\n", @@ -1131,17 +1088,28 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 26, "metadata": { - "collapsed": false, - "deletable": true, - "editable": true + "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "\n", " \n", " \n", @@ -1161,24 +1129,24 @@ " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", "
2013132.036.746.035.8
250.035.037.036.6
2014139.037.832.036.5
248.037.344.038.1
\n", @@ -1187,13 +1155,13 @@ "text/plain": [ "type HR Temp\n", "year visit \n", - "2013 1 32.0 36.7\n", - " 2 50.0 35.0\n", - "2014 1 39.0 37.8\n", - " 2 48.0 37.3" + "2013 1 46.0 35.8\n", + " 2 37.0 36.6\n", + "2014 1 32.0 36.5\n", + " 2 44.0 38.1" ] }, - "execution_count": 20, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -1203,21 +1171,94 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 38, "metadata": { - "deletable": true, - "editable": true + "scrolled": true }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---- it's type ----\n", + "\n", + "\n", + "---- see index ----\n", + "MultiIndex(levels=[[2013, 2014], [1, 2]],\n", + " labels=[[0, 0, 1, 1], [0, 1, 0, 1]],\n", + " names=['year', 'visit'])\n", + "\n", + "---- see columns ----\n", + "Index(['HR', 'Temp'], dtype='object', name='type')\n", + "\n", + "---- see it ----\n", + "type HR Temp\n", + "year visit \n", + "2013 1 46.0 35.8\n", + " 2 37.0 36.6\n", + "2014 1 32.0 36.5\n", + " 2 44.0 38.1\n" + ] + } + ], "source": [ - "For complicated records containing multiple labeled measurements across multiple times for many subjects (people, countries, cities, etc.) use of hierarchical rows and columns can be extremely convenient!" + "%f health_data :> ['Guido'] \n", + "%f \\ ---- it's type ---- \n", + "%f dup type . cr cr\n", + "%f \\ ---- see index ----\n", + "%f dup :> index . cr cr\n", + "%f \\ ---- see columns ----\n", + "%f dup :> columns . cr cr\n", + "%f \\ ---- see it ----\n", + "%f . cr dropall" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 41, "metadata": { - "deletable": true, - "editable": true + "scrolled": true }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "既然它是個 df 那麼尾綴 ['HR'] 就是抓出這個 column\n" + ] + }, + { + "data": { + "text/plain": [ + "year visit\n", + "2013 1 46.0\n", + " 2 37.0\n", + "2014 1 32.0\n", + " 2 44.0\n", + "Name: HR, dtype: float64" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%f \\ 既然它是個 df 那麼尾綴 ['HR'] 就是抓出這個 column \n", + "health_data['Guido']['HR']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For complicated records containing multiple labeled measurements across multiple times for many subjects (people, countries, cities, etc.) use of hierarchical rows and columns can be extremely convenient!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, "source": [ "## Indexing and Slicing a MultiIndex\n", "\n", @@ -1227,10 +1268,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "### Multiply indexed Series\n", "\n", @@ -1240,11 +1278,7 @@ { "cell_type": "code", "execution_count": 21, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -1270,10 +1304,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "We can access single elements by indexing with multiple terms:" ] @@ -1281,11 +1312,7 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -1304,10 +1331,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "The ``MultiIndex`` also supports *partial indexing*, or indexing just one of the levels in the index.\n", "The result is another ``Series``, with the lower-level indices maintained:" @@ -1316,11 +1340,7 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -1342,10 +1362,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Partial slicing is available as well, as long as the ``MultiIndex`` is sorted (see discussion in [Sorted and Unsorted Indices](#Sorted-and-unsorted-indices)):" ] @@ -1353,11 +1370,7 @@ { "cell_type": "code", "execution_count": 24, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -1381,10 +1394,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "With sorted indices, partial indexing can be performed on lower levels by passing an empty slice in the first index:" ] @@ -1392,11 +1402,7 @@ { "cell_type": "code", "execution_count": 25, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -1419,10 +1425,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Other types of indexing and selection (discussed in [Data Indexing and Selection](03.02-Data-Indexing-and-Selection.ipynb)) work as well; for example, selection based on Boolean masks:" ] @@ -1430,11 +1433,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -1457,10 +1456,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Selection based on fancy indexing also works:" ] @@ -1468,11 +1464,7 @@ { "cell_type": "code", "execution_count": 27, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -1496,10 +1488,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "### Multiply indexed DataFrames\n", "\n", @@ -1510,11 +1499,7 @@ { "cell_type": "code", "execution_count": 28, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -1614,10 +1599,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Remember that columns are primary in a ``DataFrame``, and the syntax used for multiply indexed ``Series`` applies to the columns.\n", "For example, we can recover Guido's heart rate data with a simple operation:" @@ -1626,11 +1608,7 @@ { "cell_type": "code", "execution_count": 29, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -1654,27 +1632,33 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Also, as with the single-index case, we can use the ``loc``, ``iloc``, and ``ix`` indexers introduced in [Data Indexing and Selection](03.02-Data-Indexing-and-Selection.ipynb). For example:" ] }, { "cell_type": "code", - "execution_count": 30, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 44, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "\n", " \n", " \n", @@ -1699,13 +1683,13 @@ " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", "
2013131.038.724.037.2
244.037.733.035.9
\n", @@ -1715,11 +1699,11 @@ "subject Bob \n", "type HR Temp\n", "year visit \n", - "2013 1 31.0 38.7\n", - " 2 44.0 37.7" + "2013 1 24.0 37.2\n", + " 2 33.0 35.9" ] }, - "execution_count": 30, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } @@ -1728,12 +1712,105 @@ "health_data.iloc[:2, :2]" ] }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "故意搞個意義上有點怪異的\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subjectBobGuido
typeHRTempHR
yearvisit
2013124.037.246.0
233.035.937.0
2014150.036.932.0
\n", + "
" + ], + "text/plain": [ + "subject Bob Guido\n", + "type HR Temp HR\n", + "year visit \n", + "2013 1 24.0 37.2 46.0\n", + " 2 33.0 35.9 37.0\n", + "2014 1 50.0 36.9 32.0" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%f \\ 故意搞個意義上有點怪異的\n", + "health_data.iloc[:3, :3]" + ] + }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "These indexers provide an array-like view of the underlying two-dimensional data, but each individual index in ``loc`` or ``iloc`` can be passed a tuple of multiple indices. For example:" ] @@ -1742,9 +1819,7 @@ "cell_type": "code", "execution_count": 31, "metadata": { - "collapsed": false, - "deletable": true, - "editable": true + "scrolled": true }, "outputs": [ { @@ -1768,11 +1843,106 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 49, "metadata": { - "deletable": true, - "editable": true + "scrolled": true }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "如果改成這樣 'HR' 就會是個不認得的人名,\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subjectBob
typeHRTemp
yearvisit
2013124.037.2
233.035.9
2014150.036.9
240.037.0
\n", + "
" + ], + "text/plain": [ + "subject Bob \n", + "type HR Temp\n", + "year visit \n", + "2013 1 24.0 37.2\n", + " 2 33.0 35.9\n", + "2014 1 50.0 36.9\n", + " 2 40.0 37.0" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%f \\ 如果改成這樣 'HR' 就會是個不認得的人名,\n", + "health_data.loc[:, ['Bob', 'HR or 亂寫,反正就是不認得']]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, "source": [ "Working with slices within these index tuples is not especially convenient; trying to create a slice within a tuple will lead to a syntax error:" ] @@ -1780,11 +1950,7 @@ { "cell_type": "code", "execution_count": 32, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "ename": "SyntaxError", @@ -1801,10 +1967,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "You could get around this by building the desired slice explicitly using Python's built-in ``slice()`` function, but a better way in this context is to use an ``IndexSlice`` object, which Pandas provides for precisely this situation.\n", "For example:" @@ -1813,11 +1976,7 @@ { "cell_type": "code", "execution_count": 33, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -1886,20 +2045,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "There are so many ways to interact with data in multiply indexed ``Series`` and ``DataFrame``s, and as with many tools in this book the best way to become familiar with them is to try them out!" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## Rearranging Multi-Indices\n", "\n", @@ -1910,10 +2063,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "### Sorted and unsorted indices\n", "\n", @@ -1927,11 +2077,7 @@ { "cell_type": "code", "execution_count": 34, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -1960,10 +2106,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "If we try to take a partial slice of this index, it will result in an error:" ] @@ -1971,11 +2114,7 @@ { "cell_type": "code", "execution_count": 35, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1996,10 +2135,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Although it is not entirely clear from the error message, this is the result of the MultiIndex not being sorted.\n", "For various reasons, partial slices and other similar operations require the levels in the ``MultiIndex`` to be in sorted (i.e., lexographical) order.\n", @@ -2010,11 +2146,7 @@ { "cell_type": "code", "execution_count": 36, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -2041,10 +2173,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "With the index sorted in this way, partial slicing will work as expected:" ] @@ -2052,11 +2181,7 @@ { "cell_type": "code", "execution_count": 37, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -2080,10 +2205,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "### Stacking and unstacking indices\n", "\n", @@ -2093,11 +2215,7 @@ { "cell_type": "code", "execution_count": 38, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -2154,11 +2272,7 @@ { "cell_type": "code", "execution_count": 39, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -2216,10 +2330,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "The opposite of ``unstack()`` is ``stack()``, which here can be used to recover the original series:" ] @@ -2227,11 +2338,7 @@ { "cell_type": "code", "execution_count": 40, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -2257,10 +2364,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "### Index setting and resetting\n", "\n", @@ -2272,11 +2376,7 @@ { "cell_type": "code", "execution_count": 41, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -2354,10 +2454,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Often when working with data in the real world, the raw input data looks like this and it's useful to build a ``MultiIndex`` from the column values.\n", "This can be done with the ``set_index`` method of the ``DataFrame``, which returns a multiply indexed ``DataFrame``:" @@ -2366,11 +2463,7 @@ { "cell_type": "code", "execution_count": 42, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -2443,20 +2536,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "In practice, I find this type of reindexing to be one of the more useful patterns when encountering real-world datasets." ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## Data Aggregations on Multi-Indices\n", "\n", @@ -2469,11 +2556,7 @@ { "cell_type": "code", "execution_count": 43, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -2573,10 +2656,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Perhaps we'd like to average-out the measurements in the two visits each year. We can do this by naming the index level we'd like to explore, in this case the year:" ] @@ -2584,11 +2664,7 @@ { "cell_type": "code", "execution_count": 44, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -2664,10 +2740,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "By further making use of the ``axis`` keyword, we can take the mean among levels on the columns as well:" ] @@ -2675,11 +2748,7 @@ { "cell_type": "code", "execution_count": 45, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -2731,10 +2800,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Thus in two lines, we've been able to find the average heart rate and temperature measured among all subjects in all visits each year.\n", "This syntax is actually a short cut to the ``GroupBy`` functionality, which we will discuss in [Aggregation and Grouping](03.08-Aggregation-and-Grouping.ipynb).\n", @@ -2743,10 +2809,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## Aside: Panel Data\n", "\n", @@ -2764,10 +2827,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "\n", "< [Handling Missing Data](03.04-Missing-Values.ipynb) | [Contents](Index.ipynb) | [Combining Datasets: Concat and Append](03.06-Concat-And-Append.ipynb) >" @@ -2791,9 +2851,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.1" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } From a69c04e33e1b5cc26bc3e5330046f71b8092aa2e Mon Sep 17 00:00:00 2001 From: "H.C. Chen" Date: Sun, 8 Jul 2018 21:12:52 +0800 Subject: [PATCH 06/13] ee --- notebooks/03.05-Hierarchical-Indexing.ipynb | 146 ++++++++----- notebooks/03.06-Concat-And-Append.ipynb | 228 ++++++++++++++------ notebooks/Untitled.ipynb | 140 ++++++++++++ 3 files changed, 394 insertions(+), 120 deletions(-) create mode 100644 notebooks/Untitled.ipynb diff --git a/notebooks/03.05-Hierarchical-Indexing.ipynb b/notebooks/03.05-Hierarchical-Indexing.ipynb index 4a6298733..af0b51211 100644 --- a/notebooks/03.05-Hierarchical-Indexing.ipynb +++ b/notebooks/03.05-Hierarchical-Indexing.ipynb @@ -71,10 +71,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true - }, + "execution_count": 3, + "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", @@ -952,7 +950,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -1006,40 +1004,40 @@ " \n", " 2013\n", " 1\n", - " 24.0\n", - " 37.2\n", - " 46.0\n", - " 35.8\n", - " 59.0\n", - " 35.9\n", + " 31.0\n", + " 37.4\n", + " 19.0\n", + " 37.6\n", + " 31.0\n", + " 36.5\n", " \n", " \n", " 2\n", - " 33.0\n", - " 35.9\n", + " 34.0\n", + " 36.8\n", + " 35.0\n", " 37.0\n", - " 36.6\n", - " 45.0\n", - " 38.1\n", + " 42.0\n", + " 36.9\n", " \n", " \n", " 2014\n", " 1\n", - " 50.0\n", - " 36.9\n", - " 32.0\n", - " 36.5\n", - " 27.0\n", - " 38.7\n", + " 48.0\n", + " 39.1\n", + " 36.0\n", + " 37.6\n", + " 38.0\n", + " 36.3\n", " \n", " \n", " 2\n", - " 40.0\n", - " 37.0\n", - " 44.0\n", - " 38.1\n", - " 35.0\n", - " 38.2\n", + " 49.0\n", + " 36.2\n", + " 51.0\n", + " 35.7\n", + " 43.0\n", + " 36.9\n", " \n", " \n", "\n", @@ -1049,13 +1047,13 @@ "subject Bob Guido Sue \n", "type HR Temp HR Temp HR Temp\n", "year visit \n", - "2013 1 24.0 37.2 46.0 35.8 59.0 35.9\n", - " 2 33.0 35.9 37.0 36.6 45.0 38.1\n", - "2014 1 50.0 36.9 32.0 36.5 27.0 38.7\n", - " 2 40.0 37.0 44.0 38.1 35.0 38.2" + "2013 1 31.0 37.4 19.0 37.6 31.0 36.5\n", + " 2 34.0 36.8 35.0 37.0 42.0 36.9\n", + "2014 1 48.0 39.1 36.0 37.6 38.0 36.3\n", + " 2 49.0 36.2 51.0 35.7 43.0 36.9" ] }, - "execution_count": 25, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -1172,7 +1170,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 5, "metadata": { "scrolled": true }, @@ -1195,10 +1193,10 @@ "---- see it ----\n", "type HR Temp\n", "year visit \n", - "2013 1 46.0 35.8\n", - " 2 37.0 36.6\n", - "2014 1 32.0 36.5\n", - " 2 44.0 38.1\n" + "2013 1 19.0 37.6\n", + " 2 35.0 37.0\n", + "2014 1 36.0 37.6\n", + " 2 51.0 35.7\n" ] } ], @@ -1216,7 +1214,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 6, "metadata": { "scrolled": true }, @@ -1232,14 +1230,14 @@ "data": { "text/plain": [ "year visit\n", - "2013 1 46.0\n", - " 2 37.0\n", - "2014 1 32.0\n", - " 2 44.0\n", + "2013 1 19.0\n", + " 2 35.0\n", + "2014 1 36.0\n", + " 2 51.0\n", "Name: HR, dtype: float64" ] }, - "execution_count": 41, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -1944,7 +1942,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Working with slices within these index tuples is not especially convenient; trying to create a slice within a tuple will lead to a syntax error:" + "### 以上為何要用 `.loc[]`? \n", + "為了明示你要用哪一種 indexing : numerical or token. \n", + "那我不用行不行? 結果是不行,不懂為什麼。\n", + "反正都用 `.loc[]` 或 `.iloc[]` 就對了,管他為什麼。反正 `df[]` indexing 本來就很難用。" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Working with slices within these index tuples is not especially convenient; trying to create a slice within a tuple will lead to a syntax error:\n", + "\n", + "也就是說─這還用說嘛?─ slicing 必須用 `[方括號]`。" ] }, { @@ -1975,13 +1985,34 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 22, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "既然 .loc[] 都用上了,它是用來解決某些混淆的,那麼\n", + "再加上個 idx[] 也沒啥好說的了,就用吧!\n" + ] + }, { "data": { "text/html": [ "
\n", + "\n", "\n", " \n", " \n", @@ -2011,15 +2042,15 @@ " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", "
2013131.032.035.019.031.0
2014130.039.061.048.036.038.0
\n", @@ -2029,18 +2060,21 @@ "subject Bob Guido Sue\n", "type HR HR HR\n", "year visit \n", - "2013 1 31.0 32.0 35.0\n", - "2014 1 30.0 39.0 61.0" + "2013 1 31.0 19.0 31.0\n", + "2014 1 48.0 36.0 38.0" ] }, - "execution_count": 33, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "%f \\ 既然 .loc[] 都用上了,它是用來解決某些混淆的,那麼\n", + "%f \\ 再加上個 idx[] 也沒啥好說的了,就用吧!\n", + "\n", "idx = pd.IndexSlice\n", - "health_data.loc[idx[:, 1], idx[:, 'HR']]" + "health_data.loc[idx[:, 1], idx[:, 'HR']]\n" ] }, { @@ -2851,7 +2885,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.1" + "version": "3.6.0" } }, "nbformat": 4, diff --git a/notebooks/03.06-Concat-And-Append.ipynb b/notebooks/03.06-Concat-And-Append.ipynb index 93e0aa729..a1501bbed 100644 --- a/notebooks/03.06-Concat-And-Append.ipynb +++ b/notebooks/03.06-Concat-And-Append.ipynb @@ -1,5 +1,34 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "reDef unknown\n", + "reDef \\\n" + ] + } + ], + "source": [ + "import peforth\n", + "peforth.dictate(r'''\n", + "\\ Now we redefine the 'unknown' command that was do-nothing by default\n", + ": unknown ( token -- thing Y|N) // Try to find the unknown token in __main__\n", + " py> getattr(sys.modules['__main__'],pop(),\"Ûnknôwn\") \n", + " py> str(tos())==\"Ûnknôwn\" if drop false else true then ;\n", + " /// here after, when FORTH come accross an unknown token, instead of alerting \n", + " /// it try to find the token in python __main__ module name space.\n", + "\n", + "\\ Redefine \\ command to print the comment line\n", + "code \\ print(nexttoken('\\n')) end-code \n", + "''');\n" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -41,10 +70,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": true - }, + "execution_count": 2, + "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", @@ -60,15 +87,26 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false - }, + "execution_count": 13, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "\n", " \n", " \n", @@ -108,7 +146,7 @@ "2 A2 B2 C2" ] }, - "execution_count": 2, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -128,15 +166,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In addition, we'll create a quick class that allows us to display multiple ``DataFrame``s side by side. The code makes use of the special ``_repr_html_`` method, which IPython uses to implement its rich object display:" + "In addition, we'll create a quick class that allows us to display multiple ``DataFrame``s side by side. The code makes use of the special ``_repr_html_`` method, which IPython uses to implement its rich object display:\n", + "\n", + "開卷有益啊!我正想搞懂 jupyter notebook 怎麼印出漂亮的 DataFrame 呢!" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": true - }, + "execution_count": 14, + "metadata": {}, "outputs": [], "source": [ "class display(object):\n", @@ -170,6 +208,8 @@ "source": [ "## Recall: Concatenation of NumPy Arrays\n", "\n", + "See my Ynote `\"_matrix_ _array_ _list_ 相加 合併 add merge\"`, python list 本來就可以用 + 的。\n", + "\n", "Concatenation of ``Series`` and ``DataFrame`` objects is very similar to concatenation of Numpy arrays, which can be done via the ``np.concatenate`` function as discussed in [The Basics of NumPy Arrays](02.02-The-Basics-Of-NumPy-Arrays.ipynb).\n", "Recall that with it, you can combine the contents of two or more arrays into a single array:" ] @@ -177,9 +217,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -209,11 +247,16 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, + "execution_count": 4, + "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "這個比較好玩,類似「併聯」\n" + ] + }, { "data": { "text/plain": [ @@ -221,15 +264,49 @@ " [3, 4, 3, 4]])" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "%f \\ 這個比較好玩,類似「併聯」\n", "x = [[1, 2],\n", " [3, 4]]\n", - "np.concatenate([x, x], axis=1)" + "np.concatenate([x, x], axis=1) # df.dropna(axis='columns') 好像只有 pandas 才有" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "這個比較平凡,類似「串聯」\n" + ] + }, + { + "data": { + "text/plain": [ + "array([[1, 2],\n", + " [3, 4],\n", + " [1, 2],\n", + " [3, 4]])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%f \\ 這個比較平凡,類似「串聯」\n", + "x = [[1, 2],\n", + " [3, 4]]\n", + "np.concatenate([x, x], axis=0) # df.dropna(axis='rows') 好像只有 pandas 才有" ] }, { @@ -258,9 +335,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -289,15 +364,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "我猜 pd.concat() 為何要用 list `[ser1,ser2]` 的形式,啊!很簡單,它後面還有很多別的 arguments.\n", + "\n", "It also works to concatenate higher-dimensional objects, such as ``DataFrame``s:" ] }, { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -415,7 +490,7 @@ "source": [ "df1 = make_df('AB', [1, 2])\n", "df2 = make_df('AB', [3, 4])\n", - "display('df1', 'df2', 'pd.concat([df1, df2])')" + "display('df1', 'df2', 'pd.concat([df1, df2])') # default axis=0 or 'rows'" ] }, { @@ -429,9 +504,9 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 18, "metadata": { - "collapsed": false + "scrolled": false }, "outputs": [ { @@ -439,6 +514,19 @@ "text/html": [ "
\n", "

df3

\n", + "\n", "
\n", " \n", " \n", @@ -464,6 +552,19 @@ " \n", "
\n", "

df4

\n", + "\n", "
\n", " \n", " \n", @@ -488,7 +589,20 @@ "\n", " \n", "
\n", - "

pd.concat([df3, df4], axis='col')

\n", + "

pd.concat([df3, df4], axis='columns')

\n", + "\n", "
\n", " \n", " \n", @@ -530,13 +644,13 @@ "0 C0 D0\n", "1 C1 D1\n", "\n", - "pd.concat([df3, df4], axis='col')\n", + "pd.concat([df3, df4], axis='columns')\n", " A B C D\n", "0 A0 B0 C0 D0\n", "1 A1 B1 C1 D1" ] }, - "execution_count": 8, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -544,14 +658,14 @@ "source": [ "df3 = make_df('AB', [0, 1])\n", "df4 = make_df('CD', [0, 1])\n", - "display('df3', 'df4', \"pd.concat([df3, df4], axis='col')\")" + "display('df3', 'df4', \"pd.concat([df3, df4], axis='columns')\") # axis=1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We could have equivalently specified ``axis=1``; here we've used the more intuitive ``axis='col'``. " + "We could have equivalently specified ``axis=1``; here we've used the more intuitive ``axis='columns'``. (不能用 'col' or 'cols' 可能是 pandas 版本的差別,反正我也不喜歡 col 這類簡寫,除非它都能接受) " ] }, { @@ -561,15 +675,15 @@ "### Duplicate indices\n", "\n", "One important difference between ``np.concatenate`` and ``pd.concat`` is that Pandas concatenation *preserves indices*, even if the result will have duplicate indices!\n", - "Consider this simple example:" + "Consider this simple example:\n", + "\n", + "_here_" ] }, { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -714,9 +828,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -747,9 +859,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -880,9 +990,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1025,9 +1133,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1176,9 +1282,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1315,9 +1419,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1468,9 +1570,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1627,9 +1727,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.0" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/notebooks/Untitled.ipynb b/notebooks/Untitled.ipynb new file mode 100644 index 000000000..597ecaa2e --- /dev/null +++ b/notebooks/Untitled.ipynb @@ -0,0 +1,140 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "reDef unknown\n", + "reDef \\\n" + ] + } + ], + "source": [ + "import peforth\n", + "peforth.dictate(r'''\n", + "\\ Now we redefine the 'unknown' command that was do-nothing by default\n", + ": unknown ( token -- thing Y|N) // Try to find the unknown token in __main__\n", + " py> getattr(sys.modules['__main__'],pop(),\"Ûnknôwn\") \n", + " py> str(tos())==\"Ûnknôwn\" if drop false else true then ;\n", + " /// here after, when FORTH come accross an unknown token, instead of alerting \n", + " /// it try to find the token in python __main__ module name space.\n", + "\n", + "\\ Redefine \\ command to print the comment line\n", + "code \\ print(nexttoken('\\n')) end-code \n", + "''');\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "?slice\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'a': ['a0', 'a1', 'a2'], 'b': ['b0', 'b1', 'b2'], 'c': ['c0', 'c1', 'c2']}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cols = \"abc\"\n", + "ind = range(3)\n", + "data = {c: [str(c) + str(i) for i in ind]\n", + " for c in cols}\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'a': 11, 'b': 11, 'c': 11}" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "{c:11 for c in cols}" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " {c for c in \"abc\"} tib. \\ ==> {'b', 'a', 'c'} ()\n", + " [c for c in \"abc\"] tib. \\ ==> ['a', 'b', 'c'] ()\n", + " (c for c in \"abc\") tib. \\ ==> . at 0x000001E8DC926048> ()\n", + " (c for c in \"abc\") py> list(pop()) tib. \\ ==> ['a', 'b', 'c'] ()\n" + ] + } + ], + "source": [ + "%f {c for c in \"abc\"} tib.\n", + "%f [c for c in \"abc\"] tib.\n", + "%f (c for c in \"abc\") tib.\n", + "%f (c for c in \"abc\") py> list(pop()) tib.\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 4c6a719fbe074ee7928a62fb0f49066728ce82fa Mon Sep 17 00:00:00 2001 From: "H.C. Chen" Date: Mon, 9 Jul 2018 19:34:55 +0800 Subject: [PATCH 07/13] 7/9 office --- notebooks/03.06-Concat-And-Append.ipynb | 594 +++++++++++++++++++++--- notebooks/03.07-Merge-and-Join.ipynb | 311 ++++++++----- 2 files changed, 728 insertions(+), 177 deletions(-) diff --git a/notebooks/03.06-Concat-And-Append.ipynb b/notebooks/03.06-Concat-And-Append.ipynb index a1501bbed..80131e944 100644 --- a/notebooks/03.06-Concat-And-Append.ipynb +++ b/notebooks/03.06-Concat-And-Append.ipynb @@ -70,7 +70,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -87,7 +87,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -146,7 +146,7 @@ "2 A2 B2 C2" ] }, - "execution_count": 13, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -173,7 +173,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -371,7 +371,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -379,6 +379,19 @@ "text/html": [ "
\n", "

df1

\n", + "\n", "
\n", " \n", " \n", @@ -404,6 +417,19 @@ " \n", "
\n", "

df2

\n", + "\n", "
\n", " \n", " \n", @@ -429,6 +455,19 @@ " \n", "
\n", "

pd.concat([df1, df2])

\n", + "\n", "
\n", " \n", " \n", @@ -482,7 +521,7 @@ "4 A4 B4" ] }, - "execution_count": 7, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -675,14 +714,12 @@ "### Duplicate indices\n", "\n", "One important difference between ``np.concatenate`` and ``pd.concat`` is that Pandas concatenation *preserves indices*, even if the result will have duplicate indices!\n", - "Consider this simple example:\n", - "\n", - "_here_" + "Consider this simple example:" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -690,6 +727,19 @@ "text/html": [ "
\n", "

x

\n", + "\n", "
\n", " \n", " \n", @@ -715,6 +765,19 @@ " \n", "
\n", "

y

\n", + "\n", "
\n", " \n", " \n", @@ -740,6 +803,19 @@ " \n", "
\n", "

pd.concat([x, y])

\n", + "\n", "
\n", " \n", " \n", @@ -793,7 +869,7 @@ "1 A3 B3" ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -827,7 +903,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -858,7 +934,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -866,6 +942,19 @@ "text/html": [ "
\n", "

x

\n", + "\n", "
\n", " \n", " \n", @@ -891,6 +980,19 @@ " \n", "
\n", "

y

\n", + "\n", "
\n", " \n", " \n", @@ -916,6 +1018,19 @@ " \n", "
\n", "

pd.concat([x, y], ignore_index=True)

\n", + "\n", "
\n", " \n", " \n", @@ -969,7 +1084,7 @@ "3 A3 B3" ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -989,7 +1104,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -997,6 +1112,19 @@ "text/html": [ "
\n", "

x

\n", + "\n", "
\n", " \n", " \n", @@ -1022,6 +1150,19 @@ " \n", "
\n", "

y

\n", + "\n", "
\n", " \n", " \n", @@ -1047,6 +1188,19 @@ " \n", "
\n", "

pd.concat([x, y], keys=['x', 'y'])

\n", + "\n", "
\n", " \n", " \n", @@ -1103,7 +1257,7 @@ " 1 A3 B3" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -1127,12 +1281,14 @@ "\n", "In the simple examples we just looked at, we were mainly concatenating ``DataFrame``s with shared column names.\n", "In practice, data from different sources might have different sets of column names, and ``pd.concat`` offers several options in this case.\n", - "Consider the concatenation of the following two ``DataFrame``s, which have some (but not all!) columns in common:" + "Consider the concatenation of the following two ``DataFrame``s, which have some (but not all!) columns in common:\n", + "\n", + "這裡強調的是 column " ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -1140,6 +1296,19 @@ "text/html": [ "
\n", "

df5

\n", + "\n", "
\n", " \n", " \n", @@ -1168,6 +1337,19 @@ " \n", "
\n", "

df6

\n", + "\n", "
\n", " \n", " \n", @@ -1196,6 +1378,19 @@ " \n", "
\n", "

pd.concat([df5, df6])

\n", + "\n", "
\n", " \n", " \n", @@ -1259,7 +1454,7 @@ "4 NaN B4 C4 D4" ] }, - "execution_count": 13, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1270,18 +1465,9 @@ "display('df5', 'df6', 'pd.concat([df5, df6])')" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "By default, the entries for which no data is available are filled with NA values.\n", - "To change this, we can specify one of several options for the ``join`` and ``join_axes`` parameters of the concatenate function.\n", - "By default, the join is a union of the input columns (``join='outer'``), but we can change this to an intersection of the columns using ``join='inner'``:" - ] - }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -1289,6 +1475,19 @@ "text/html": [ "
\n", "

df5

\n", + "\n", "
\n", " \n", " \n", @@ -1317,6 +1516,19 @@ " \n", "
\n", "

df6

\n", + "\n", "
\n", " \n", " \n", @@ -1328,10 +1540,10 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1344,35 +1556,58 @@ "\n", " \n", "
\n", - "

pd.concat([df5, df6], join='inner')

\n", - "
3B3C3D32B2C2D2
4
\n", + "

pd.concat([df5, df6])

\n", + "\n", + "
\n", " \n", " \n", " \n", + " \n", " \n", " \n", + " \n", " \n", " \n", " \n", " \n", " \n", + " \n", " \n", " \n", + " \n", " \n", " \n", " \n", + " \n", " \n", " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", + " \n", " \n", " \n", + " \n", " \n", " \n", "
ABCD
1A1B1C1NaN
2A2B2C2NaN
3B3C32NaNB2C2D2
4NaNB4C4D4
\n", @@ -1387,18 +1622,196 @@ "\n", "df6\n", " B C D\n", - "3 B3 C3 D3\n", + "2 B2 C2 D2\n", + "4 B4 C4 D4\n", + "\n", + "pd.concat([df5, df6])\n", + " A B C D\n", + "1 A1 B1 C1 NaN\n", + "2 A2 B2 C2 NaN\n", + "2 NaN B2 C2 D2\n", + "4 NaN B4 C4 D4" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df5 = make_df('ABC', [1, 2])\n", + "df6 = make_df('BCD', [2, 4])\n", + "display('df5', 'df6', 'pd.concat([df5, df6])')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By default, the entries for which no data is available are filled with NA values.\n", + "To change this, we can specify one of several options for the ``join`` and ``join_axes`` parameters of the concatenate function.\n", + "By default, the join is a union of the input columns (``join='outer'``) 我看就是【column 的聯集】, but we can change this to an intersection of the columns using ``join='inner'`` 也就是【column 的交集】了:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "

df5

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABC
1A1B1C1
2A2B2C2
\n", + "
\n", + "
\n", + "
\n", + "

df6

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BCD
2B2C2D2
4B4C4D4
\n", + "
\n", + "
\n", + "
\n", + "

pd.concat([df5, df6], join='inner')

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BC
1B1C1
2B2C2
2B2C2
4B4C4
\n", + "
\n", + "
" + ], + "text/plain": [ + "df5\n", + " A B C\n", + "1 A1 B1 C1\n", + "2 A2 B2 C2\n", + "\n", + "df6\n", + " B C D\n", + "2 B2 C2 D2\n", "4 B4 C4 D4\n", "\n", "pd.concat([df5, df6], join='inner')\n", " B C\n", "1 B1 C1\n", "2 B2 C2\n", - "3 B3 C3\n", + "2 B2 C2\n", "4 B4 C4" ] }, - "execution_count": 14, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1569,14 +1982,27 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", - "

df1

\n", + "

df1.append(df2)

\n", + "\n", "\n", " \n", " \n", @@ -1596,21 +2022,6 @@ " \n", " \n", " \n", - " \n", - "
A2B2
\n", - "
\n", - "
\n", - "
\n", - "

df2

\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", @@ -1626,7 +2037,20 @@ "\n", " \n", "
\n", - "

df1.append(df2)

\n", + "

df1

\n", + "\n", "
AB
3A3
\n", " \n", " \n", @@ -1646,6 +2070,34 @@ " \n", " \n", " \n", + " \n", + "
A2B2
\n", + "
\n", + "
\n", + "
\n", + "

df2

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1662,31 +2114,41 @@ " " ], "text/plain": [ - "df1\n", + "df1.append(df2)\n", " A B\n", "1 A1 B1\n", "2 A2 B2\n", - "\n", - "df2\n", - " A B\n", "3 A3 B3\n", "4 A4 B4\n", "\n", - "df1.append(df2)\n", + "df1\n", " A B\n", "1 A1 B1\n", "2 A2 B2\n", + "\n", + "df2\n", + " A B\n", "3 A3 B3\n", "4 A4 B4" ] }, - "execution_count": 16, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "注意看! 經過 df1.append(df2) 之後, df1 並沒有被改掉! 照直覺,應該是 append 到 df1 之後,\n", + "結果 pandas 不是這麼想的。所以效率要緊時,還是用 pd.concat 才好。\n" + ] } ], "source": [ - "display('df1', 'df2', 'df1.append(df2)')" + "display('df1.append(df2)','df1', 'df2')\n", + "%f \\ 注意看! 經過 df1.append(df2) 之後, df1 並沒有被改掉! 照直覺,應該是 append 到 df1 之後,\n", + "%f \\ 結果 pandas 不是這麼想的。所以效率要緊時,還是用 pd.concat 才好。\n" ] }, { diff --git a/notebooks/03.07-Merge-and-Join.ipynb b/notebooks/03.07-Merge-and-Join.ipynb index aa91fa060..c5de2c797 100644 --- a/notebooks/03.07-Merge-and-Join.ipynb +++ b/notebooks/03.07-Merge-and-Join.ipynb @@ -1,5 +1,34 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "reDef unknown\n", + "reDef \\\n" + ] + } + ], + "source": [ + "import peforth\n", + "peforth.dictate(r'''\n", + "\\ Now we redefine the 'unknown' command that was do-nothing by default\n", + ": unknown ( token -- thing Y|N) // Try to find the unknown token in __main__\n", + " py> getattr(sys.modules['__main__'],pop(),\"Ûnknôwn\") \n", + " py> str(tos())==\"Ûnknôwn\" if drop false else true then ;\n", + " /// here after, when FORTH come accross an unknown token, instead of alerting \n", + " /// it try to find the token in python __main__ module name space.\n", + "\n", + "\\ Redefine \\ command to print the comment line\n", + "code \\ print(nexttoken('\\n')) end-code \n", + "''');\n" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -39,10 +68,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": true - }, + "execution_count": 3, + "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", @@ -102,16 +129,27 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false - }, + "execution_count": 6, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "

df1

\n", + "\n", "
AB
3A3
\n", " \n", " \n", @@ -147,6 +185,19 @@ " \n", "
\n", "

df2

\n", + "\n", "
\n", " \n", " \n", @@ -197,7 +248,7 @@ "3 Sue 2014" ] }, - "execution_count": 2, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -219,15 +270,26 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, + "execution_count": 7, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -274,7 +336,7 @@ "3 Sue HR 2014" ] }, - "execution_count": 3, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -312,16 +374,27 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, + "execution_count": 9, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "

df3

\n", + "\n", "
\n", " \n", " \n", @@ -362,6 +435,19 @@ " \n", "
\n", "

df4

\n", + "\n", "
\n", " \n", " \n", @@ -392,6 +478,19 @@ " \n", "
\n", "

pd.merge(df3, df4)

\n", + "\n", "
\n", " \n", " \n", @@ -458,15 +557,24 @@ "3 Sue HR 2014 Steve" ] }, - "execution_count": 4, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "【自動】根據 df4 查出 df3 未知的 supervisor 而得到結果!\n" + ] } ], "source": [ "df4 = pd.DataFrame({'group': ['Accounting', 'Engineering', 'HR'],\n", " 'supervisor': ['Carly', 'Guido', 'Steve']})\n", - "display('df3', 'df4', 'pd.merge(df3, df4)')" + "display('df3', 'df4', 'pd.merge(df3, df4)')\n", + "\n", + "%f \\ 【自動】根據 df4 查出 df3 未知的 supervisor 而得到結果!" ] }, { @@ -496,16 +604,27 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, + "execution_count": 10, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "

df1

\n", + "\n", "
\n", " \n", " \n", @@ -541,6 +660,19 @@ " \n", "
\n", "

df5

\n", + "\n", "
\n", " \n", " \n", @@ -586,6 +718,19 @@ " \n", "
\n", "

pd.merge(df1, df5)

\n", + "\n", "
\n", " \n", " \n", @@ -678,7 +823,7 @@ "7 Sue HR organization" ] }, - "execution_count": 5, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -727,9 +872,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -897,9 +1040,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1064,9 +1205,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1140,9 +1279,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1255,9 +1392,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1417,9 +1552,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1578,9 +1711,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1754,9 +1885,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1883,9 +2012,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1936,9 +2063,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -2079,9 +2204,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -2232,9 +2355,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -2397,9 +2518,7 @@ { "cell_type": "code", "execution_count": 18, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -2576,9 +2695,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Following are shell commands to download the data\n", @@ -2597,9 +2714,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -2791,9 +2906,7 @@ { "cell_type": "code", "execution_count": 21, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -2886,9 +2999,7 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -2920,9 +3031,7 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -3015,9 +3124,7 @@ { "cell_type": "code", "execution_count": 24, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -3045,9 +3152,7 @@ { "cell_type": "code", "execution_count": 25, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -3084,9 +3189,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -3183,9 +3286,7 @@ { "cell_type": "code", "execution_count": 27, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -3218,9 +3319,7 @@ { "cell_type": "code", "execution_count": 28, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -3248,9 +3347,7 @@ { "cell_type": "code", "execution_count": 29, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -3348,9 +3445,7 @@ { "cell_type": "code", "execution_count": 30, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -3448,9 +3543,7 @@ { "cell_type": "code", "execution_count": 31, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "data2010.set_index('state', inplace=True)\n", @@ -3460,9 +3553,7 @@ { "cell_type": "code", "execution_count": 32, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -3499,9 +3590,7 @@ { "cell_type": "code", "execution_count": 33, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -3560,9 +3649,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.0" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } From ef3af4be98666a6f702e943cacd5eb0b87540e81 Mon Sep 17 00:00:00 2001 From: "H.C. Chen" Date: Tue, 10 Jul 2018 19:03:54 +0800 Subject: [PATCH 08/13] 7/10 --- notebooks/00.00-Preface.ipynb | 65 ++- .../02.02-The-Basics-Of-NumPy-Arrays.ipynb | 216 +++------ .../02.03-Computation-on-arrays-ufuncs.ipynb | 121 ++--- notebooks/02.07-Fancy-Indexing.ipynb | 434 ++++++++++++++---- notebooks/Untitled.ipynb | 154 ++++++- 5 files changed, 634 insertions(+), 356 deletions(-) diff --git a/notebooks/00.00-Preface.ipynb b/notebooks/00.00-Preface.ipynb index e9e8d99a8..154dd6d48 100644 --- a/notebooks/00.00-Preface.ipynb +++ b/notebooks/00.00-Preface.ipynb @@ -1,5 +1,66 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# README.md of the \"Python Data Science Handbook\"\n", + "\n", + "[![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/jakevdp/PythonDataScienceHandbook/master?filepath=notebooks%2FIndex.ipynb)\n", + "\n", + "This repository contains the entire [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do), in the form of (free!) Jupyter notebooks.\n", + "\n", + "![cover image](notebooks/figures/PDSH-cover.png)\n", + "\n", + "## How to Use this Book\n", + "\n", + "- Read the book in its entirety online at https://jakevdp.github.io/PythonDataScienceHandbook/\n", + "\n", + "- Run the code using the Jupyter notebooks available in this repository's [notebooks](notebooks) directory.\n", + "\n", + "- Launch a live notebook server with these notebooks using [binder](https://beta.mybinder.org/): [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/jakevdp/PythonDataScienceHandbook/master?filepath=notebooks%2FIndex.ipynb)\n", + "\n", + "- Buy the printed book through [O'Reilly Media](http://shop.oreilly.com/product/0636920034919.do)\n", + "\n", + "## About\n", + "\n", + "The book was written and tested with Python 3.5, though other Python versions (including Python 2.7) should work in nearly all cases.\n", + "\n", + "The book introduces the core libraries essential for working with data in Python: particularly [IPython](http://ipython.org), [NumPy](http://numpy.org), [Pandas](http://pandas.pydata.org), [Matplotlib](http://matplotlib.org), [Scikit-Learn](http://scikit-learn.org), and related packages.\n", + "Familiarity with Python as a language is assumed; if you need a quick introduction to the language itself, see the free companion project,\n", + "[A Whirlwind Tour of Python](https://github.com/jakevdp/WhirlwindTourOfPython): it's a fast-paced introduction to the Python language aimed at researchers and scientists.\n", + "\n", + "See [Index.ipynb](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/Index.ipynb) for an index of the notebooks available to accompany the text.\n", + "\n", + "## Software\n", + "\n", + "The code in the book was tested with Python 3.5, though most (but not all) will also work correctly with Python 2.7 and other older Python versions.\n", + "\n", + "The packages I used to run the code in the book are listed in [requirements.txt](requirements.txt) (Note that some of these exact version numbers may not be available on your platform: you may have to tweak them for your own use).\n", + "To install the requirements using [conda](http://conda.pydata.org), run the following at the command-line:\n", + "\n", + "```\n", + "$ conda install --file requirements.txt\n", + "```\n", + "\n", + "To create a stand-alone environment named ``PDSH`` with Python 3.5 and all the required package versions, run the following:\n", + "\n", + "```\n", + "$ conda create -n PDSH python=3.5 --file requirements.txt\n", + "```\n", + "\n", + "You can read more about using conda environments in the [Managing Environments](http://conda.pydata.org/docs/using/envs.html) section of the conda documentation.\n", + "\n", + "\n", + "## License\n", + "\n", + "### Code\n", + "The code in this repository, including all code samples in the notebooks listed above, is released under the [MIT license](LICENSE-CODE). Read more at the [Open Source Initiative](https://opensource.org/licenses/MIT).\n", + "\n", + "### Text\n", + "The text content of the book is released under the [CC-BY-NC-ND license](LICENSE-TEXT). Read more at [Creative Commons](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode).\n" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -190,9 +251,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.0" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/notebooks/02.02-The-Basics-Of-NumPy-Arrays.ipynb b/notebooks/02.02-The-Basics-Of-NumPy-Arrays.ipynb index c8bde4cb7..75b1ff517 100644 --- a/notebooks/02.02-The-Basics-Of-NumPy-Arrays.ipynb +++ b/notebooks/02.02-The-Basics-Of-NumPy-Arrays.ipynb @@ -63,9 +63,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -86,9 +84,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -116,9 +112,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -142,9 +136,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -185,9 +177,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -207,9 +197,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -229,9 +217,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -258,9 +244,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -280,9 +264,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -309,9 +291,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -333,9 +313,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -355,9 +333,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -377,9 +353,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -406,9 +380,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -439,9 +411,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -489,9 +459,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -512,9 +480,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -534,9 +500,7 @@ { "cell_type": "code", "execution_count": 18, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -556,9 +520,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -578,9 +540,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -600,9 +560,7 @@ { "cell_type": "code", "execution_count": 21, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -631,9 +589,7 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -653,9 +609,7 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -685,9 +639,7 @@ { "cell_type": "code", "execution_count": 24, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -709,9 +661,7 @@ { "cell_type": "code", "execution_count": 25, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -732,9 +682,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -763,9 +711,7 @@ { "cell_type": "code", "execution_count": 27, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -797,9 +743,7 @@ { "cell_type": "code", "execution_count": 28, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -816,9 +760,7 @@ { "cell_type": "code", "execution_count": 29, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -842,9 +784,7 @@ { "cell_type": "code", "execution_count": 30, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -872,9 +812,7 @@ { "cell_type": "code", "execution_count": 31, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -900,9 +838,7 @@ { "cell_type": "code", "execution_count": 32, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -928,9 +864,7 @@ { "cell_type": "code", "execution_count": 33, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -949,9 +883,7 @@ { "cell_type": "code", "execution_count": 34, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -986,9 +918,7 @@ { "cell_type": "code", "execution_count": 35, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1014,9 +944,7 @@ { "cell_type": "code", "execution_count": 36, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1035,9 +963,7 @@ { "cell_type": "code", "execution_count": 37, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1067,9 +993,7 @@ { "cell_type": "code", "execution_count": 38, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1100,9 +1024,7 @@ { "cell_type": "code", "execution_count": 39, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1125,9 +1047,7 @@ { "cell_type": "code", "execution_count": 40, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1148,9 +1068,7 @@ { "cell_type": "code", "execution_count": 41, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1173,9 +1091,7 @@ { "cell_type": "code", "execution_count": 42, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1224,9 +1140,7 @@ { "cell_type": "code", "execution_count": 43, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1255,9 +1169,7 @@ { "cell_type": "code", "execution_count": 44, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1282,9 +1194,7 @@ { "cell_type": "code", "execution_count": 45, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "grid = np.array([[1, 2, 3],\n", @@ -1294,9 +1204,7 @@ { "cell_type": "code", "execution_count": 46, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1320,9 +1228,7 @@ { "cell_type": "code", "execution_count": 47, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1351,9 +1257,7 @@ { "cell_type": "code", "execution_count": 48, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1380,9 +1284,7 @@ { "cell_type": "code", "execution_count": 49, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1422,9 +1324,7 @@ { "cell_type": "code", "execution_count": 50, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1451,9 +1351,7 @@ { "cell_type": "code", "execution_count": 51, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1477,9 +1375,7 @@ { "cell_type": "code", "execution_count": 52, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1501,9 +1397,7 @@ { "cell_type": "code", "execution_count": 53, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1559,9 +1453,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.0" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/notebooks/02.03-Computation-on-arrays-ufuncs.ipynb b/notebooks/02.03-Computation-on-arrays-ufuncs.ipynb index e4be4920c..eb042911d 100644 --- a/notebooks/02.03-Computation-on-arrays-ufuncs.ipynb +++ b/notebooks/02.03-Computation-on-arrays-ufuncs.ipynb @@ -58,9 +58,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -99,9 +97,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -143,9 +139,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -171,9 +165,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -198,9 +190,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -227,9 +217,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -280,9 +268,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -317,9 +303,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -347,9 +331,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -376,9 +358,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -427,9 +407,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -457,9 +435,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -479,9 +455,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -508,9 +482,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -541,9 +513,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "theta = np.linspace(0, np.pi, 3)" @@ -559,9 +529,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -592,9 +560,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -627,9 +593,7 @@ { "cell_type": "code", "execution_count": 18, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -661,9 +625,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -694,9 +656,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -737,9 +697,7 @@ { "cell_type": "code", "execution_count": 21, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "from scipy import special" @@ -748,9 +706,7 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -773,9 +729,7 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -828,9 +782,7 @@ { "cell_type": "code", "execution_count": 24, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -857,9 +809,7 @@ { "cell_type": "code", "execution_count": 25, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -899,9 +849,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -929,9 +877,7 @@ { "cell_type": "code", "execution_count": 27, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -958,9 +904,7 @@ { "cell_type": "code", "execution_count": 28, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -980,9 +924,7 @@ { "cell_type": "code", "execution_count": 29, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1020,7 +962,6 @@ "cell_type": "code", "execution_count": 30, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [ @@ -1096,9 +1037,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.0" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/notebooks/02.07-Fancy-Indexing.ipynb b/notebooks/02.07-Fancy-Indexing.ipynb index b4aa99df8..863d4c67d 100644 --- a/notebooks/02.07-Fancy-Indexing.ipynb +++ b/notebooks/02.07-Fancy-Indexing.ipynb @@ -1,5 +1,34 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "reDef unknown\n", + "reDef \\\n" + ] + } + ], + "source": [ + "import peforth\n", + "peforth.dictate(r'''\n", + "\\ Now we redefine the 'unknown' command that was do-nothing by default\n", + ": unknown ( token -- thing Y|N) // Try to find the unknown token in __main__\n", + " py> getattr(sys.modules['__main__'],pop(),\"Ûnknôwn\") \n", + " py> str(tos())==\"Ûnknôwn\" if drop false else true then ;\n", + " /// here after, when FORTH come accross an unknown token, instead of alerting \n", + " /// it try to find the token in python __main__ module name space.\n", + "\n", + "\\ Redefine \\ command to print the comment line\n", + "code \\ print(nexttoken('\\n')) end-code \n", + "''');\n" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -48,10 +77,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": false - }, + "execution_count": 3, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -63,7 +90,7 @@ ], "source": [ "import numpy as np\n", - "rand = np.random.RandomState(42)\n", + "rand = np.random.RandomState(42) # 42 is the seed \n", "\n", "x = rand.randint(100, size=10)\n", "print(x)" @@ -79,9 +106,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -107,23 +132,30 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, + "execution_count": 6, + "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[71 86 60]\n", + "這就是 fancy indexing 了!\n" + ] + }, { "data": { "text/plain": [ "array([71, 86, 60])" ] }, - "execution_count": 3, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "%f x :> [[3,7,4]] . cr \\ 這就是 fancy indexing 了!\n", "ind = [3, 7, 4]\n", "x[ind]" ] @@ -132,15 +164,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "When using fancy indexing, the shape of the result reflects the shape of the *index arrays* rather than the shape of the *array being indexed*:" + "When using fancy indexing, the shape of the result reflects the shape of the *index arrays* rather than the shape of the *array being indexed*:\n", + "\n", + "這真是 fancy 了!" ] }, { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -169,10 +201,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, + "execution_count": 49, + "metadata": {}, "outputs": [ { "data": { @@ -182,7 +212,7 @@ " [ 8, 9, 10, 11]])" ] }, - "execution_count": 5, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } @@ -201,10 +231,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": false - }, + "execution_count": 11, + "metadata": {}, "outputs": [ { "data": { @@ -212,7 +240,7 @@ "array([ 2, 5, 11])" ] }, - "execution_count": 6, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -235,9 +263,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -256,6 +282,50 @@ "X[row[:, np.newaxis], col]" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "X(橫,橫) 取的是幾個交叉點; X(豎,橫) 取的是幾條線組成 matrix; \n", + "X(豎,豎) 會怎樣?結果類似 X(橫,橫) 也是取交叉點,不過是豎的排下來。\n", + "X(橫,豎) 會怎樣?結果類似 X(豎,橫) 但【轉置】了。" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 2],\n", + " [ 5],\n", + " [11]])" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "array([[ 2, 6, 10],\n", + " [ 1, 5, 9],\n", + " [ 3, 7, 11]])" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X[row[:, np.newaxis], col[:, np.newaxis]]\n", + "X[row, col[:, np.newaxis]]" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -267,9 +337,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -288,6 +356,109 @@ "row[:, np.newaxis] * col" ] }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 1, 2])" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "array([2, 1, 3])" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "array([0, 1, 6])" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "array([0, 1, 2])" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "array([0, 1, 6])" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "array([[0],\n", + " [1],\n", + " [2]])" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "array([[0, 0, 0],\n", + " [2, 1, 3],\n", + " [4, 2, 6]])" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "array([[0, 0, 0],\n", + " [2, 1, 3],\n", + " [4, 2, 6]])" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "row\n", + "col\n", + "row * col\n", + "row.T # 我覺得 row.T 就是橫的轉置成豎的,結果不然。\n", + "row.T * col # 橫的轉置成豎的,只能用 reshape 或 np.newaxis. 不能用 .T [ ]搞不懂何以不然? \n", + "row.reshape(3,1)\n", + "row.reshape(3,1) * col\n", + "row[:,np.newaxis] * col # np.newaxis 似乎好在可以在某個 axis 上加個維度" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -306,10 +477,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false - }, + "execution_count": 66, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -334,10 +503,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": false - }, + "execution_count": 67, + "metadata": {}, "outputs": [ { "data": { @@ -345,7 +512,7 @@ "array([10, 8, 9])" ] }, - "execution_count": 10, + "execution_count": 67, "metadata": {}, "output_type": "execute_result" } @@ -363,10 +530,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": false - }, + "execution_count": 68, + "metadata": {}, "outputs": [ { "data": { @@ -375,7 +540,7 @@ " [10, 8, 9]])" ] }, - "execution_count": 11, + "execution_count": 68, "metadata": {}, "output_type": "execute_result" } @@ -388,15 +553,32 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "And we can combine fancy indexing with masking:" + "And we can combine fancy indexing with masking:\n", + "\n", + "上面討論過了的 ndarray `[,]` indexing, 現在只是把 mask 也帶進去" ] }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": false - }, + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "%f X type . cr" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, "outputs": [ { "data": { @@ -406,7 +588,7 @@ " [ 8, 10]])" ] }, - "execution_count": 12, + "execution_count": 69, "metadata": {}, "output_type": "execute_result" } @@ -416,6 +598,82 @@ "X[row[:, np.newaxis], mask]" ] }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 0, 1, 2, 3],\n", + " [ 4, 5, 6, 7],\n", + " [ 8, 9, 10, 11]])" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "array([0, 1, 2])" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "array([ True, False, True, False], dtype=bool)" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "array([ 0, 10])" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X\n", + "row\n", + "mask\n", + "X[[0,2], mask] # 直接用 row 所表達的是 fancy indexing 的一種,上面我有完整討論過了。" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 4, 5, 6, 7],\n", + " [ 8, 9, 10, 11]])" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X[[1,2]]" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -435,10 +693,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": false - }, + "execution_count": 42, + "metadata": {}, "outputs": [ { "data": { @@ -446,7 +702,7 @@ "(100, 2)" ] }, - "execution_count": 13, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } @@ -468,16 +724,14 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "collapsed": false - }, + "execution_count": 46, + "metadata": {}, "outputs": [ { "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAecAAAFVCAYAAADVDycqAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3X9wlNWh//HPYkz4FciGb7Rk2tnmOg3fP2z1CzJjtQyS\nSgqKF9Q0gIDWOmkro1WQJqJeUCk34/YOMh2Bi3CxGItQGbjBfi+iQoYp6tXUb0OldxBFZMS1GEgW\nSKSEkOf7xyaQxPzY3Zzd5+w+79cMM26ye55zkoyf5/x8fI7jOAIAANYY5HYFAABAV4QzAACWIZwB\nALAM4QwAgGUIZwAALEM4AwBgmQyThbW2tqqiokKff/65MjIytGzZMhUUFJi8BAAAac9oz3nv3r1q\na2vT5s2bNX/+fD377LMmiwcAwBOMhvO3v/1tXbhwQY7j6MyZM7r88stNFg8AgCcYHdYeNmyYjh07\npilTpigcDmvt2rUmiwcAwBOM9px/97vfacKECdq1a5d27NihiooKtbS09Pp+Tg4FAODrjPacR44c\nqYyMSJHZ2dlqbW1VW1tbr+/3+Xyqrz9jsgopJS8vm/bTfrer4Qovt12i/bQ/u9/3GA3ne+65R489\n9pjmzJmj1tZWPfLIIxo8eLDJSwAAkPaMhvPQoUO1cuVKk0UCAOA5HEICAIBlCGcAACxDOAMAYBnC\nGQAAyxDOAABYhnAGAMAyhDMAAJYhnAEAsAzhDACAZQhnAAAsQzgDAGAZwhkAAMsQzgAAWIZwBgDA\nMoQzAACWIZwBALAM4QwAgGUIZwAALEM4AwBgGcIZAADLEM4AAFiGcAYAwDKEMwAAliGcAQCwDOEM\nAIBlCGcAACyTYbrA559/Xnv27NH58+d111136c477zR9CQAA0prRcH7vvff0l7/8RZs3b9ZXX32l\nDRs2mCweAABPMBrO+/btU2FhoebPn6/m5maVl5ebLB4AkCYaGsKqqKjR0aMjFAicUjBYJL8/J+ll\n2MpoODc2NioUCmnt2rX67LPPdP/99+u1114zeQkAQBqoqKhRdfU8ST7V1TmSqrRu3e1JL8NWRsM5\nJydHV111lTIyMlRQUKCsrCw1NDQoNze318/k5WWbrELKof2036u83HaJ9odCfkm+9lc+hUL+mH8m\nJsqwldFwHjdunKqqqvSTn/xEx48f1z/+8Q/5/f4+P1Nff8ZkFVJKXl427af9blfDFV5uu0T78/Ky\nlZ/fIMlRJFwd5ec3xvwzMVGGG6K5gTAazjfddJP+/Oc/q6SkRI7jaOnSpfL5fP1/EADgKcFgkaSq\n9vni0woGJ7lShq18juM4blYgFe5yEoW7Z9rv1fZ7ue0S7af9/fecOYQEAADLEM4AAFiGcAYAwDKE\nMwAAliGcAQCwDOEMAIBlCGcAACxDOAMAYBnCGQAAyxDOAABYhnAGAMAyhDMAAJYhnAEAsAzhDACA\nZQhnAAAsQzgDAGAZwhkAAMsQzgAAWIZwBgDAMoQzAACWIZwBALAM4QwAgGUIZwAALEM4AwBgGcIZ\nAADLEM4AAFgmIeF88uRJ3XTTTTpy5EgiigcAIK0ZD+fW1lYtXbpUgwcPNl00AACeYDycn3nmGc2e\nPVtXXHGF6aIBAPAEo+G8bds2jRo1SjfeeKMcxzFZNAAAnuFzDKbo3Llz5fP5JEkHDx5UQUGB1qxZ\no1GjRpm6BAAAac9oOHc2b948Pf300yooKOjzffX1ZxJx+ZSQl5dN+2m/29VwhZfbLtF+2p/d73sS\ntpWqowcNAABik5Gogl988cVEFQ0AQFrjEBIAACxDOAMAYJmEDWsDACIaGsKqqKjR0aMjFAic0oYN\n0yVd5na1YDHCGQASrKKiRtXV8yT5VFfn6P77N+u556a5XS1YjHAGAEO695CDwSL5/Tk6enSEpI4d\nLD4dOTLczWoiBRDOAGBI9x6yVKV1625XIHCq/bVPkqOCgiZ3KwrrEc4AYEj3HvLeva0qLt6t0aNb\nNHXqv+uLL0YrEDitNWv+WRcuuFlT2I5wBgBDuveQw+HBqqubobo6R9OnV+n1138oScrN9fYJWegf\n4QwAhgSDRZKqdPToCH366UcKh8vav+Nr71UD0WGfMwAY4vfnaN262/X66z/UxIlXSBrZ/h1HgcBp\n49draAirrGy7iot3q6xsmxobw8avAXfQcwaABOjciw4ETisYnGT8Gr0tQEPqI5wBIEq9bZXqSUcv\nOpG6L0Bj6Dx9EM4AECXbeqrdF6DFOnQey80GkotwBoAo2dZTHejQefebjdraStXUzCOgLUA4A0CU\nBtpTNW2gQ+fdbzZCoatVXl7DvLUFCGcAiFIyFnklU/ebDanZ9dEARBDOABClZCzySqZgsEi1tZUK\nha6W1CxpigKBV92uFsQ+ZwDwLL8/RzU18zR9eljXXjtE06e/mvKjAemCnjMAeFi6jQakC8IZAJKk\nY+tSKORXfn4DW5fQK8IZAJKk89alyAIsTvRCz5hzBoAksW2fNOxFOANAkgQCpxTpMUs27JOGvRjW\nBpDyYjmG0q0jKxsawmppOa+cnBfk853U9dcPVzA4LeHXRWoinAGkvFjOvHbrfOyKihrt3HmfOuab\n33nn31RevsfozQFnZacPwhlAyotlLteted/u1w2H/7eqq6fJ5M2BbQ/mQPyMzjm3traqvLxcc+bM\nUWlpqfbs2WOyeADoUSxzuW7N+3a/rtQk0zcHLDhLH0Z7zjt27JDf71cwGNSpU6c0Y8YMFRUVmbwE\nAHxNLGdeu3U+dsd19+5tVTg8WNItMn1zYNuDORA/n+M4Tv9vi87Zs2flOI6GDh2qxsZGlZaW6o03\n3ujzM/X1Z0xdPuXk5WXTftrvdjVcEWvb02kutbExrH/5l336n/9pU0PDUY0aVah/+qdmI21qbAyr\nvLymy42HbQvjJG//7UuR9vfHaM95yJAhkqSmpiY99NBDWrBggcniAXiULXOpHYF2+PBQNTR8qNzc\nb+uqq1pjCja/P0dbtszWjBkv6sCBxQqFfPrgAzNtiuUoTlt+puiZ8QVhX3zxhR544AHNnTtXt9xy\nS7/vj+YOIp3RftrvVbG0PRTyq+tzh/1Rf/7kybDmz9+pI0eGq6DgjNasuUW5ufH1EB944I9dTvgK\nhTbrwIG7lZW1WVu2zI6prIG0yQS3r+/lv/1oGA3nEydO6L777tOSJUt0/fXXR/UZrw9t0H7a70Wx\ntj0/v0GRRVSRUMzPb+z38x293Mgcb5akCaqtHalz5+LvIR46NESdA00aLsmnQ4eGxNSevLzsuNpk\nkpvX9/LfvuTCsPbatWt1+vRprV69WqtWrZLP59P69euVmZlp8jIAPCaeRVxfP8d6s6TZA1rB3H3B\nVWTFdXwLr9xamGbL9dE3owvC4uH1uyfaT/u9KBltLy7erbq6GZ2+8qqkaZo+Pf6ec8eCq08+GaqT\nJw8pNzegq6660OfCq554+Xcv0f6k95wBwBbde7k5OQc1cWLjgHqIXRdc/chENYEeEc4A0tLXh21n\npez2K3gP4QwgLcWyrQiwDeEMwGrRHpYx0EM1bDjoxIY6wA6EMwCrRXtYxkAP1bDhUA4b6gA7GH3w\nBQCYFu3DHAb60AcbHhphQx1gB8IZgNWifYrUQJ825dbTqmyrA+zAsDYAK/Q23xrtYRkDPVTDhkM5\nElUH5rJTD4eQuIiN+LTfq+3vqe1lZdu7nOg1kMNCehNPSJl42EV3yf7dJ+NnGwsv/+1LHEICIIVE\n5ldPSdopabj27v27GhvDRnt48Sy46n4MaMfDLnr7rI29VOayUw9zzgCsEJlv/S9JsyTdpnD4Vyov\nrzF6jXhCqvtnOh520dtnO8K8rm6GqqvvNt6GeDCXnXroOQOwQjBYpL1731A4nLgeXvcjPaMJqVgf\ndmFjL9WG+XTEhnAGYAW/P0cTJ16m6urYwjMW8YRUx2e6PuyiqtfPxnIDkKwhcE5LSz0sCHMRiyJo\nv1fb31vbP/nkqO64Y4caG78pv/+Ytm//ZxUUBGIq2+05344nV3W+Aeh+/Y7227ZQK1m8/LcvsSAM\nQIqprPx/CoUWS/Lp7FlH//qvVVq3LrZw7m3R10BWasfymVh6qTYOgcMOhDOApDt5Mqyysh1fCz0T\nYdVbGQNdqZ2I4zTjmQOHNxDOAJJu/vydPYaeibDqrYz4V2onbnsXC7XQG8IZQNIdORLZjhRxKShN\nhNXixeNUW1vZPm/9mR57bLqkgazU/i9JsyX5FA5PU3m5ud4zC7XQG8IZQNIVFJxRbe3Xg9JEWPU2\nbx3vSu1Eb+8CekI4A0i6NWtu0blzVe1HYh7S4cMBlZVtM7Kyurfh63iCPxnbu4CeEM4Aki43NxKU\nZWXbdeDAYoVCPh04YGbB1Te+Ua/IYSGRMB09+sSAymNeGG4gnAG4xuRWoo5tT2+//YWklyVlSzoj\n6fyA6si8MNzA2doABqyhIayysu0qLt6tsrJtamwMR/U5k2c+d2x7On36/0i6S9Jtku7SO+9kxVwv\nwG30nAEMWLz7gU0MGXf0mF9/XYr0ws+o87B2ODxYdXUzErJPGUgUwhnAgMU7PG1iyPjSjcHLioTy\nLZJeVk7OPySdVDhcFnO9ALcxrA1gwNx8JOGlG4NbJG3WkCE7NH16q959d7ImTrxC0khX6gUMBD1n\nAAPm5ormS4eL5EiapeLiS0PXrLRGqjIazo7j6Mknn9SHH36ozMxMLV++XN/61rdMXgKAhdxc0dxX\nALPSGqnKaDi/+eabamlp0ebNm7V//35VVlZq9erVJi8BAF0QwEhHRuec33//fU2YMEGSdM011+jA\ngQMmiwcAwBOM9pybmpqUnX3pIdIZGRlqa2vToEG93wNE89DpdEb7ab/XnDwZ1syZL+vIkeEqKDij\nNWtuUW5uTpfvz5+/s9fvpwsv/u4783r7+2M0nIcPH67m5uaLr/sLZkmqrz9jsgopJS8vm/bTfrer\nkXRlZTsu7omurXV07lzXvcf9fT8dePV334H2939jYnRYe+zYsdq7d68kqa6uToWFhSaLB5AG+tsT\nbfJIz3jFe+IZYIrRnvPkyZP11ltvadasWZKkyspKk8UDSAPdn6v8jW+cUFnZ9vbV1qc0enRLzM9d\nNi3eE88AU4yGs8/n01NPPWWySAAGdBxx2RGAJh7NGK9gsEhZWZt16NAQBQKn1dJyXtXV96kjCK+8\ncommTv0PffHF/3Jtb7INvXd4G4eQAB6wYMH/1c6dIyRdprq6DLW0/FEbN851pS5+f462bJl9cc6x\nuHi3Ogfh8ePjlZkZ1uuv/9CV+klf791zshiSjXAG0kRfveN33jkj6efqCJt33vk3N6vaRfcglJpd\n76lyshjcRjgDaaLvedJR6tw7jbzuXTKHwYPBItXWVioUulpSs6QpCgReTci1osXBJnAb4Qykib7m\nSb///QvaufNS7/T732/rs6xkLojy+3NUUzNP5eUdNwOvRt1TtWkuHTCJcAbSRF/zpCtXTlFmZudh\n2h/1WVayF0T5/Tl65plJF4O2vHxPVEHLqmqkK8IZSBMmHwDhxoKoeIKWVdVIV4QzkCZMzpN2DvrR\no/+ulpYMFRfvHtDQcech6MLCr7Rs2YQu5cQTtKyqRroinIEUkqw51s5BX1a23cjQcfeecfdjOeMJ\nWhOrqpm3ho0IZyCFuDHHamrouL9y4glaE6MFzFvDRoQzYIloenBuzLGaGjrurxy3ti8xbw0bEc6A\nJaLpwbkxx2rqQI7O5RQWntWyZXYc7MG8NWxEOAOWiKYHl6yTqxIxD9u5Z2zTIwM5DQw2IpyBGCRy\n8VA0PbhkDf1GevG3SXpNdXV+1da+qJqau9NyoRSngcFGhDMQg3gXD/UU6t0fuB4MFuncuf/Qf//3\nIEkn1dIyTI2NYVcCMdJrf03SLEk+hUK3qbychVJAshDOQAziXTzUU6j/53/e3eU9fn+OsrIyFQ5H\n3rdzp6PMTHcCMdKL94uFUoA7BrldASCVBAKnFHlykhTL4qFoQ92WlcPBYJHy8z9QPG0FMHD0nIEY\nxLt4KNoVwbasHI48jOJulZdH31YO8wDMIZyBGMS7eCjaUE/0yuFYAjTWtnKYB2AO4QwkQbRBl+iV\nw4kMUFuG5IF0wJwzMAANDWGVlW1XcfFulZVtU2Nj2O0q9SmRARrvfDyAr6PnDAyA20O5sc7zJnJO\nm8M8AHMIZ2AA3B7KjfXmIJEB6vfn6JlnJl28WSgv38OiMCBOhDMwAG6vro715iCV57QBLyGcgQFw\neyjX7ZuD7tweSQDSBeEMxCkR+3pjLTPWm4NYy7dpThvwEsIZiFNvQ7gDCe1Yh4UTvRfZpjltwEuM\nhXNTU5MWLVqk5uZmnT9/Xo8++qiuvfZaU8UD1ultCHcg866JHhaOtfzu79+7t1XFxbt7vengCU+A\nGcb2Ob/wwgu64YYbVFVVpcrKSj399NOmigas1Nu+3oEEbLR7hePdXx3rXuTu7w+HB6uuboaqq+9W\neXlNVNcEEDtjPed7771XmZmZkqTW1lZlZWWZKhqwUm+PeBzIvGvnYeHRo/+ulpaMHnuq8fbOYx12\n7vz+Tz/9SOFwWft3Lt10cKY2kABOHF555RVn2rRpXf598MEHjuM4zpdffunMmDHDqa2tjadoIKWU\nlm5ypDZHchypzSkt3eScPNnolJZucsaP33HxtamyO4wfv6P965F/48fvMNWkPurz+x7r01c9AcQn\nrp5zSUmJSkpKvvb1Dz/8UIsWLVJFRYWuu+66qMqqrz8TTxXSQl5eNu13qf2menuHDg1R5yHsQ4eG\n6MKFy/Tcc9MuvufChZ7/zvtrf09ld7w/P79BkeHmSO88P78x4T/LZcsm6Ny5S73uZcsmqb7+TJ/1\n7A1/+7Tf6+3vj7Fh7Y8//lgPP/ywVq5cqTFjxpgqFkgIU4dlJHLrUF9lu7EqurfFXmyfAswzFs4r\nVqxQS0uLli9fLsdxNGLECK1atcpU8YBRplZFd4Tk4cOXqaHhqD75pFBlZduMzLv2FcA2rYpm+xRg\nnrFwXr16tamigIQz1dvrCMmysu06cGCxQiGfPvig7554x5B6KORXfn5Dr0FuUwD3JVXqCaQSDiGB\nJ5nu7cXSE+88pB6ZN+45yFkFDXgX4QxPMt3bi6UnHm2Q8xAJwLsIZ8CAWHri0QY5D5EAvItwBgyI\npSfeEeSROefGXoOcVdCAdxHOiItX5kMT0c6OIO9vryeroAHvIpwRF6/Mh7rZTlZBA95l7MEX8Bav\nzId6pZ0A7EI4Iy6xPt0oVdnYznifSAUgdTCsjbh4ZT7UxnZ6ZUoB8DLCGXFJ1nyo2wvPBtrORNSf\noXYg/RHOsFqq9xITUX+2WAHpj3CG1dzsJZro9cZT//6ua+NQOwCzCGdYzc1eoolebzz17++6bLEC\n0h/hDKu52Us00WuPp/7MKQMgnGE1N3uJJnrt8dSfOWUAhDM8JZZ5ZLd67cwpAyCc4SmxzCO71Wtn\nThkAJ4TBU5jPBZAKCGd4io3HcQJAdwxrw1OYzwWQCghneArzuQBSAcPaAABYhp4zrOH2Qy4AwBaE\nM6yR6g+5AABTGNaGNdjmBAARhDOswTYnAIgwPqx9+PBhzZw5U2+//bYyMzNNF480xjYnAIgwGs5N\nTU0KBoPKysoyWSw8gm1OABBhdFh7yZIlWrhwoQYPHmyyWAAAPCWunvPWrVu1cePGLl/Lz8/Xrbfe\nqjFjxshxnF4++XV5ednxVCFt0H7a71VebrtE+73e/v74nFiStA8/+tGPdOWVV8pxHO3fv1/XXHON\nqqqq+v1cff0ZE5dPSXl52bSf9rtdDVd4ue0S7af9/d+YGJtz3rVr18X/Lioq0oYNG0wVDQCApyRk\nK5XP54tpaBsAAFySkBPCdu/enYhiAQDwBA4hAQDAMoQzAACWIZwBALAM4QwAgGUIZwAALEM4AwBg\nGcIZAADLEM4AAFiGcAYAwDKEMwAAliGcAQCwDOEMAIBlCGcAACxDOAMAYBnCGQAAyxDOAABYhnAG\nAMAyhDMAAJYhnAEAsAzhDACAZQhnAAAsQzgDAGCZDLcrgORqaAiroqJGR4+OUCBwSsFgkfz+HLer\nBQDohHD2mIqKGlVXz5PkU12dI6lK69bd7na1AACdMKztMUePjpDka3/la38NALAJ4ewxgcApSU77\nK0eBwGk3qwMA6IGxYe22tjZVVlbqb3/7m1paWvTggw9q4sSJpoqHIcFgkaSq9jnn0woGJ7ldJQBA\nN8bCubq6WhcuXNCmTZt0/Phx7dq1y1TRMMjvz2GOGQAsZyyc9+3bp+985zv6+c9/Lkl64oknTBWd\nVjqvli4s/ErLlk1gtTQAoIu4wnnr1q3auHFjl6/l5uYqKytLa9euVW1trRYvXqyXXnrJSCXTSffV\n0ufOJX61NNunACC1xBXOJSUlKikp6fK1hQsXatKkyPzl+PHj9emnn0ZVVl5edjxVSFmhkF+dV0uH\nQv6E/wweeOCPXW4IsrI2a8uW2Qm9ZrS89vvvzsvt93LbJdrv9fb3x9iw9rhx47R3715NnjxZBw8e\nVH5+flSfq68/Y6oKKSE/v0GR1dI+SY7y8xsT/jM4dGiIOt8QHDo0xIqfe15ethX1cIuX2+/ltku0\nn/b3f2NiLJx//OMf68knn9TMmTMlSU899ZSpotNK59XShYVntWxZ4ldLBwKn2g8cidwQsH0KAOzm\ncxzH6f9tieP1u6dktL+xMazy8pou26dsmHPm7tm77fdy2yXaT/uT2HOGvdg+BQCphRPCAACwDOEM\nAIBlCGcAACxDOAMAYBnCGQAAyxDOAABYhnAGAMAyhDMAAJYhnAEAsAzhDACAZQhnAAAsQzgDAGAZ\nwhkAAMsQzgAAWIZwBgDAMoQzAACWIZwBALAM4QwAgGUIZwAALEM4AwBgGcIZAADLZLhdAcSmoSGs\niooaHT06QoHAKQWDRfL7c9yuFgDAIMI5xVRU1Ki6ep4kn+rqHElVWrfudrerBQAwiGHtFHP06AhJ\nvvZXvvbXAIB0QjinmEDglCSn/ZWjQOC0m9UBACSAsWHtpqYmLViwQF999ZWysrL0m9/8RqNGjTJV\nPNoFg0WSqtrnnE8rGJzkdpUAAIYZC+dt27ZpzJgxWrRokV555RWtX79eFRUVpopHO78/hzlmAEhz\nxoa1CwsL1dTUJCnSi7788stNFQ0AgKfE1XPeunWrNm7c2OVrS5Ys0VtvvaVbb71Vp06d0qZNm4xU\nEAAAr/E5juP0/7b+Pfjgg5owYYJKS0v14Ycf6le/+pV27NhhomgAADzF2JzzyJEjNXz4cElSbm6u\nmpubo/pcff0ZU1VIOXl52bSf9rtdDVd4ue0S7af92f2+x1g4//KXv9QTTzyhTZs2qbW1Vb/+9a9N\nFQ0AgKcYC+crrrhCzz//vKniAADwLA4hAQDAMoQzAACWIZwBALAM4QwAgGUIZwAALEM4AwBgGcIZ\nAADLEM4AAFiGcAYAwDKEMwAAliGcAQCwDOEMAIBlCGcAACxDOAMAYBnCGQAAyxDOAABYhnAGAMAy\nhDMAAJYhnAEAsAzhDACAZQhnAAAsQzgDAGAZwhkAAMsQzgAAWIZwBgDAMoQzAACWGVA4v/HGG3rk\nkUcuvt6/f79KS0t111136bnnnhtw5QAA8KK4w3n58uV69tlnu3xt6dKlWrFihTZt2qS//vWvOnjw\n4IArCACA18QdzmPHjtWTTz558XVTU5POnz+vb37zm5KkH/zgB3r77bcHXEEAALwmo783bN26VRs3\nbuzytcrKSk2dOlXvvffexa81Nzdr+PDhF18PGzZMx44dM1hVAAC8od9wLikpUUlJSb8FDRs2TE1N\nTRdfNzc3a8SIEf1+Li8vu9/3pDPaT/u9ysttl2i/19vfH2OrtYcPH67MzEx99tlnchxH+/bt07hx\n40wVDwCAZ/Tbc47FU089pUWLFqmtrU033nijvve975ksHgAAT/A5juO4XQkAAHAJh5AAAGAZwhkA\nAMsQzgAAWIZwBgDAMlaE8+HDh3XdddeppaXF7aok1dmzZzV//nzNnTtXP/3pT/Xll1+6XaWkampq\n0i9+8QvNmzdPs2bNUl1dndtVSrru59OnO8dxtHTpUs2aNUt33323PvvsM7erlHT79+/XvHnz3K5G\n0rW2tqq8vFxz5sxRaWmp9uzZ43aVkqqtrU2PPfaYZs+erTlz5ujjjz/u8/2uh3NTU5OCwaCysrLc\nrkrS/eEPf9DVV1+tl156SbfddpvWrVvndpWS6oUXXtANN9ygqqoqVVZW6umnn3a7SknV0/n06e7N\nN99US0uLNm/erEceeUSVlZVuVymp1q9fryeeeELnz593uypJt2PHDvn9fv3+97/XunXrtGzZMrer\nlFR79uyRz+fTyy+/rIceekgrVqzo8/1G9znHY8mSJVq4cKHmz5/vdlWS7p577lHHTrZQKKSRI0e6\nXKPkuvfee5WZmSkpclfttRu0sWPHavLkydqyZYvbVUma999/XxMmTJAkXXPNNTpw4IDLNUquQCCg\nVatWqby83O2qJN3UqVM1ZcoUSZFeZEaG6/GTVDfffLOKiookSZ9//nm//79P2k+npzO68/Pzdeut\nt2rMmDFK9+3WvZ1RfvXVV+uee+7RRx99pA0bNrhUu8Trq/319fUqLy/X448/7lLtEiva8+m9oKmp\nSdnZl45tzMjIUFtbmwYNcn0QLykmT56szz//3O1quGLIkCGSIn8DDz30kBYsWOByjZJv0KBBevTR\nR/Xmm2/qt7/9bd9vdlxUXFzszJs3z5k7d67z3e9+15k7d66b1XHV4cOHnZtvvtntaiTdwYMHnWnT\npjl/+tOf3K6KK959911n4cKFblcjaSorK52dO3defD1x4kT3KuOSY8eOOTNnznS7Gq4IhULOHXfc\n4Wzbts3tqrjqxIkTzqRJk5yzZ8/2+h5XxxV27dp18b+LiorSuufYk+eff15XXnmlpk+frqFDh+qy\nyy5zu0pJ9fHHH+vhhx/WypUrNWbMGLergyQYO3asampqNGXKFNXV1amwsNDtKrnCSfORwp6cOHFC\n9913n5YjwwUsAAAAzklEQVQsWaLrr7/e7eokXXV1tY4fP66f/exnysrK0qBBg/ocMbJm0N/n83nu\nD/bOO+9URUWFtm7dKsdxPLc4ZsWKFWppadHy5cvlOI5GjBihVatWuV0tJNDkyZP11ltvadasWZLk\nub/5Dj6fz+0qJN3atWt1+vRprV69WqtWrZLP59P69esvrjtJd8XFxVq8eLHmzp2r1tZWPf744322\nnbO1AQCwjDdWYQAAkEIIZwAALEM4AwBgGcIZAADLEM4AAFiGcAYAwDKEMwAAlvn/5iKbJb8BcnkA\nAAAASUVORK5CYII=\n", + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAW8AAAD3CAYAAADSftWOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAIABJREFUeJzt3X1wE+e9L/CvJFsrG8m2bIuEhHBSMAr3QkgMHA5JhktwzCGHXmY4w4uJE/cyNG1CyVuTIZ3mEm6bJiSZXmZ6p3NDUlpS2kLr0qSUzJy2EweaEnrJ4cWEkGlwbJoXCIllW7Ylv0jySvcPR4ot765Wq9XLrr+fv2JJu/s8Rvnt49/+nuexxGKxGIiIyFCs+W4AERGlj8GbiMiAGLyJiAyIwZuIyIAYvImIDKgoVxfy+QIZn8PtLoXfP6hDawqX2fto9v4B7KMZFFL/PB6X5OuGGnkXFdny3YSsM3sfzd4/gH00AyP0z1DBm4iIRjF4ExEZEIM3EZEBMXgTERkQgzcRkYxQRESnfxChiJiX45VoLhV86aWXcOTIEUQiEdx1111Yv369nu0iIsobUYziQEsbWtt86OkPobJMQK3Xg4a6Gtisqce8YjSK5iPtmo9XQ1Pwfvvtt9Ha2opf//rXGBoawt69e3VpDBFRIdj72ntoOXUp8XN3fyjxc2O9N+XxzUfaMzpeDU23gLfeegterxdbt27F/fffj9tvv12XxhAR5VsoIuLE+SuS77W2daVMgYQiIlrbfJqPV0vTyNvv9+PTTz/Fiy++iEuXLmHLli3405/+BIvFInuM212qS+G73GwjMzF7H83eP4B9NLIrXQPw9Q5JvucPDMNmL4aneori8T2BkObj1dIUvCsqKjBz5kzY7XbMnDkTgiCgp6cHVVVVssfoMdXU43HpMs2+kJm9j2bvH8A+Gp0YEeGpKEGnf2IAd7scEMMRxb6LERGVLgHd/RMDuJrjk+k6PX7hwoU4duwYYrEYPv/8cwwNDaGiokLLqYiICopQbMOSedMk36v1VkMoVs4gCMU21Ho9mo9XS9PIe/ny5Th58iTWrVuHWCyGHTt2wGYr/LUAiIjU2Lx6LgaHwmht64I/MAy3y4FabzUa6mpUHR//nNbj1bDkag9LPf7EMvOfanFm76PZ+wewj2YQ718oIqIvGEK5U9A0Ys70+HhbpORsSVgiIqMRim2Y6i7N2/FKOMOSiMiAGLyJiAyIwZuIyIAYvImIDIjBm4jIgBi8iYgMiMGbiMiAGLyJiAyIwZuIyIAYvImIDIjBm4jIgBi8iYgMiMGbiMiAGLyJiAyIwZuIyIAYvImIDIjBm4jIgBi8iYgMiMGbiMiAGLyJiAwoo+Dd3d2NZcuWoaOjQ6/2EBGRCpqDdyQSwY4dO+BwOPRsDxERqaA5eD///PPYuHEjpk6dqmd7iIhIhSItB7366quorKzE0qVL8ZOf/ETVMW53KYqKbFouN47H48r4HIXO7H00e/8A9tEMCr1/llgsFkv3oLvvvhsWiwUWiwV///vfcf3112P37t3weDyyx/h8gYwaCoz+MvU4TyEzex/N3j+AfTSDQuqf3E1E08h7//79if9uamrC9773PcXATURE+mKpIBGpEoqI6PQPIhQR890UgsaR91i//OUv9WgHERUoMRpF85F2tLb50NMfQmWZgNtuuharb5kBm5Xjv3zJOHgTkfmEIiL6giGUOwW88mYHWk5dSrzX3R/C4WMXMTgURmO9N4+tnNwYvIkoIXmUXeESMDAUkfxsa1sX1i6bBaE48yoySh+DNxElNB9pHzfK9gdCsp/1B4bRFwxhqrs0F02jJExYERGA0VRJa5tP9efdLgfKnUIWW0RKGLyJCADQFwyhp19+pJ2s1lutOWXCypXMMW1CRACAcqeAyjIB3SkCeIXTjv9WOx2rb5mR9jWkKldqvR401NWwciVN/G0RTVLJo1+h2IZar/JkO7dTwPc3L8Y31tyoKdjGc+rd/SHEMFq50nLqEpqPtGtu92TFkTfRJKM0+m2oqwEAvHXuCobDE4PjwjkeuErtmq6rlFNXU7kyoRLGKeBmbzUa62dPylH75Osx0SSnNPq1Wa1orPfif2+9FbfOuxqVLgFWC1BV5kD9oumJ4K6FUk49XrmSTrv9wRCOnrmMp35+CmI0qrldRsWRN9Ekonb0WyoU497//l/HTdbJtJ5bKaeeqnJFqd2fdAZx4PU2NK2ck1H7jIYjb6JJJN3Rr1Bsw1R3qS4TcZRy6qkqV1JVwrR+0DXpcuAM3kSTSHz0KyUXddsNdTWoXzQdVWWOtNIx5U4BFQpt6wuGU6ZdzIZpE6JJJD76HTuLMk5N3XY8jeIqL9F0/XhOfe2yWWmlY4RiG272VuPomcuS71eWTb4JQwzeRJNMfJTb2tYFf2AYbpcDtd5qxdFvcqWHx12C+bOqNNdnx9Mx6Wisn432S334pDM44b1MJgwZlaaddLTgTjrqmL2PZu8fYJw+pvMw8kBLm+RovX7R9JyuLChGozjwehtaP+hCXzCMyrIvbzx6lgsW0r+hrjvpEFH26VnpIUXt6DfT+mw92axWNK2cgw112f3dGAGDN1GBSXcKebaDvJoKlVyvLKgl7WI2DN5EBSZ5Wdb4JBoA41IUuVonpNwpwO2yoycQnvBecZEVztLijK+R7RuQGTF4ExWQdFIUaoN8poRiG6aUSAfvUCSKQ8f+ofl6XKhKO/52iAqI2kk0qYK8nhNWQhERA8PSu+kAwJkLPs3X02OhqsmKwZsoj5JXyFM7iSbTdUKU2pCsLxiCX2F2Y08gBJ9/UPX1xl43nRsQVxMcT1PaJBKJ4IknnsDly5cRDoexZcsW3HHHHXq3jci0lNIFaibRZLJOCDAaCHv6h9Fy6hOc6+hWTFmoWef7//zuXNrpDrUPQplakaYpeB8+fBgVFRX44Q9/iN7eXqxZs4bBmygNSvlqpUk0oYgIX+8QEIthfo30jEOlCStjA2FyMJbLmSvNykx1rBK1N6Bc5faNRlPwvvPOO7Fy5UoAQCwWg83Gp8NEaql5KJk8hbzIZsFv3vgAx9/9LLHOtlBswfSpUzA0PAJ/IKRqpmRyIFRqw9gbQENdDWKx2Ljrqz1Wjpqp+oVUY15oNAXvKVOmAACCwSAeeughPPLIIymPcbtLUVSU+S9ZbraRmZi9j2bvH6DcxytdA+iR2ZXdHxiGzV4MT/Xo/2PTv3h9z6F38cbp8aPsUCSGS50DWHXr9VizrAbuMgEOu/z/0sPhEZzr6E7Z9uQ2xD3SuAj3h0fwbkcXfvDTtyE1NVvuWDkPbKhFaYkdJ85fQVfvEKorSrBk3jRsXj0XNps1rd+V3gr9e6q5VPDKlSvYunUrGhsbsXr16pSf92t4oJGskKasZovZ+2j2/gGp+yhGRFS65NMFYjgy7vhQRMTxd6QXZAKAE+9ewepb/gmBviiUfrOd/kH4/EMp2y/VhrGmlTvgcZegU+JcqY6Vsua26/Fvi68bV+fd0zMAIP3flV4K6XsqdxPRlO3v6urC5s2bsW3bNqxbty6jhhFNNumua51qLWt/IKSqukSpkiVVG8YSim1YMm+apmOVzim1bngma4Cbnabg/eKLL6K/vx8vvPACmpqa0NTUhOHhYb3bRmRaDXU1WL7gWridAiwK61qHIiLCEREVLvmg63YJqpZDTbXBcDpbnW1ePVdyXe41S2fqXs6ndQ1ws+OqggXG7H00e/8AFWmTpIqPCqcdtbOr0bjCmyh9Sy6PE+xWDIel92mUWtlPbrr5l+f9spJlfk0V6hdOR2WZQ/VINt7H+HWcpcU4dOwfWS3ny+UU+kL6nnJVQaICkVzx0RsM42jrp7DZrIkgnPyZeOC2WS0Qo6PjLYfdhltvvHrcCDRVTbTWzRDkxNMdyUvGZqOcj4tRjcfgTZRDakrfRv9b+jPlU+zY+u/zUFxkhUciR6y2JlrPQMhyvvyYvNOTiHIoPrXb5x9MOatQ6QFlbzCEKSXFmD7VNSEgpgqigcFwIh+t51RzPafqj8Xp8Mo48ibKIqk0hlz+euysQrVT38fmgZWCaHf/ML639yR6gyEIdhuAGIbDUVTpkJvOdKp+Mk6HV4fBmyiLpNIYcsaWvqWaeSgV4ObPqlJcg8T/xQh47AxJubRKOg8HM93UOBmnw6vD4E2UJUppDJsVKCu1o28gLDmtPdUmwVIB7mjrp7huqlPxBiEnnpsuslk0jXq1bGoshflz9Ri8ibJEKY0hRoEbZrixZulXJEe3SlUhSgFuYCiC5Quuxbn2bvgDwyibYkdvcOImCsniuemW05fSGvWOHaHrUcVSiFuuFSoGb6IsUdo+DAA+uNSbMsjFq0LiD+9S5bZ7gyGs/OfrsGF5DfqCIZQIRXjq5ydTjsbdLgdKhCLVo15RjOJAS5vkCD2T4Kp3/tzMGLyJsiA+IvXOcOPEe59LfiY+rV0p2EnmtmuqZW8K8QA3thQw1XKuo5+pxlBoRPWod+9r72UlL613/tzMGLyJdCRGo9hz6F0cf+cyevpDqHAWwwJIrsBnL7alHElK5rbPXIazRPp/XakAt+72mbjwcS8u+4L4Yn4PrBYgGhudah7PTY+IMVWj3lBExInzVySvr0deWq/8udkxeBPpKDnY+oPyez+mopTbDg6NTHjtuqlOyQD3u79cxCedwXGvRWPAtMpS/M//sQilwmgYsFlTV7kAo3lpX6/06oR65KX1ngVqViyaJNKJUrCV/HxYVJzA0hcMpVU5Mjg8ghFx/BhfqU1Xegbxypsd415TswhUuVOAp6JE8px65qXlVhqkURx5E+kk1dKtySrLlANduVOAQ2FBqmRSo95UN4CzbV3YsLwmESDVjHrjS8IePnZxwvmYl84djryJdKJ2vey4VIFOjEYRGVG/6KfUqLfcKaDCaZc9pndAei3wVKNeuSVhmZfOHY68iVRQM+NQzUa9wPiHhEoOvP5BYgVBNaRuBkKxDbWzq3G09VPJYyo1pjniKyAyL50/DN5ECtJdZ6OhrgZ2exH+9P8+hFTcrXDasWPTIrhK5UfDwGi99qm/S5cYAoBQbMUURzF6g6k3Hm5c4UX75f4JDy2BzNMcXKY1fxi8iRSku86GzWrFmmU1+I+/fSh5vv6BMIZCI7LBO36z+Os7lxEW5UfdsRhwU00V6hddl3ITBZvVih2bFuFAywc429aF3oEQKll+Z3gM3kQytK6z4S4TUKVxlmDyzUJOeCQ6YQMHJTarFU3/ekNi5iXTHMbHB5ZEMrSuU+2wF2naNDfdUkNg9CaSznrXLL8zD468iWRkss6GllmC6ZYaAlysaTJj8CaSkck6G1pmCSrdLORwsabJS3PaJBqNYseOHWhoaEBTUxM++ugjPdtFVBDUzDhUMjZNkWpbr/jNQorNapF8nZNiJi/NI++WlhaEw2E0Nzfj7NmzeO6557B7924920aUd1pG0MPhkcTyrXK73siVGyanWyqcAub8kxsb75iFw8c/4mJNlKA5eJ8+fRpLly4FANx88804f/68bo0iKjRq6pnjQfpcRzd8/qFEkI7GYjhy+nLic0rlhko3C62TYtLZ0oyMQ3PwDgaDcDqdiZ9tNhtGRkZQVCR9Sre7FEVFmX9xPB5XxucodGbvo1n7t+fQu5I14Q679Pf+XEc37ltbAodd+v+Z6TLXkXs9mShGsfe193Di/BX4eofgqSjBknnTsHn1XNhsmReamfXfMa7Q+6c5eDudTgwMDCR+jkajsoEbAPz+Qa2XSvB4XPD5Ahmfp5CZvY+56F8oIsLnHwQsFngqSnIy2gxFRBx/57Lke2M3/B2rq3cIHR92Z61S5EBL27ibSad/CIePXcTgUDjjjXz5Pc0duZuI5uC9YMECHD16FKtWrcLZs2fh9XJXZ8ovMRrFr9/4AH9790piJT6H3YbbbrwaG++YrbiBbqa0lPlls1KEG/man+bgvWLFChw/fhwbN25ELBbDzp079WwXUUrJudzmI+3jcsvA6Kj3jdOXYbFYUm6gm0kw01Lml81KEW7ka36ag7fVasVTTz2lZ1uIVJHc13FWFd5p75I9prXNN34D3TQXnEpF7YqCAGCxALfXXitbKaLHDYUb+ZofJ+mQ4Uju6yiz5GlcT9Jmv+kuOKVGPBif6+hGp196mzBgdFGplf983YSbhJ43FG7ka35c24QMRSmXKz2NZVSlSxi3ga5SPjidtULGipf5/d/H67Bk7lWq2jJW/IbS3R9CDF/eUJqPtGtqT6YTjKiwceRNBW9sGkEpl6u0bUGt1zNuA125c/T0a88Hx9vpKi/B17/6X3D+YrfkRsFTSoonjHz1fMA49vfFDRPMi8GbCpZkbrumGm6XHT2B8ITPV7oE3FhThbff+zxRnhevNkneQFcuH2yxAH8++Qka69VXpyS30+Muwdzr3RCKbZLBe3A4glBEHBdI9XjAqJR24cNJ82HwpryTe0Anmds+cxnXTXVKBu8FN3jQWO/FxrrZinXeSvngaAw4euYybFbp6hQpye3s9A8p5rz9Sfl3QJ8HjNnI41PhYvCmvFEaKY6IMdk0wuBwBMtrr8G5jh7JdT6EYhumT1WeHddQVwNRjOLNs59KblemNlWhlO6wWiB5bqlgnOkDRtZ1Tz4M3pQ3SiPF+oXTFdIIIaxcPAMb6mZrzuXarFasXDwDf5GpUlGbqlBKd8jtHSwXjLWsAa6mHazrNicGb0qbHnXIqUaKq2+9PmUaIZPNb8VoFH/+z49hsYyW7sldIxWldEelS8BNs6txrr1bVTDWsoKhmnawrtucGLxJNa11yGODfVyqkeJQaCSrdcrNR9oVa8PVXkMp3RHPwYeWp3ez03JTYl335MPgTaql+0BMKtjfdtO1WH3LDMWRor3YBmdpcUZpBCWp9oqc7pmS1jWS21ldUYL5s6rG5eBzkbLI1u+LChODN6mi5YGYVLAfu6qd3EhxOCzi0LF/oLHem5U65b5gSHENkoGhEYyIMUitmiqVMkpOd8y6vgqBPvlqk2zJJO1CxsMZlqRKujupq5nFuGbpTDjs0l/BsTMdte54LrftWLlTQIXTLntc70BoQn/EaBQHWtqwfc8JfPelE9i+5wQOtLRBjEYTn4m3U2597lzhDvGTA0fepEq6D8TUBvtQOKr4GS3phlS5eaHYhtrZ1bI570qJ/rCGmgoNR96kitLmuHNmVEx4LR7spcSDvZrPaKFmjZDGFV5cN9UpeXzyA75srYVClAkGb1IteaEjh90Gh92K4+c/m5BGUAr28eCo5jNSlHZhVxtobVYrdmxahOULroXbKcCisHBTuikjolxg2oRUG/tA7Jd/voC/nf8s8Z5UGkGq+uG2m67B6ltmJI5Lp0JCTaliOpNVbFYrmv71BmxYXqP4gI811FSIGLxJkwsf+yVfH1t5IlX9MP2ainF7A6ZTIaEm76wl0KYq5VOqoS51FKHIprQYLVF2MG1CaUs3jaBH9YPadEg2UjHA6F8IUjnyTzqDmtfbJsoER96UNmepHYLdmtjkdyy3y4ESoQid/kFVdcZqZ22mkw5Zd/tMXPi4F5d9QURjowtEXetxYt3tM1Vdf35NNeoXTkdlmSPR/hExhsHhiOT1ufAT5QODN6Xt0LGLkoEbGE0jPPXzk6qnz6stwUsnHfK7v1zEJ53BxM/R2OgI+Xd/uTihrE9u2dmjZy6jakz7ufATFRqmTSgtSukLm9WCTzqDqrfxUjrXmQu+cSkModiG+TXVkp8dmw5Jp6wv1TT5se3PVlkjkVaagncgEMD999+Pe+65Bw0NDWhtbdW7XVSglEagoswaqHK10IrbkQVC+NWfL0CMRhOzG9/5YDTQWr94PlhVJkwo7UsnH6/02eT2hyMi5sxwS77PhZ8oHzSlTV5++WUsWbIEmzZtwsWLF/HYY4/h97//vd5toxxId3lXpfSFHLm0QqpzHT//GUoco1/RsamN+D1i/qyqCWmQdNIravvS3T+M/7X3P9EbDH8xnd+CcETkwk+UV5qC96ZNm2C3j64NIYoiBIF/MhqN1uVd4+mLo2cuq76WUomeXAle3JkLPlhkKvHOdfRM2AsynaVR1Vw/rjc4uu1aPNd/67yr0bTyBo64KW9SBu+DBw9i3759417buXMn5s+fD5/Ph23btuGJJ55IeSG3uxRFRZl/0T0e5e2tzCAXfdxz6F3JB4WlJXZ8Y82NisduWHFDWsH7tpuuwfRrvpxCP7Z/D2yoRQwWvHHqE8lj/QH5UbE/MAybvRie6injXn9gQy1KS+w4cf4KunqHUF1RgiXzpmHz6rmwJS0VOPazSvtOJmu/3IfqaqfsIlT8nhpfoffPEotJ7SOS2oULF/Doo4/i8ccfx7Jly1J+fuzEDK08Hpcu5ylkuehjKCJi+54TkumCqjIHnv7GvyiOKJWOTz5XPK1gs1oRioiw2YshhiMT1g6RO1+la3Tqupa2ppMSCkVE9PQPo+XUJ4m9McunCPDLTH23WoCd31wiWWHC76nxFVL/5G4imtIm7e3tePjhh/GjH/0Ic+bMyahhlHuZlr2pSTe4nQJ2bFoEV6k98cCxtc2HnkAIlS6JVf5kzjelpBizryvHkdMTR/qpHhSmswmCUGzDtKopaFo5JxH0S4TRskdOi6dCpKnaZNeuXQiHw3jmmWfQ1NSELVu26N0uyiI9yt4a6mpw67yrZd/vGwhhKDQCIGmVv5h0CaHSDEYLMG5BLLkFpPQSD/quUrum2ZpEuaBp5L17926920E5pMd+hzarFU0rb8CFj/2KI1O1O/AozWA8+0E3nv7Gv+RlhxhuLUaFijMss0CP3dWzLZ2gJNcfNTeBTv+gqhSN2lROrmcxcmsxKlQM3jrSWn6XD2qCkpr+pLoJqK27LvRlV3O1iTCRWgzeOjLiVllKQUlNf1LdBNSmaFJ9DoDqxa6IJgMGb51o2V29kKXbH6WbgNoUjdTnbp5dhWgshu17ThT8XzNEucTgrZNcrTqXq3y6Xv2Jt3ftsllYu2yWZJ13nNQo/pU3O/CGwf6aIcoFBm+dZDtnm+t8eqb9kWvvAxtq0dMjvZxsXHwUb7a/Zoj0xL87daJ1Bxe11OyIrqdM+yPX3r2vvSd7TPJuNtz4l0geR946ylZNcD5GoGI0imgsBseYHXMcdhtuu/HqlP1Rau+J81fwb4uvG9deuVH6mqUzC7oChSifGLx1lK2a4Hzs4tJ8pH3ClPThsAiLxZIyTaPU3q7eoQntVapqyXQyEZFZMW2SBXpsuDtWNnZxUdpwN53daNJtb3VFybj2prrWmqVfyenUeCKj4MjbAPSYzh6n5sFnNheuWjJv2rj2prpWcDDCGY5EEhi8DUKvfLqaiTd6VM7ItXfz6rno6RlIfE7ttTjDkWg8Bm+D0COfrvbBp14LV0m1N3kzBD3/qiCaTBi8DSaTEWg66RC9Rvpq2suV+4jSx+A9iaSTDsnlanpcuY8ofaw2KXBKVSHp0jLxRu/KGSW5vBaR0XHkXaCyNR2eKQoic2DwLlDZWl6WKQoic2DapABlOklGDaYoiIyNwbsAcUEmIkqFwbsAZWM6PBGZS0bBu6OjAwsXLkQoxJGgnrK9vCwRGZ/mB5bBYBDPP/887Ha7nu2hL7AqhIiUaAresVgMTz75JB599FF861vf0rtNBFaFEJGylMH74MGD2Ldv37jXrrnmGqxatQpz5sxRfSG3uxRFRZkHH4/HlfE5Cl1yH6fnqR3ZMhn/Dc3I7H0s9P5ZYrFYLN2DVqxYgauvvhoAcPbsWcyfPx/79+9XPMbnC2hr4Rgej0uX8xQys/fR7P0D2EczKKT+yd1ENKVNXn/99cR/19XVYe/evdpaRUREmrBUkIjIgDKeHn/kyBE92kFERGngyJuIyIAYvImIDIjBm4jIgBi8iYgMiMGbiMiAGLyJiAyIwZuIyIAYvImIDIjBm4jIgBi8iYgMiMGbiMiAGLyJiAyIwZuIyIAYvImIDIjBW0IoIqLTP4hQRMx3U4iIJGW8nreZiNEomo+0o7XNh57+ECrLBNR6PWioq4HNyvscERUOBu8xmo+0o+XUpcTP3f2hxM+N9d58NYuIaAIOJ78QiohobfNJvtfa1sUUChEVFAbvL/QFQ+jpD0m+5w8Moy8o/R4RUT4weH+h3CmgskyQfM/tcqDcKf0eEVE+GCJ4x6s/hsMjWbuGUGxDrdcj+V6ttxpCsS1r1yYiSpemB5aiKOLZZ5/F+fPnEQ6H8eCDD2L58uV6t21C9YfHXYL5s6qyVv2xZulXMDg8gvc/8qM3GILb5UCttxoNdTW6X4uIKBOagvcf/vAHjIyM4De/+Q0+//xz/PGPf9S7XQAmVn90+oeyUv0hVSJ4y9yrcdcKL0oFFuQQUeHRNHx96623cNVVV+Gb3/wmtm/fjrq6Or3bldPqj/hNors/hBhGSwSPn/8Mh45d1O0aRER6SjmsPHjwIPbt2zfuNbfbDUEQ8NJLL+HkyZP47ne/i/379yuex+0uRVGR+rzxla4B9ATkqz9s9mJ4qqeoPp+c4fAIznV0S753rqMb960tgcOe29G3x+PK6fVyzez9A9hHMyj0/qWMSuvXr8f69evHvfbtb38bt99+OywWCxYvXowPP/ww5YX8/sG0GiZGRFS6BHRLlO+5XQ6I4Qh8vkBa55TS6R+Ezz8k+V5X7xA6PuzGVHdpxtdRy+Nx6dKvQmX2/gHsoxkUUv/kbiKa0iYLFy7Em2++CQB4//33MW3aNO0tk5Gr6g+WCBKREWkK3hs2bEAsFsOGDRvw5JNP4vvf/77e7QIANNTVoH7RdFSVOWC1AFPdJahfNF3X6g+WCBKREVlisVgsFxfK5E+QUEREXzCEWddXIdAnneLIxJfVJl3wB4bHlQjmekGqQvpzLRvM3j+AfTSDQuqfXNrEEHVwQrENU92lcNiLkI1fp81qRWO9F2uXzUJfMIRyp8ARNxEVNEME71yJ3ySIiAqdIabHExHReAzeREQGxOBNRGRADN5ERAbE4E1EZEAM3kREBsTgTURkQAzeREQGxOBNRGRADN5ERAbE4E1EZEAM3kREBsTgTURkQKYO3qGIiE7/oK6bFRMRFQJTLgn75ea2G1YjAAAE7UlEQVQKPvT0h1BZJqDW68nL5gpERNlgyuDdfKQdLacuJX7u7g8lfm6s9+arWUREujHdMDQUEdHa5pN8r7WtiykUIjIF0wXvvmAIPf0hyff8gWH0BaXfIyIyEtMF73KngMoyQfI9t8uBcqf0e0RERqIpeAcCAdx7771obGzEpk2b4PNJpynyQSi2odbrkXyv1lvNjYWJyBQ0Be9XX30VXq8XBw4cwKpVq/Czn/1M73ZlpKGuBvWLpqOqzAGrBagqc6B+0XQ01NXku2lERLrQVG3i9Xpx8eJFAEAwGERRUerTuN2lKCrKfNTr8bhUfe7huxZiODwCf38I7jIBDrtxCmvU9tGozN4/gH00g0LvX8qIdvDgQezbt2/cazt27MDx48exatUq9PX1Yf/+/Skv5PcPam/lFzweF3y+QFrHFAEI9A0hvaPyR0sfjcTs/QPYRzMopP7J3URSBu/169dj/fr141574IEHcO+992Ljxo14//338eCDD+K1117Tp6VERJSSppx3WVkZXK7Ru0FVVRUGBgZ0bRQRESnTlAh++OGHsX37dhw4cAAjIyP4wQ9+oHe7iIhIgabgfdVVV2HPnj16t4WIiFSyxGKxWL4bQURE6THdDEsiosmAwZuIyIAYvImIDIjBm4jIgBi8iYgMiMGbiMiAGLyJiAzIUMF7cHAQW7Zswd13342vf/3r6OnpyXeTdBUIBHD//ffjnnvuQUNDA1pbW/PdpKx5/fXX8dhjj+W7GbqKRqPYsWMHGhoa0NTUhI8++ijfTcqKd955B01NTfluRlZEIhFs27YNjY2NWLduHd544418N0mWoYL3b3/7W8ydOxf79+/HV7/6Vbzwwgv5bpKuXn75ZSxZsgS/+tWv8Oyzz+Kpp57Kd5Oy4umnn8auXbsQjUbz3RRdtbS0IBwOo7m5GY899hiee+65fDdJd3v27MH27dsRCplzO8HDhw+joqICBw4cwE9/+tOCXvrDOItcA9i0aRNEcXQD4U8//RTV1dV5bpG+Nm3aBLvdDgAQRRGCYM4t2xYsWID6+no0Nzfnuym6On36NJYuXQoAuPnmm3H+/Pk8t0h/M2bMwI9//GM8/vjj+W5KVtx5551YuXIlACAWi8FmK9ydtwo2eEutI75z507Mnz8fX/va19DW1oaXX345T63LnFL/fD4ftm3bhieeeCJPrdOHXB9XrVqFt99+O0+typ5gMAin05n42WazYWRkRNVmJUaxcuVKXLp0Kd/NyJopU6YAGP23fOihh/DII4/kuUXyCvZbJbWOeNwvfvELdHR04L777kNLS0uOW6YPuf5duHABjz76KB5//HEsXrw4Dy3Tj9K/oRk5nc5xyyNHo1FTBe7J4sqVK9i6dSsaGxuxevXqfDdHlqFy3i+99BIOHToEYPQOWch/0mjR3t6Ohx9+GLt27cKyZcvy3RxK04IFC/DXv/4VAHD27Fl4vd48t4jS1dXVhc2bN2Pbtm1Yt25dvpujyFDDgrVr1+I73/kOXnnlFYiiiJ07d+a7SbratWsXwuEwnnnmGQCjI7ndu3fnuVWk1ooVK3D8+HFs3LgRsVjMdN/PyeDFF19Ef38/XnjhhURBxJ49e+BwOPLcsom4JCwRkQEZKm1CRESjGLyJiAyIwZuIyIAYvImIDIjBm4jIgBi8iYgMiMGbiMiA/j+GJWvaa1TpYwAAAABJRU5ErkJggg==\n", "text/plain": [ - "" + "" ] }, "metadata": {}, @@ -502,9 +756,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -526,9 +778,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -556,9 +806,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -597,9 +845,7 @@ { "cell_type": "code", "execution_count": 18, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -626,9 +872,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -653,9 +897,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -684,9 +926,7 @@ { "cell_type": "code", "execution_count": 21, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -719,9 +959,7 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -759,9 +997,7 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "np.random.seed(42)\n", @@ -788,9 +1024,7 @@ { "cell_type": "code", "execution_count": 24, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -826,9 +1060,7 @@ { "cell_type": "code", "execution_count": 25, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -860,9 +1092,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -919,9 +1149,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.0" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/notebooks/Untitled.ipynb b/notebooks/Untitled.ipynb index 597ecaa2e..dba4fdaee 100644 --- a/notebooks/Untitled.ipynb +++ b/notebooks/Untitled.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 8, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -114,6 +114,158 @@ "%f (c for c in \"abc\") py> list(pop()) tib.\n", "\n" ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[12, 34]\n" + ] + }, + { + "data": { + "text/plain": [ + "[[12, 34]]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x = [12,34]\n", + "%f x . cr\n", + "[x]" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([12, 34])" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "[array([[12],\n", + " [34]]), [91, 92]]" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "array([[[12]],\n", + "\n", + " [[34]]])" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "array([[[[12]]],\n", + "\n", + "\n", + " [[[34]]]])" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------------------------\n" + ] + }, + { + "data": { + "text/plain": [ + "[array([12, 34])]" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x = np.array([12,34])\n", + "x\n", + "[x[:,np.newaxis], [91,92]]\n", + "x[:,np.newaxis][:,np.newaxis]\n", + "x[:,np.newaxis][:,np.newaxis][:,np.newaxis]\n", + "%f \\ --------------------------\n", + "[x]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[12, 34]" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + }, + { + "ename": "TypeError", + "evalue": "list indices must be integers or slices, not tuple", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[0mx\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;36m12\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;36m34\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mx\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnewaxis\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnewaxis\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnewaxis\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnewaxis\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnewaxis\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnewaxis\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mTypeError\u001b[0m: list indices must be integers or slices, not tuple" + ] + } + ], + "source": [ + "x = [12,34]\n", + "x\n", + "x[:,np.newaxis]\n", + "x[:,np.newaxis][:,np.newaxis]\n", + "x[:,np.newaxis][:,np.newaxis][:,np.newaxis]" + ] } ], "metadata": { From 75b7069aa4117a3e97b0454ed17c50aa135ba299 Mon Sep 17 00:00:00 2001 From: "H.C. Chen" Date: Tue, 31 Jul 2018 18:37:33 +0800 Subject: [PATCH 09/13] 7/31 --- .../02.06-Boolean-Arrays-and-Masks.ipynb | 160 ++---- notebooks/03.00-Introduction-to-Pandas.ipynb | 23 +- .../03.01-Introducing-Pandas-Objects.ipynb | 509 ++++++++++++------ .../03.02-Data-Indexing-and-Selection.ipynb | 172 +++--- .../{Untitled.ipynb => Playground.ipynb} | 154 +++++- notebooks/peforth.ipynb | 45 ++ 6 files changed, 702 insertions(+), 361 deletions(-) rename notebooks/{Untitled.ipynb => Playground.ipynb} (74%) create mode 100644 notebooks/peforth.ipynb diff --git a/notebooks/02.06-Boolean-Arrays-and-Masks.ipynb b/notebooks/02.06-Boolean-Arrays-and-Masks.ipynb index 5a7e150c2..8bee22503 100644 --- a/notebooks/02.06-Boolean-Arrays-and-Masks.ipynb +++ b/notebooks/02.06-Boolean-Arrays-and-Masks.ipynb @@ -48,9 +48,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -85,9 +83,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", @@ -98,9 +94,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -163,9 +157,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -185,9 +177,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -207,9 +197,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -229,9 +217,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -251,9 +237,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -273,9 +257,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -302,9 +284,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -346,9 +326,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -372,9 +350,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -413,9 +389,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -443,9 +417,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -474,9 +446,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -503,9 +473,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -535,9 +503,7 @@ { "cell_type": "code", "execution_count": 18, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -558,9 +524,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -581,9 +545,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -604,9 +566,7 @@ { "cell_type": "code", "execution_count": 21, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -634,9 +594,7 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -680,9 +638,7 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -717,9 +673,7 @@ { "cell_type": "code", "execution_count": 24, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -766,9 +720,7 @@ { "cell_type": "code", "execution_count": 25, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -803,9 +755,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -834,9 +784,7 @@ { "cell_type": "code", "execution_count": 27, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -865,9 +813,7 @@ { "cell_type": "code", "execution_count": 28, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -897,9 +843,7 @@ { "cell_type": "code", "execution_count": 29, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -955,9 +899,7 @@ { "cell_type": "code", "execution_count": 30, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -977,9 +919,7 @@ { "cell_type": "code", "execution_count": 31, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -999,9 +939,7 @@ { "cell_type": "code", "execution_count": 32, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1028,9 +966,7 @@ { "cell_type": "code", "execution_count": 33, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1050,9 +986,7 @@ { "cell_type": "code", "execution_count": 34, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1072,9 +1006,7 @@ { "cell_type": "code", "execution_count": 35, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1094,9 +1026,7 @@ { "cell_type": "code", "execution_count": 36, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1125,9 +1055,7 @@ { "cell_type": "code", "execution_count": 37, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1156,9 +1084,7 @@ { "cell_type": "code", "execution_count": 38, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "ename": "ValueError", @@ -1186,9 +1112,7 @@ { "cell_type": "code", "execution_count": 39, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1216,9 +1140,7 @@ { "cell_type": "code", "execution_count": 40, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "ename": "ValueError", @@ -1270,9 +1192,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.0" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/notebooks/03.00-Introduction-to-Pandas.ipynb b/notebooks/03.00-Introduction-to-Pandas.ipynb index 644a1bad0..65d6e77db 100644 --- a/notebooks/03.00-Introduction-to-Pandas.ipynb +++ b/notebooks/03.00-Introduction-to-Pandas.ipynb @@ -60,14 +60,12 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'0.18.1'" + "'0.20.1'" ] }, "execution_count": 1, @@ -90,9 +88,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "import pandas as pd" @@ -128,6 +124,15 @@ "More detailed documentation, along with tutorials and other resources, can be found at http://pandas.pydata.org/." ] }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "peforth?" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -154,9 +159,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.0" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/notebooks/03.01-Introducing-Pandas-Objects.ipynb b/notebooks/03.01-Introducing-Pandas-Objects.ipynb index bdee556f5..b1b9bce35 100644 --- a/notebooks/03.01-Introducing-Pandas-Objects.ipynb +++ b/notebooks/03.01-Introducing-Pandas-Objects.ipynb @@ -1,5 +1,34 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "reDef unknown\n", + "reDef \\\n" + ] + } + ], + "source": [ + "import peforth\n", + "peforth.dictate(r'''\n", + "\\ Now we redefine the 'unknown' command that was do-nothing by default\n", + ": unknown ( token -- thing Y|N) // Try to find the unknown token in __main__\n", + " py> getattr(sys.modules['__main__'],pop(),\"Ûnknôwn\") \n", + " py> str(tos())==\"Ûnknôwn\" if drop false else true then ;\n", + " /// here after, when FORTH come accross an unknown token, instead of alerting \n", + " /// it try to find the token in python __main__ module name space.\n", + "\n", + "\\ Redefine \\ command to print the comment line\n", + "code \\ print(nexttoken('\\n')) end-code \n", + "''');\n" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -40,9 +69,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -62,9 +89,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -96,10 +121,8 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, + "execution_count": 4, + "metadata": {}, "outputs": [ { "data": { @@ -107,7 +130,7 @@ "array([ 0.25, 0.5 , 0.75, 1. ])" ] }, - "execution_count": 3, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -125,24 +148,30 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, + "execution_count": 19, + "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "RangeIndex(start=0, stop=4, step=1)" + "Index(['a', 'b', 'c', 'd'], dtype='object')" ] }, - "execution_count": 4, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "往下看,這個 index 會被 casted 成最適合的 type\n" + ] } ], "source": [ - "data.index" + "data.index\n", + "%f \\ 往下看,這個 index 會被 casted 成最適合的 type " ] }, { @@ -155,9 +184,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -177,9 +204,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -225,10 +250,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": false - }, + "execution_count": 18, + "metadata": {}, "outputs": [ { "data": { @@ -240,7 +263,17 @@ "dtype: float64" ] }, - "execution_count": 7, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "Index(['a', 'b', 'c', 'd'], dtype='object')" + ] + }, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -248,7 +281,8 @@ "source": [ "data = pd.Series([0.25, 0.5, 0.75, 1.0],\n", " index=['a', 'b', 'c', 'd'])\n", - "data" + "data\n", + "data.index" ] }, { @@ -261,9 +295,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -280,6 +312,32 @@ "data['b']" ] }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "故意用複數當 index 也行, wow!\n", + "(1+1j) 0.25\n", + "(2+2j) 0.50\n", + "(3+3j) 0.75\n", + "(4+4j) 1.00\n", + "dtype: float64\n", + "Index([(1+1j), (2+2j), (3+3j), (4+4j)], dtype='object')\n" + ] + } + ], + "source": [ + "%f \\ 故意用複數當 index 也行, wow!\n", + "%f pd :>~ Series([0.25, 0.5, 0.75, 1.0], index=[1+1j, 2+2j, 3+3j, 4+4j])\n", + "%f dup . cr\n", + "%f :> index . cr" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -289,10 +347,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false - }, + "execution_count": 14, + "metadata": {}, "outputs": [ { "data": { @@ -304,7 +360,7 @@ "dtype: float64" ] }, - "execution_count": 9, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -317,10 +373,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": false - }, + "execution_count": 15, + "metadata": {}, "outputs": [ { "data": { @@ -328,13 +382,51 @@ "0.5" ] }, - "execution_count": 10, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "Int64Index([2, 5, 3, 7], dtype='int64')" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data[5]\n", + "data.index" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "故意用 index 與 character 混和\n", + "2 0.25\n", + "a 0.50\n", + "3 0.75\n", + "b 1.00\n", + "dtype: float64\n", + "Index([2, 'a', 3, 'b'], dtype='object')\n", + "注意看,這樣搞會把 index 的 type 提升成 object 完美呀!\n" + ] } ], "source": [ - "data[5]" + "%f \\ 故意用 index 與 character 混和\n", + "%f pd :>~ Series([0.25, 0.5, 0.75, 1.0], index=[2, 'a', 3, 'b'])\n", + "%f dup . cr\n", + "%f :> index . cr \\ 注意看,這樣搞會把 index 的 type 提升成 object 完美呀!" ] }, { @@ -344,7 +436,7 @@ "### Series as specialized dictionary\n", "\n", "In this way, you can think of a Pandas ``Series`` a bit like a specialization of a Python dictionary.\n", - "A dictionary is a structure that maps arbitrary keys to a set of arbitrary values, and a ``Series`` is a structure which maps typed keys to a set of typed values.\n", + "A dictionary is a structure that maps **arbitrary keys** to a set of **arbitrary values**, and a ``Series`` is a structure which maps **typed keys** to a set of **typed values**.\n", "This typing is important: just as the type-specific compiled code behind a NumPy array makes it more efficient than a Python list for certain operations, the type information of a Pandas ``Series`` makes it much more efficient than Python dictionaries for certain operations.\n", "\n", "The ``Series``-as-dictionary analogy can be made even more clear by constructing a ``Series`` object directly from a Python dictionary:" @@ -352,10 +444,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": false - }, + "execution_count": 21, + "metadata": {}, "outputs": [ { "data": { @@ -368,7 +458,17 @@ "dtype: int64" ] }, - "execution_count": 11, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "Index(['California', 'Florida', 'Illinois', 'New York', 'Texas'], dtype='object')" + ] + }, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -380,7 +480,8 @@ " 'Florida': 19552860,\n", " 'Illinois': 12882135}\n", "population = pd.Series(population_dict)\n", - "population" + "population\n", + "population.index" ] }, { @@ -394,9 +495,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -417,15 +516,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Unlike a dictionary, though, the ``Series`` also supports array-style operations such as slicing:" + "Unlike a dictionary, though, the ``Series`` also supports array-style operations such as (記住,array 的冒號操作稱為...) **slicing** :" ] }, { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -472,9 +569,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -498,15 +593,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "``data`` can be a scalar, which is repeated to fill the specified index:" + "``data`` can be a scalar, which is repeated to fill the specified index:\n", + "\n", + "有點沒想到,但仍在情理之中,寫法也很直白" ] }, { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -536,9 +631,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -562,15 +655,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In each case, the index can be explicitly set if a different result is preferred:" + "In each case, the index can be explicitly set if a different result is preferred:\n", + "\n", + "這樣就變成從 data 中按 index 挑出某些 item 出來" ] }, { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -612,19 +705,19 @@ "metadata": {}, "source": [ "### DataFrame as a generalized NumPy array\n", - "If a ``Series`` is an analog of a one-dimensional array with flexible indices, a ``DataFrame`` is an analog of a two-dimensional array with both flexible row indices and flexible column names.\n", + "If a ``Series`` is an analog of a one-dimensional array with flexible indices, a ``DataFrame`` is an analog of a two-dimensional array **with both flexible row indices and flexible column names**.\n", "Just as you might think of a two-dimensional array as an ordered sequence of aligned one-dimensional columns, you can think of a ``DataFrame`` as a sequence of aligned ``Series`` objects.\n", "Here, by \"aligned\" we mean that they share the same index.\n", "\n", - "To demonstrate this, let's first construct a new ``Series`` listing the area of each of the five states discussed in the previous section:" + "To demonstrate this, let's first construct a new ``Series`` listing the area of each of the five states discussed in the previous section:\n", + "\n", + "上面是 population 這裡是 area 於是就有了兩個 aligned series 了,共用 index 州名" ] }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": false - }, + "execution_count": 22, + "metadata": {}, "outputs": [ { "data": { @@ -637,7 +730,7 @@ "dtype: int64" ] }, - "execution_count": 18, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -653,15 +746,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now that we have this along with the ``population`` Series from before, we can use a dictionary to construct a single two-dimensional object containing this information:" + "Now that we have this along with the ``population`` Series from before, we can use a dictionary to construct a single two-dimensional object containing this information:\n", + "\n", + "到這裡已經有點迷糊了。還好,【比對】是學習的利器:上面用 dictionary 餵給 pd.Series() 分別 dictionary 的 key 變成 index 而 value 變成 data. 此處又用 dictionary 餵給 pd.DataFrame 時,dictionary 的 key 變成 column name 而 dictionary 的 value 則都是 pd.Series, 也就是一豎一豎的 pd.Series 用 dictionary 並起來。\n" ] }, { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -735,9 +828,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -764,9 +855,7 @@ { "cell_type": "code", "execution_count": 21, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -798,15 +887,15 @@ "\n", "Similarly, we can also think of a ``DataFrame`` as a specialization of a dictionary.\n", "Where a dictionary maps a key to a value, a ``DataFrame`` maps a column name to a ``Series`` of column data.\n", - "For example, asking for the ``'area'`` attribute returns the ``Series`` object containing the areas we saw earlier:" + "For example, asking for the ``'area'`` attribute returns the ``Series`` object containing the areas we saw earlier:\n", + "\n", + "豎的 Column 是【大】index; 橫的 index 是【小】index, 照中華習慣合理地由大到小作用,因此 pd.DataFrame 看成是以 column 為 index 的 Series. 注意!我老是一眼把它看成橫的,錯了!要先看成豎的,再細看成橫的。照理說橫豎是對稱的,因此要橫著看也行,要用上 pd.iloc pd.loc. " ] }, { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -859,9 +948,7 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -930,9 +1017,7 @@ { "cell_type": "code", "execution_count": 24, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -994,9 +1079,7 @@ { "cell_type": "code", "execution_count": 25, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1055,9 +1138,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1133,9 +1214,7 @@ { "cell_type": "code", "execution_count": 27, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1194,44 +1273,74 @@ "#### From a NumPy structured array\n", "\n", "We covered structured arrays in [Structured Data: NumPy's Structured Arrays](02.09-Structured-Data-NumPy.ipynb).\n", - "A Pandas ``DataFrame`` operates much like a structured array, and can be created directly from one:" + "A Pandas ``DataFrame`` operates much like a structured array, and can be created directly from one:\n", + "\n", + "這有點煩,原因是 numpy 對所有的表達方式都 support 上了。有點類似前面舉的例子: pd.Series(5,index=[1,2,3]) 也是遷就 index 使 scalar 擴展成 array. 比較意外的是,np.array 也可以有 column name!" ] }, { "cell_type": "code", - "execution_count": 28, - "metadata": { - "collapsed": false - }, + "execution_count": 51, + "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([(0, 0.0), (0, 0.0), (0, 0.0)], \n", + "array([(0, 0.), (0, 0.), (0, 0.)],\n", " dtype=[('A', '\n", + "\n", "
\n", " \n", " \n", @@ -1267,13 +1376,111 @@ "2 0 0.0" ] }, - "execution_count": 29, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "寫出 B 例也花了不少功夫,想像出每種寫法形成的 shape 會怎樣或反之,都很難。\n" + ] + }, + { + "data": { + "text/plain": [ + "array([(1, 2.), (3, 4.), (5, 6.)],\n", + " dtype=[('A', '\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AB
012.0
134.0
256.0
\n", + "
" + ], + "text/plain": [ + " A B\n", + "0 1 2.0\n", + "1 3 4.0\n", + "2 5 6.0" + ] + }, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "這裡必須用 tuple (1,2) 不能用 list [1,2] 看不懂。\n", + "a bytes-like object is required, not 'int'\n" + ] } ], "source": [ - "pd.DataFrame(A)" + "A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])\n", + "A\n", + "A.shape\n", + "A[0]\n", + "A[0][1]\n", + "pd.DataFrame(A)\n", + "\n", + "%f \\ 寫出 B 例也花了不少功夫,想像出每種寫法形成的 shape 會怎樣或反之,都很難。\n", + "B = np.array([(1,2), (3,4), (5,6)], dtype=[('A', 'i8'), ('B', 'f8')])\n", + "B\n", + "pd.DataFrame(B)\n", + "\n", + "%f \\ 這裡必須用 tuple (1,2) 不能用 list [1,2] 看不懂。\n", + "try:\n", + " np.array([[1,2], [3,4], [5,6]], dtype=[('A', 'i8'), ('B', 'f8')])\n", + "except TypeError as e:\n", + " print(e)\n" ] }, { @@ -1291,9 +1498,7 @@ { "cell_type": "code", "execution_count": 30, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1324,9 +1529,7 @@ { "cell_type": "code", "execution_count": 31, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1346,9 +1549,7 @@ { "cell_type": "code", "execution_count": 32, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1375,9 +1576,7 @@ { "cell_type": "code", "execution_count": 33, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1401,9 +1600,7 @@ { "cell_type": "code", "execution_count": 34, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "ename": "TypeError", @@ -1442,9 +1639,7 @@ { "cell_type": "code", "execution_count": 35, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "indA = pd.Index([1, 3, 5, 7, 9])\n", @@ -1454,9 +1649,7 @@ { "cell_type": "code", "execution_count": 36, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1476,9 +1669,7 @@ { "cell_type": "code", "execution_count": 37, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1498,9 +1689,7 @@ { "cell_type": "code", "execution_count": 38, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1550,9 +1739,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.0" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/notebooks/03.02-Data-Indexing-and-Selection.ipynb b/notebooks/03.02-Data-Indexing-and-Selection.ipynb index 416af5063..9302166c6 100644 --- a/notebooks/03.02-Data-Indexing-and-Selection.ipynb +++ b/notebooks/03.02-Data-Indexing-and-Selection.ipynb @@ -1,5 +1,23 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "reDef unknown\n", + "reDef \\\n" + ] + } + ], + "source": [ + "%run peforth.ipynb" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -31,7 +49,7 @@ "metadata": {}, "source": [ "In [Chapter 2](02.00-Introduction-to-NumPy.ipynb), we looked in detail at methods and tools to access, set, and modify values in NumPy arrays.\n", - "These included indexing (e.g., ``arr[2, 1]``), slicing (e.g., ``arr[:, 1:5]``), masking (e.g., ``arr[arr > 0]``), fancy indexing (e.g., ``arr[0, [1, 5]]``), and combinations thereof (e.g., ``arr[:, [1, 5]]``).\n", + "These included indexing 指定 (e.g., ``arr[2, 1]``), slicing 區段 (e.g., ``arr[:, 1:5]``), masking 條件 (e.g., ``arr[arr > 0]``), fancy indexing 列舉 (e.g., ``arr[0, [1, 5]]``), and combinations thereof (e.g., ``arr[:, [1, 5]]``).\n", "Here we'll look at similar means of accessing and modifying values in Pandas ``Series`` and ``DataFrame`` objects.\n", "If you have used the NumPy patterns, the corresponding patterns in Pandas will feel very familiar, though there are a few quirks to be aware of.\n", "\n", @@ -59,36 +77,7 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "reDef unknown\n", - "reDef \\\n" - ] - } - ], - "source": [ - "import peforth\n", - "peforth.dictate(r'''\n", - "\\ Now we redefine the 'unknown' command that was do-nothing by default\n", - ": unknown ( token -- thing Y|N) // Try to find the unknown token in __main__\n", - " py> getattr(sys.modules['__main__'],pop(),\"Ûnknôwn\") \n", - " py> str(tos())==\"Ûnknôwn\" if drop false else true then ;\n", - " /// here after, when FORTH come accross an unknown token, instead of alerting \n", - " /// it try to find the token in python __main__ module name space.\n", - "\n", - "\\ Redefine \\ command to print the comment line\n", - "code \\ print(nexttoken('\\n')) end-code \n", - "''');\n" - ] - }, - { - "cell_type": "code", - "execution_count": 6, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -101,7 +90,7 @@ "dtype: float64" ] }, - "execution_count": 6, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -115,7 +104,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -124,7 +113,7 @@ "0.5" ] }, - "execution_count": 2, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -142,7 +131,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -151,7 +140,7 @@ "True" ] }, - "execution_count": 3, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -162,7 +151,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -171,13 +160,24 @@ "Index(['a', 'b', 'c', 'd'], dtype='object')" ] }, - "execution_count": 4, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "Index(['a', 'b', 'c', 'd'], dtype='object')" + ] + }, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "data.keys()" + "data.keys()\n", + "data.index" ] }, { @@ -210,7 +210,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -224,14 +224,44 @@ "dtype: float64" ] }, - "execution_count": 6, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "改 value 可以,任意增減改 index 就不行,因為 pd.index 是 immutable\n" + ] + }, + { + "data": { + "text/plain": [ + "'a'" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index does not support mutable operations\n" + ] } ], "source": [ "data['e'] = 1.25\n", - "data" + "data\n", + "%f \\ 改 value 可以,任意增減改 index 就不行,因為 pd.index 是 immutable\n", + "data.index[0]\n", + "try:\n", + " data.index[0]='aa'\n", + "except TypeError as e:\n", + " print(e)" ] }, { @@ -277,7 +307,14 @@ ], "source": [ "# slicing by explicit index\n", - "data['a':'c']" + "data['a':'c'] # 有含 'c' !!!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "所以實際上 pd.Series 有兩套 index : implicit index `.iloc()`, 跟 explicit keys() `.loc()`" ] }, { @@ -300,7 +337,7 @@ ], "source": [ "# slicing by implicit integer index\n", - "data[0:2]" + "data[0:2] # 不含 2 !!! " ] }, { @@ -345,7 +382,7 @@ } ], "source": [ - "# fancy indexing\n", + "# fancy indexing 所謂 fancy indexing 就是用 list 列舉你要的 index \n", "data[['a', 'e']]" ] }, @@ -369,7 +406,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -381,7 +418,7 @@ "dtype: object" ] }, - "execution_count": 11, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -393,7 +430,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -402,7 +439,7 @@ "'a'" ] }, - "execution_count": 12, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -414,7 +451,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -425,7 +462,7 @@ "dtype: object" ] }, - "execution_count": 13, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -447,7 +484,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -456,13 +493,26 @@ "'a'" ] }, - "execution_count": 14, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "data :> keys() tib. \\ ==> Int64Index([1, 3, 5], dtype='int64') ()\n", + "1 a\n", + "3 b\n", + "5 c\n", + "dtype: object\n" + ] } ], "source": [ - "data.loc[1]" + "data.loc[1] # 這個 1 是 data.keys() 的 1\n", + "%f data :> keys() tib.\n", + "%f data . cr" ] }, { @@ -496,7 +546,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -505,13 +555,13 @@ "'b'" ] }, - "execution_count": 16, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "data.iloc[1]" + "data.iloc[1] # 這個 1 是內部 index 的 1" ] }, { @@ -1393,7 +1443,9 @@ "Keep in mind that for integer indices, the ``ix`` indexer is subject to the same potential sources of confusion as discussed for integer-indexed ``Series`` objects. 我有看到警告說 .ix will be deprecated soon 的確,混和的結果又把兩種 style 的模糊性帶回來了!\n", "\n", "Any of the familiar NumPy-style data access patterns can be used within these indexers.\n", - "For example, in the ``loc`` indexer we can combine masking and fancy indexing as in the following:" + "For example, in the ``loc`` indexer we can combine masking and fancy indexing as in the following:\n", + "\n", + "所謂 fancy indexing 就是用 list 列舉你要的 index " ] }, { @@ -1534,7 +1586,7 @@ ], "source": [ "%f \\ column indexing 一定要放在逗點之後,\n", - "%f \\ . . . 或者要用 fancy indexing 的形式—放在 list 裡面\n", + "%f \\ . . . 或者要用 fancy indexing 的形式—放在 list 裡面列舉你要的 index\n", "data[['pop','density']][data.density > 100]" ] }, @@ -2000,7 +2052,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.1" + "version": "3.6.0" } }, "nbformat": 4, diff --git a/notebooks/Untitled.ipynb b/notebooks/Playground.ipynb similarity index 74% rename from notebooks/Untitled.ipynb rename to notebooks/Playground.ipynb index dba4fdaee..aa5b2c329 100644 --- a/notebooks/Untitled.ipynb +++ b/notebooks/Playground.ipynb @@ -2,7 +2,16 @@ "cells": [ { "cell_type": "code", - "execution_count": 3, + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -31,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -40,9 +49,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "UsageError: Missing filename, URL, input history range, macro, or element in the user namespace.\n" + ] + } + ], "source": [ "%load " ] @@ -115,15 +132,6 @@ "\n" ] }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np" - ] - }, { "cell_type": "code", "execution_count": 16, @@ -266,6 +274,126 @@ "x[:,np.newaxis][:,np.newaxis]\n", "x[:,np.newaxis][:,np.newaxis][:,np.newaxis]" ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "普通 array 沒有這種能力,這叫 numpy 的啥能力.... 稱作 ufunc (Universal Functions)\n" + ] + }, + { + "data": { + "text/plain": [ + "['a', 'b', 'c', 'd', 'e', 'f']" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "'>' not supported between instances of 'list' and 'str'\n" + ] + } + ], + "source": [ + "# 憑印象 try mask \n", + "%f \\ 普通 array 沒有這種能力,這叫 numpy 的啥能力.... 稱作 ufunc (Universal Functions)\n", + "a = [c for c in \"abcdef\"] \n", + "a\n", + "try:\n", + " a > 'd'\n", + "except TypeError as e:\n", + " print(e)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ True, True, True, False, False, False], dtype=bool)" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "array(['a', 'b', 'c'],\n", + " dtype='" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Now we redefine the 'unknown' command that was do-nothing by default\n", + "reDef unknown\n", + "Redefine \\ command to print the comment line\n", + "reDef \\\n" + ] + } + ], + "source": [ + "%run peforth.ipynb" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "%run?" + ] } ], "metadata": { diff --git a/notebooks/peforth.ipynb b/notebooks/peforth.ipynb new file mode 100644 index 000000000..3b728ce4c --- /dev/null +++ b/notebooks/peforth.ipynb @@ -0,0 +1,45 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import peforth\n", + "peforth.dictate(r'''\n", + "\\ Now we redefine the 'unknown' command that was do-nothing by default\n", + ": unknown ( token -- thing Y|N) // Try to find the unknown token in __main__\n", + " py> getattr(sys.modules['__main__'],pop(),\"Ûnknôwn\") \n", + " py> str(tos())==\"Ûnknôwn\" if drop false else true then ;\n", + " /// here after, when FORTH come accross an unknown token, instead of alerting \n", + " /// it try to find the token in python __main__ module name space.\n", + "\n", + "\\ Redefine \\ command to print the comment line\n", + "code \\ print(nexttoken('\\n')) end-code \n", + "''');\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 37b5c3d37c2f3459cda38a13c6f70f74add5ab5c Mon Sep 17 00:00:00 2001 From: "H.C. Chen" Date: Fri, 3 Aug 2018 19:03:49 +0800 Subject: [PATCH 10/13] 8/3 --- .../03.02-Data-Indexing-and-Selection.ipynb | 301 ++++--- notebooks/03.03-Operations-in-Pandas.ipynb | 613 +++++++++----- notebooks/03.04-Missing-Values.ipynb | 19 +- notebooks/03.05-Hierarchical-Indexing.ipynb | 13 +- notebooks/03.06-Concat-And-Append.ipynb | 326 +++++-- notebooks/03.07-Merge-and-Join.ipynb | 786 +++++++++++++++-- notebooks/Index.ipynb | 4 +- notebooks/Playground.ipynb | 797 +++++++++++++----- 8 files changed, 2176 insertions(+), 683 deletions(-) diff --git a/notebooks/03.02-Data-Indexing-and-Selection.ipynb b/notebooks/03.02-Data-Indexing-and-Selection.ipynb index 9302166c6..40e3fd5e9 100644 --- a/notebooks/03.02-Data-Indexing-and-Selection.ipynb +++ b/notebooks/03.02-Data-Indexing-and-Selection.ipynb @@ -2,14 +2,16 @@ "cells": [ { "cell_type": "code", - "execution_count": 7, + "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ + "Now we redefine the 'unknown' command that was do-nothing by default\n", "reDef unknown\n", + "Redefine \\ command to print the comment line\n", "reDef \\\n" ] } @@ -77,7 +79,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 30, "metadata": {}, "outputs": [ { @@ -90,7 +92,7 @@ "dtype: float64" ] }, - "execution_count": 9, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -314,7 +316,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "所以實際上 pd.Series 有兩套 index : implicit index `.iloc()`, 跟 explicit keys() `.loc()`" + "所以實際上 pd.Series 有兩套 index : implicit index `.iloc[]` or positional indexing, 跟 explicit keys() `.loc[]` or label based indexing" ] }, { @@ -619,7 +621,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 37, "metadata": {}, "outputs": [ { @@ -686,7 +688,7 @@ "Texas 695662 26448193" ] }, - "execution_count": 8, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } @@ -710,7 +712,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 38, "metadata": {}, "outputs": [ { @@ -724,7 +726,7 @@ "Name: area, dtype: int64" ] }, - "execution_count": 19, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -742,7 +744,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 39, "metadata": {}, "outputs": [ { @@ -756,7 +758,7 @@ "Name: area, dtype: int64" ] }, - "execution_count": 20, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -774,7 +776,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 40, "metadata": {}, "outputs": [ { @@ -783,7 +785,7 @@ "True" ] }, - "execution_count": 21, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -796,6 +798,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "#### 前者 `data.area` 是 object 的 attribute-style 而後者 `data['area']` 是 dictionary-style, 當然後者比較周延。 \n", "Though this is a useful shorthand, keep in mind that it does not work for all cases!\n", "For example, if the column names are not strings, or if the column names conflict with methods of the ``DataFrame``, this **attribute-style** access is not possible.\n", "For example, the ``DataFrame`` has a ``pop()`` method, so ``data.pop`` will point to this rather than the ``\"pop\"`` column:" @@ -803,7 +806,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 41, "metadata": {}, "outputs": [ { @@ -812,7 +815,7 @@ "False" ] }, - "execution_count": 22, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } @@ -825,14 +828,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In particular, you should avoid the temptation to try column assignment via attribute (i.e., use ``data['pop'] = z`` rather than ``data.pop = z``).\n", + "In particular, you should avoid the temptation to try column assignment via attribute (i.e., use ``data['pop'] = z`` rather than ``data.pop = z``). 後者跑進 data object 的 attribute 裡去了(而非 DataFrame),故無警告!\n", "\n", - "Like with the ``Series`` objects discussed earlier, this **dictionary-style** syntax can also be used to modify the object, in this case adding a new column:" + "Like with the ``Series`` objects discussed earlier, this **dictionary-style** syntax can also be used to modify the object, in this case adding a new column:\n", + "\n", + "下面這個 ufunc 的應用完勝 excel" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 43, "metadata": {}, "outputs": [ { @@ -905,7 +910,7 @@ "Texas 695662 26448193 38.018740" ] }, - "execution_count": 9, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } @@ -915,11 +920,108 @@ "data" ] }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "California 90.413926\n", + "Florida 114.806121\n", + "Illinois 85.883763\n", + "New York 139.076746\n", + "Texas 38.018740\n", + "dtype: float64" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
areapop
California42396738332521
Florida17031219552860
Illinois14999512882135
New York14129719651127
Texas69566226448193
\n", + "
" + ], + "text/plain": [ + " area pop\n", + "California 423967 38332521\n", + "Florida 170312 19552860\n", + "Illinois 149995 12882135\n", + "New York 141297 19651127\n", + "Texas 695662 26448193" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 用 attribute-style 加上的新 member 就是跑進 data object 的 attribute 裡去了\n", + "#,而非所願的 DataFrame, 不在 DataFrame 裡! \n", + "data.test = data['pop'] / data['area']\n", + "data.test\n", + "data" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ - "This shows a preview of the straightforward syntax of element-by-element arithmetic between ``Series`` objects; we'll dig into this further in [Operating on Data in Pandas](03.03-Operations-in-Pandas.ipynb)." + "This shows a preview of the straightforward syntax of element-by-element arithmetic (我看就是 ufunc Universal Function) between ``Series`` objects; we'll dig into this further in [Operating on Data in Pandas](03.03-Operations-in-Pandas.ipynb)." ] }, { @@ -934,22 +1036,9 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 44, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[ 4.23967000e+05 3.83325210e+07 9.04139261e+01]\n", - " [ 1.70312000e+05 1.95528600e+07 1.14806121e+02]\n", - " [ 1.49995000e+05 1.28821350e+07 8.58837628e+01]\n", - " [ 1.41297000e+05 1.96511270e+07 1.39076746e+02]\n", - " [ 6.95662000e+05 2.64481930e+07 3.80187404e+01]]\n", - "data :> values type tib. \\ ==> ()\n", - ".values 就是個簡單的 ndarray\n" - ] - }, { "data": { "text/plain": [ @@ -960,15 +1049,22 @@ " [ 6.95662000e+05, 2.64481930e+07, 3.80187404e+01]])" ] }, - "execution_count": 21, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "data :> values type tib. \\ ==> ()\n", + ".values 就是個簡單的 ndarray\n" + ] } ], "source": [ - "%f data :> values . cr\n", - "%f data :> values type tib. \\ .values 就是個簡單的 ndarray\n", - "data.values" + "data.values\n", + "%f data :> values type tib. \\ .values 就是個簡單的 ndarray" ] }, { @@ -976,7 +1072,7 @@ "metadata": {}, "source": [ "With this picture in mind, many familiar array-like observations can be done on the ``DataFrame`` itself.\n", - "For example, we can transpose the full ``DataFrame`` to swap rows and columns:" + "For example, we can transpose (轉置矩陣) the full ``DataFrame`` to swap rows and columns:" ] }, { @@ -1062,7 +1158,7 @@ } ], "source": [ - "%f \\ .T 之後,應該還是個 df \n", + "%f \\ .T transpose 之後,應該還是個 df \n", "%f data :> T type tib.\n", "data.T " ] @@ -1071,7 +1167,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "When it comes to indexing of ``DataFrame`` objects, however, it is clear that the dictionary-style indexing of columns precludes our ability to simply treat it as a NumPy array.\n", + "When it comes to indexing of ``DataFrame`` objects, however, it is clear that the dictionary-style indexing of columns precludes 妨礙 our ability to simply treat it as a NumPy array.\n", "In particular, passing a single index to an array accesses a row:" ] }, @@ -1099,7 +1195,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "and passing a single \"index\" to a ``DataFrame`` accesses a column:" + "and passing a single \"index\" to a ``DataFrame`` accesses a column:\n", + "\n", + "我就說嘛!我就說嘛!DataFrame 的 indexing 變成是 column 導向,而非習慣上 Matrix 的 Row 導向。因為 DataFrame 是 pd.Series 組成的 dictionary, 他的最上層 index 指的是 column 也就是 Series. data.loc 就是把他又變回 array or matrix 的習慣, 因此 `data[n]`\n", + "與 `data.loc(n)`,`data.iloc(n)` 之間是視角上的不同。" ] }, { @@ -1440,17 +1539,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Keep in mind that for integer indices, the ``ix`` indexer is subject to the same potential sources of confusion as discussed for integer-indexed ``Series`` objects. 我有看到警告說 .ix will be deprecated soon 的確,混和的結果又把兩種 style 的模糊性帶回來了!\n", + "Keep in mind that for integer indices, the ``ix`` indexer is subject to the same potential sources of confusion as discussed for integer-indexed ``Series`` objects. \n", + "\n", + "我有看到警告說 .ix will be deprecated soon 的確,混和的結果又把兩種 style 的模糊性帶回來了!\n", "\n", - "Any of the familiar NumPy-style data access patterns can be used within these indexers.\n", - "For example, in the ``loc`` indexer we can combine masking and fancy indexing as in the following:\n", + "Any of the familiar NumPy-style data access patterns can be used within these indexers. For example, in the ``loc`` indexer we can combine masking and fancy indexing as in the following:\n", "\n", "所謂 fancy indexing 就是用 list 列舉你要的 index " ] }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 45, "metadata": { "scrolled": true }, @@ -1459,7 +1559,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "column indexing 一定要放在逗點之後\n" + "因為 .loc 的視角 column indexing 要放在逗點之後\n" ] }, { @@ -1508,19 +1608,19 @@ "New York 19651127 139.076746" ] }, - "execution_count": 71, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "%f \\ column indexing 一定要放在逗點之後\n", + "%f \\ 因為 .loc 的視角 column indexing 要放在逗點之後\n", "data.loc[data.density > 100, ['pop', 'density']]" ] }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 51, "metadata": { "scrolled": false }, @@ -1529,8 +1629,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "column indexing 一定要放在逗點之後,\n", - ". . . 或者要用 fancy indexing 的形式—放在 list 裡面\n" + "先用 fancy indexing 的形式—放在 list 裡面列舉你要的 index (columns) 結果是個 df\n", + "然後再 mask, 針對某 column 取 mask 結果是撈出某些 row 出來\n" ] }, { @@ -1579,20 +1679,29 @@ "New York 19651127 139.076746" ] }, - "execution_count": 73, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "_ type tib. \\ ==> ()\n", + "撈出來還是 data-frame\n" + ] } ], "source": [ - "%f \\ column indexing 一定要放在逗點之後,\n", - "%f \\ . . . 或者要用 fancy indexing 的形式—放在 list 裡面列舉你要的 index\n", - "data[['pop','density']][data.density > 100]" + "%f \\ 先用 fancy indexing 的形式—放在 list 裡面列舉你要的 index (columns) 結果是個 df\n", + "%f \\ 然後再 mask, 針對某 column 取 mask 結果是撈出某些 row 出來\n", + "data[['pop','density']][data.density > 100]\n", + "%f _ type tib. \\ 撈出來還是 data-frame " ] }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 55, "metadata": { "scrolled": false }, @@ -1653,14 +1762,15 @@ "New York 141297 19651127 139.076746" ] }, - "execution_count": 80, + "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%f \\ 我發現這個問題!下面這個尾巴 .loc[:'亂寫也行'] 等於是 [:] 取所有的 rows !!\n", - "data.loc[data.density > 100].loc[:'area']" + "data.loc[data.density > 100].loc[:'area'] \n", + "# .loc[] 是 row oriented 故 'area' 不存在,以上 .loc[:'area'] 等於 .loc[:] 全部!" ] }, { @@ -1672,13 +1782,26 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "\n", " \n", " \n", @@ -1693,7 +1816,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -1725,20 +1848,20 @@ ], "text/plain": [ " area pop density\n", - "California 423967 38332521 90.000000\n", + "California 423967 38332521 93.000000\n", "Florida 170312 19552860 114.806121\n", "Illinois 149995 12882135 85.883763\n", "New York 141297 19651127 139.076746\n", "Texas 695662 26448193 38.018740" ] }, - "execution_count": 32, + "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "data.iloc[0, 2] = 90\n", + "data.iloc[0, 2] = 93\n", "data" ] }, @@ -1761,9 +1884,16 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 60, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "原版 df[] 不經 .loc[] 的 slicing refers to rows 這真的很奇怪。\n" + ] + }, { "data": { "text/html": [ @@ -1813,73 +1943,56 @@ "Illinois 149995 12882135 85.883763" ] }, - "execution_count": 25, + "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "%f \\ slicing refers to rows\n", + "%f \\ 原版 df[] 不經 .loc[] 的 slicing refers to rows 這真的很奇怪。\n", "data['Florida':'Illinois']" ] }, { "cell_type": "code", - "execution_count": 98, + "execution_count": 62, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "indexing refers to columns 這句話沒講清楚,\n", - "California 90.413926\n", + "df[] indexing refers to columns 這是 data-frame 的基本視角\n", + "California 93.000000\n", "Florida 114.806121\n", "Illinois 85.883763\n", "New York 139.076746\n", "Texas 38.018740\n", "Name: density, dtype: float64\n", "以上當認得時,先被 df 拿去當作 column name 處理了,所以 OK\n", - "注意!*當作 column-wise 處理* 不是很奇怪嗎?特別是與上面的 slicing 對照起來看。\n", - "所以作者要強調它,反正就是這樣!\n", "\n", "Failed in (compiling=False): 'lalalala'\n", "Body:\n", "push(pop()['lalalala'])\n", - "以上當 key 不認得時,讓 python 接著處裡就出錯了!\n" + "以上當 key 不認得時,除非是 slicing 否則就出錯了!\n" ] - }, - { - "data": { - "text/plain": [ - "California 90.413926\n", - "Florida 114.806121\n", - "Illinois 85.883763\n", - "New York 139.076746\n", - "Texas 38.018740\n", - "Name: density, dtype: float64" - ] - }, - "execution_count": 98, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "%f \\ indexing refers to columns 這句話沒講清楚,\n", - "%f data :> ['density'] . cr \\ 以上當認得時,先被 df 拿去當作 column name 處理了,所以 OK\n", - "%f \\ 注意!*當作 column-wise 處理* 不是很奇怪嗎?特別是與上面的 slicing 對照起來看。\n", - "%f \\ 所以作者要強調它,反正就是這樣!\n", + "%f \\ df[] indexing refers to columns 這是 data-frame 的基本視角\n", + "%f data :> ['density'] . cr \n", + "%f \\ 以上當認得時,先被 df 拿去當作 column name 處理了,所以 OK\n", "%f data :> ['lalalala'] . cr \n", - "%f \\ 以上當 key 不認得時,讓 python 接著處裡就出錯了!\n", - "data['density']" + "%f \\ 以上當 key 不認得時,除非是 slicing 否則就出錯了!" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Such slices can also refer to rows by number rather than by index:" + "Such slices can also refer to rows by number rather than by index:\n", + "\n", + "這是最奇特的地方了! slices refer to rows " ] }, { diff --git a/notebooks/03.03-Operations-in-Pandas.ipynb b/notebooks/03.03-Operations-in-Pandas.ipynb index 6a0806463..310104343 100644 --- a/notebooks/03.03-Operations-in-Pandas.ipynb +++ b/notebooks/03.03-Operations-in-Pandas.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -15,18 +15,7 @@ } ], "source": [ - "import peforth\n", - "peforth.dictate(r'''\n", - "\\ Now we redefine the 'unknown' command that was do-nothing by default\n", - ": unknown ( token -- thing Y|N) // Try to find the unknown token in __main__\n", - " py> getattr(sys.modules['__main__'],pop(),\"Ûnknôwn\") \n", - " py> str(tos())==\"Ûnknôwn\" if drop false else true then ;\n", - " /// here after, when FORTH come accross an unknown token, instead of alerting \n", - " /// it try to find the token in python __main__ module name space.\n", - "\n", - "\\ Redefine \\ command to print the comment line\n", - "code \\ print(nexttoken('\\n')) end-code \n", - "''');\n" + "%run peforth.ipynb" ] }, { @@ -62,7 +51,7 @@ "One of the essential pieces of NumPy is the ability to perform quick element-wise operations, both with basic arithmetic (addition, subtraction, multiplication, etc.) and with more sophisticated operations (trigonometric functions, exponential and logarithmic functions, etc.).\n", "Pandas inherits much of this functionality from NumPy, and the ufuncs that we introduced in [Computation on NumPy Arrays: Universal Functions](02.03-Computation-on-arrays-ufuncs.ipynb) are key to this.\n", "\n", - "Pandas includes a couple useful twists, however: for unary operations like negation and trigonometric functions, these ufuncs will *preserve index and column labels* in the output, and for binary operations such as addition and multiplication, Pandas will automatically *align indices* when passing the objects to the ufunc. 不知所云,往下看吧。。。\n", + "Pandas includes a couple useful twists, however: for unary operations like negation and trigonometric functions, these ufuncs will *preserve index and column labels* in the output, and for binary operations such as addition and multiplication, Pandas will automatically *align indices* when passing the objects to the ufunc. 兩元運算,餵兩個 data-frame 給它,會自動依 index 對齊\n", "\n", "This means that keeping the context of data and combining data from different sources–both potentially error-prone tasks with raw NumPy arrays–become essentially foolproof ones with Pandas.\n", "We will additionally see that there are well-defined operations between one-dimensional ``Series`` structures and two-dimensional ``DataFrame`` structures." @@ -80,7 +69,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -90,7 +79,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -103,21 +92,108 @@ "dtype: int32" ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "rng = np.random.RandomState(42) named # 42 是 seed, 取得產生器 rng.\n", - "ser = pd.Series(rng.randint(0, 10, 4)) # ser 就是個 fancy array\n", + "rng = np.random.RandomState(42) # 42 是 seed, 取得產生器 rng.\n", + "ser = pd.Series(rng.randint(0, 10, 4)) # ser 就是個 Series, 0-10 之間取 4 個亂數\n", "ser" ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": {}, + "execution_count": 11, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "複習一下指定 columns 與 index\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "
California4239673833252190.00000093.000000
Florida
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCD
A3173
B1559
C3519
\n", + "
" + ], + "text/plain": [ + " A B C D\n", + "A 3 1 7 3\n", + "B 1 5 5 9\n", + "C 3 5 1 9" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%f \\ 複習一下指定 columns 與 index \n", + "df = pd.DataFrame(rng.randint(0, 10, (3, 4)), columns=['A', 'B', 'C', 'D'], index=['A', 'B', 'C'])\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "scrolled": true + }, "outputs": [ { "data": { @@ -149,24 +225,24 @@ " \n", " \n", " 0\n", - " 6\n", + " 1\n", " 9\n", - " 2\n", - " 6\n", + " 3\n", + " 7\n", " \n", " \n", " 1\n", + " 6\n", + " 8\n", " 7\n", " 4\n", - " 3\n", - " 7\n", " \n", " \n", " 2\n", - " 7\n", - " 2\n", - " 5\n", + " 1\n", " 4\n", + " 7\n", + " 9\n", " \n", " \n", "\n", @@ -174,12 +250,12 @@ ], "text/plain": [ " A B C D\n", - "0 6 9 2 6\n", - "1 7 4 3 7\n", - "2 7 2 5 4" + "0 1 9 3 7\n", + "1 6 8 7 4\n", + "2 1 4 7 9" ] }, - "execution_count": 9, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -199,19 +275,22 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 17, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "0 6\n", - "1 3\n", - "2 7\n", - "3 4\n", - "dtype: int32\n" - ] + "data": { + "text/plain": [ + "0 6\n", + "1 3\n", + "2 7\n", + "3 4\n", + "dtype: int32" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" }, { "data": { @@ -223,14 +302,23 @@ "dtype: float64" ] }, - "execution_count": 7, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "結果還是個 pd.Series\n" + ] } ], "source": [ - "%f ser . cr\n", - "np.exp(ser)" + "ser\n", + "np.exp(ser)\n", + "%f _ type . cr \\ 結果還是個 pd.Series" ] }, { @@ -242,18 +330,72 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 20, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - " A B C D\n", - "0 6 9 2 6\n", - "1 7 4 3 7\n", - "2 7 2 5 4\n" - ] + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCD
01937
16874
21479
\n", + "
" + ], + "text/plain": [ + " A B C D\n", + "0 1 9 3 7\n", + "1 6 8 7 4\n", + "2 1 4 7 9" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" }, { "data": { @@ -285,24 +427,24 @@ " \n", " \n", " 0\n", - " -1.000000\n", + " 0.707107\n", " 7.071068e-01\n", - " 1.000000\n", - " -1.000000e+00\n", + " 0.707107\n", + " -7.071068e-01\n", " \n", " \n", " 1\n", + " -1.000000\n", + " -2.449294e-16\n", " -0.707107\n", " 1.224647e-16\n", - " 0.707107\n", - " -7.071068e-01\n", " \n", " \n", " 2\n", - " -0.707107\n", - " 1.000000e+00\n", - " -0.707107\n", + " 0.707107\n", " 1.224647e-16\n", + " -0.707107\n", + " 7.071068e-01\n", " \n", " \n", "\n", @@ -310,19 +452,28 @@ ], "text/plain": [ " A B C D\n", - "0 -1.000000 7.071068e-01 1.000000 -1.000000e+00\n", - "1 -0.707107 1.224647e-16 0.707107 -7.071068e-01\n", - "2 -0.707107 1.000000e+00 -0.707107 1.224647e-16" + "0 0.707107 7.071068e-01 0.707107 -7.071068e-01\n", + "1 -1.000000 -2.449294e-16 -0.707107 1.224647e-16\n", + "2 0.707107 1.224647e-16 -0.707107 7.071068e-01" ] }, - "execution_count": 10, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "結果仍然是個 DataFrame, 用法就是把 pd.df 當成一個變數來用就對了。\n" + ] } ], "source": [ - "%f df . cr\n", - "np.sin(df * np.pi / 4)" + "df\n", + "np.sin(df * np.pi / 4)\n", + "%f _ type . cr \\ 結果仍然是個 DataFrame, 用法就是把 pd.df 當成一個變數來用就對了。" ] }, { @@ -403,7 +554,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -412,18 +563,27 @@ "Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')" ] }, - "execution_count": 23, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "_ type tib. \\ ==> ()\n", + "結果還是 pd.Index\n" + ] } ], "source": [ - "area.index | population.index" + "area.index | population.index\n", + "%f _ type tib. \\ 結果還是 pd.Index " ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -431,7 +591,7 @@ "output_type": "stream", "text": [ "我都還不知道 set 運算可以這樣做\n", - "{1, 2, 3, 4, 5}\n", + "s1 s2 OR tib. \\ ==> {1, 2, 3, 4, 5} ()\n", "peforth 的 OR 就是 python 的 bitwise | 運算\n", "如過這樣 list(s1) | list(s2) 就是 TypeError: unsupported operand types\n" ] @@ -442,7 +602,7 @@ "{1, 2, 3, 4, 5}" ] }, - "execution_count": 24, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -451,8 +611,8 @@ "%f \\ 我都還不知道 set 運算可以這樣做\n", "s1 = {1,2,4}\n", "s2 = {1,3,5}\n", - "%f s1 s2 OR . cr \\ peforth 的 OR 就是 python 的 bitwise | 運算\n", - "%f \\ 如過這樣 list(s1) | list(s2) 就是 TypeError: unsupported operand types\n", + "%f s1 s2 OR tib. \\ peforth 的 OR 就是 python 的 bitwise | 運算\n", + "%f \\ 如果這樣 list(s1) | list(s2) 就是 TypeError: unsupported operand types\n", "s1 | s2" ] }, @@ -533,7 +693,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -564,25 +724,25 @@ " \n", " \n", " 0\n", - " 1\n", - " 11\n", + " 12\n", + " 8\n", " \n", " \n", " 1\n", - " 5\n", - " 1\n", + " 14\n", + " 12\n", " \n", " \n", "\n", "
" ], "text/plain": [ - " A B\n", - "0 1 11\n", - "1 5 1" + " A B\n", + "0 12 8\n", + "1 14 12" ] }, - "execution_count": 27, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -595,7 +755,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -627,21 +787,21 @@ " \n", " \n", " 0\n", - " 4\n", " 0\n", - " 9\n", + " 8\n", + " 6\n", " \n", " \n", " 1\n", - " 5\n", " 8\n", + " 7\n", " 0\n", " \n", " \n", " 2\n", - " 9\n", + " 7\n", + " 7\n", " 2\n", - " 6\n", " \n", " \n", "\n", @@ -649,12 +809,12 @@ ], "text/plain": [ " B A C\n", - "0 4 0 9\n", - "1 5 8 0\n", - "2 9 2 6" + "0 0 8 6\n", + "1 8 7 0\n", + "2 7 7 2" ] }, - "execution_count": 28, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -667,7 +827,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 30, "metadata": {}, "outputs": [ { @@ -699,14 +859,14 @@ " \n", " \n", " 0\n", - " 1.0\n", - " 15.0\n", + " 20.0\n", + " 8.0\n", " NaN\n", " \n", " \n", " 1\n", - " 13.0\n", - " 6.0\n", + " 21.0\n", + " 20.0\n", " NaN\n", " \n", " \n", @@ -721,12 +881,12 @@ ], "text/plain": [ " A B C\n", - "0 1.0 15.0 NaN\n", - "1 13.0 6.0 NaN\n", + "0 20.0 8.0 NaN\n", + "1 21.0 20.0 NaN\n", "2 NaN NaN NaN" ] }, - "execution_count": 29, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -744,6 +904,58 @@ "Here we'll fill with the mean of all values in ``A`` (computed by first stacking the rows of ``A``):" ] }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "A type tib. \\ ==> ()\n" + ] + }, + { + "data": { + "text/plain": [ + "0 A 12\n", + " B 8\n", + "1 A 14\n", + " B 12\n", + "dtype: int32" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "_ type tib. \\ ==> ()\n", + ".stack() 把原來的 data-frame 疊成 Series\n" + ] + }, + { + "data": { + "text/plain": [ + "11.5" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%f A type tib.\n", + "A.stack()\n", + "%f _ type tib. \\ .stack() 把原來的 data-frame 疊成 Series\n", + "A.stack().mean()" + ] + }, { "cell_type": "code", "execution_count": 30, @@ -845,18 +1057,18 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([[3, 8, 2, 4],\n", - " [2, 6, 4, 8],\n", - " [6, 1, 3, 8]])" + "array([[0, 7, 2, 2],\n", + " [0, 4, 9, 6],\n", + " [9, 8, 6, 8]])" ] }, - "execution_count": 31, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } @@ -868,24 +1080,32 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0, 0, 0, 0],\n", - " [-1, -2, 2, 4],\n", - " [ 3, -7, 1, 4]])" + " [ 0, -3, 7, 4],\n", + " [ 9, 1, 4, 6]])" ] }, - "execution_count": 32, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2D - 1D 這是 broadcasting\n" + ] } ], "source": [ - "A - A[0]" + "A - A[0] \n", + "%f \\ 2D - 1D 這是 broadcasting " ] }, { @@ -899,77 +1119,31 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 41, "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
QRST
00000
1-1-224
23-714
\n", - "
" - ], - "text/plain": [ - " Q R S T\n", - "0 0 0 0 0\n", - "1 -1 -2 2 4\n", - "2 3 -7 1 4" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + " Q R S T\n", + "0 0 7 2 2\n", + "1 0 4 9 6\n", + "2 9 8 6 8\n", + "原來的 data-frame\n", + "\n", + " Q R S T\n", + "0 0 0 0 0\n", + "1 0 -3 7 4\n", + "2 9 1 4 6\n", + "相減\n" + ] } ], "source": [ "df = pd.DataFrame(A, columns=list('QRST'))\n", - "df - df.iloc[0]" + "%f df . cr \\ 原來的 data-frame\n", + "%f cr df df :> iloc[0] - . cr \\ 相減" ] }, { @@ -981,7 +1155,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 43, "metadata": {}, "outputs": [ { @@ -989,13 +1163,13 @@ "output_type": "stream", "text": [ " Q R S T\n", - "0 3 8 2 4\n", - "1 2 6 4 8\n", - "2 6 1 3 8\n", + "0 0 7 2 2\n", + "1 0 4 9 6\n", + "2 9 8 6 8\n", "\n", - "0 8\n", - "1 6\n", - "2 1\n", + "0 7\n", + "1 4\n", + "2 8\n", "Name: R, dtype: int32\n" ] }, @@ -1029,24 +1203,24 @@ " \n", " \n", " 0\n", - " -5\n", + " -7\n", " 0\n", - " -6\n", - " -4\n", + " -5\n", + " -5\n", " \n", " \n", " 1\n", " -4\n", " 0\n", - " -2\n", + " 5\n", " 2\n", " \n", " \n", " 2\n", - " 5\n", + " 1\n", + " 0\n", + " -2\n", " 0\n", - " 2\n", - " 7\n", " \n", " \n", "\n", @@ -1054,12 +1228,12 @@ ], "text/plain": [ " Q R S T\n", - "0 -5 0 -6 -4\n", - "1 -4 0 -2 2\n", - "2 5 0 2 7" + "0 -7 0 -5 -5\n", + "1 -4 0 5 2\n", + "2 1 0 -2 0" ] }, - "execution_count": 38, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } @@ -1079,30 +1253,31 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Q 3\n", + "Q 0\n", "S 2\n", "Name: 0, dtype: int32" ] }, - "execution_count": 39, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "halfrow = df.iloc[0, ::2]\n", + "halfrow = df.iloc[0, ::2] # 第一個 : 前後是空的,表示【所有的 columns】, 第二個 : 之後的 2 表示 'every other one' \n", + " # 結果就是 row 0, column 0 2 4 . . . etc\n", "halfrow" ] }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 49, "metadata": {}, "outputs": [ { @@ -1110,11 +1285,10 @@ "output_type": "stream", "text": [ " Q R S T\n", - "0 3 8 2 4\n", - "1 2 6 4 8\n", - "2 6 1 3 8\n", - "\n", - "只針對 Q S 兩欄做 broadcasting, 所以 R T 欄都變成 NaN\n" + "0 0 7 2 2\n", + "1 0 4 9 6\n", + "2 9 8 6 8\n", + "以上是 df, halfrow 就在上一 cell\n" ] }, { @@ -1154,16 +1328,16 @@ " \n", " \n", " 1\n", - " -1.0\n", + " 0.0\n", " NaN\n", - " 2.0\n", + " 7.0\n", " NaN\n", " \n", " \n", " 2\n", - " 3.0\n", + " 9.0\n", " NaN\n", - " 1.0\n", + " 4.0\n", " NaN\n", " \n", " \n", @@ -1173,19 +1347,26 @@ "text/plain": [ " Q R S T\n", "0 0.0 NaN 0.0 NaN\n", - "1 -1.0 NaN 2.0 NaN\n", - "2 3.0 NaN 1.0 NaN" + "1 0.0 NaN 7.0 NaN\n", + "2 9.0 NaN 4.0 NaN" ] }, - "execution_count": 44, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "只針對 Q S 兩欄做 broadcasting, 所以 R T 欄都變成 NaN\n" + ] } ], "source": [ - "%f df . cr cr\n", - "%f \\ 只針對 Q S 兩欄做 broadcasting, 所以 R T 欄都變成 NaN\n", - "df - halfrow\n" + "%f df . cr \\ 以上是 df, halfrow 就在上一 cell \n", + "df - halfrow\n", + "%f \\ 只針對 Q S 兩欄做 broadcasting, 所以 R T 欄都變成 NaN" ] }, { diff --git a/notebooks/03.04-Missing-Values.ipynb b/notebooks/03.04-Missing-Values.ipynb index 1e1cbbf32..c4542d3f6 100644 --- a/notebooks/03.04-Missing-Values.ipynb +++ b/notebooks/03.04-Missing-Values.ipynb @@ -3,7 +3,9 @@ { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "metadata": { + "scrolled": false + }, "outputs": [ { "name": "stdout", @@ -15,18 +17,7 @@ } ], "source": [ - "import peforth\n", - "peforth.dictate(r'''\n", - "\\ Now we redefine the 'unknown' command that was do-nothing by default\n", - ": unknown ( token -- thing Y|N) // Try to find the unknown token in __main__\n", - " py> getattr(sys.modules['__main__'],pop(),\"Ûnknôwn\") \n", - " py> str(tos())==\"Ûnknôwn\" if drop false else true then ;\n", - " /// here after, when FORTH come accross an unknown token, instead of alerting \n", - " /// it try to find the token in python __main__ module name space.\n", - "\n", - "\\ Redefine \\ command to print the comment line\n", - "code \\ print(nexttoken('\\n')) end-code \n", - "''');\n" + "%run peforth.ipynb" ] }, { @@ -1479,7 +1470,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.1" + "version": "3.6.0" } }, "nbformat": 4, diff --git a/notebooks/03.05-Hierarchical-Indexing.ipynb b/notebooks/03.05-Hierarchical-Indexing.ipynb index af0b51211..3bd18dfe4 100644 --- a/notebooks/03.05-Hierarchical-Indexing.ipynb +++ b/notebooks/03.05-Hierarchical-Indexing.ipynb @@ -15,18 +15,7 @@ } ], "source": [ - "import peforth\n", - "peforth.dictate(r'''\n", - "\\ Now we redefine the 'unknown' command that was do-nothing by default\n", - ": unknown ( token -- thing Y|N) // Try to find the unknown token in __main__\n", - " py> getattr(sys.modules['__main__'],pop(),\"Ûnknôwn\") \n", - " py> str(tos())==\"Ûnknôwn\" if drop false else true then ;\n", - " /// here after, when FORTH come accross an unknown token, instead of alerting \n", - " /// it try to find the token in python __main__ module name space.\n", - "\n", - "\\ Redefine \\ command to print the comment line\n", - "code \\ print(nexttoken('\\n')) end-code \n", - "''');\n" + "%run peforth.ipynb" ] }, { diff --git a/notebooks/03.06-Concat-And-Append.ipynb b/notebooks/03.06-Concat-And-Append.ipynb index 80131e944..e23a28f30 100644 --- a/notebooks/03.06-Concat-And-Append.ipynb +++ b/notebooks/03.06-Concat-And-Append.ipynb @@ -70,7 +70,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -87,7 +87,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -146,7 +146,7 @@ "2 A2 B2 C2" ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -156,7 +156,7 @@ " \"\"\"Quickly make a DataFrame\"\"\"\n", " data = {c: [str(c) + str(i) for i in ind]\n", " for c in cols}\n", - " return pd.DataFrame(data, ind)\n", + " return pd.DataFrame(data, ind) # ind 即 index, 可用 range, list, tuple, or set <-- 自動排序 \n", "\n", "# example DataFrame\n", "make_df('ABC', range(3))" @@ -173,7 +173,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -208,7 +208,7 @@ "source": [ "## Recall: Concatenation of NumPy Arrays\n", "\n", - "See my Ynote `\"_matrix_ _array_ _list_ 相加 合併 add merge\"`, python list 本來就可以用 + 的。\n", + "Search my Ynote `\"_matrix_ _array_ _list_ 相加 合併 add merge\"`, python list 本來就可以用 + 的。\n", "\n", "Concatenation of ``Series`` and ``DataFrame`` objects is very similar to concatenation of Numpy arrays, which can be done via the ``np.concatenate`` function as discussed in [The Basics of NumPy Arrays](02.02-The-Basics-Of-NumPy-Arrays.ipynb).\n", "Recall that with it, you can combine the contents of two or more arrays into a single array:" @@ -216,24 +216,24 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([1, 2, 3, 4, 5, 6, 7, 8, 9])" + "array([7, 8, 9, 4, 5, 6, 1, 2, 3])" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "x = [1, 2, 3]\n", + "x = [7, 8, 9]\n", "y = [4, 5, 6]\n", - "z = [7, 8, 9]\n", + "z = [1, 2, 3]\n", "np.concatenate([x, y, z])" ] }, @@ -247,7 +247,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -264,7 +264,7 @@ " [3, 4, 3, 4]])" ] }, - "execution_count": 4, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -278,14 +278,14 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "這個比較平凡,類似「串聯」\n" + "這個比較平凡,類似「串聯」, default axis=0\n" ] }, { @@ -297,23 +297,23 @@ " [3, 4]])" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "%f \\ 這個比較平凡,類似「串聯」\n", + "%f \\ 這個比較平凡,類似「串聯」, default axis=0\n", "x = [[1, 2],\n", " [3, 4]]\n", - "np.concatenate([x, x], axis=0) # df.dropna(axis='rows') 好像只有 pandas 才有" + "np.concatenate([x, x]) # df.dropna(axis='rows') 好像只有 pandas 才有" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Simple Concatenation with ``pd.concat``" + "## Simple Concatenation with ``pd.concat`` (上面的是 `*np*.concatenate`)" ] }, { @@ -334,7 +334,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -349,7 +349,7 @@ "dtype: object" ] }, - "execution_count": 6, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -364,14 +364,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "我猜 pd.concat() 為何要用 list `[ser1,ser2]` 的形式,啊!很簡單,它後面還有很多別的 arguments.\n", + "Operand 要用 list `[ser1,ser2]` 的形式,很簡單,它後面還有很多別的 arguments.\n", "\n", "It also works to concatenate higher-dimensional objects, such as ``DataFrame``s:" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -521,7 +521,7 @@ "4 A4 B4" ] }, - "execution_count": 19, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -543,7 +543,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 10, "metadata": { "scrolled": false }, @@ -689,7 +689,7 @@ "1 A1 B1 C1 D1" ] }, - "execution_count": 18, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -700,6 +700,181 @@ "display('df3', 'df4', \"pd.concat([df3, df4], axis='columns')\") # axis=1" ] }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "故意把 index 錯開,結果像這樣:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "

df3

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AB
0A0B0
1A1B1
\n", + "
\n", + "
\n", + "
\n", + "

df4

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CD
0C0D0
3C3D3
\n", + "
\n", + "
\n", + "
\n", + "

pd.concat([df3, df4], axis='columns')

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCD
0A0B0C0D0
1A1B1NaNNaN
3NaNNaNC3D3
\n", + "
\n", + "
" + ], + "text/plain": [ + "df3\n", + " A B\n", + "0 A0 B0\n", + "1 A1 B1\n", + "\n", + "df4\n", + " C D\n", + "0 C0 D0\n", + "3 C3 D3\n", + "\n", + "pd.concat([df3, df4], axis='columns')\n", + " A B C D\n", + "0 A0 B0 C0 D0\n", + "1 A1 B1 NaN NaN\n", + "3 NaN NaN C3 D3" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%f \\ 故意把 index 錯開,結果像這樣:\n", + "df3 = make_df('AB', [0, 1])\n", + "df4 = make_df('CD', [0, 3])\n", + "display('df3', 'df4', \"pd.concat([df3, df4], axis='columns')\") # axis=1" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -719,7 +894,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -869,7 +1044,7 @@ "1 A3 B3" ] }, - "execution_count": 8, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -898,12 +1073,14 @@ "\n", "If you'd like to simply verify that the indices in the result of ``pd.concat()`` do not overlap, you can specify the ``verify_integrity`` flag.\n", "With this set to True, the concatenation will raise an exception if there are duplicate indices.\n", - "Here is an example, where for clarity we'll catch and print the error message:" + "Here is an example, where for clarity we'll catch and print the error message:\n", + "\n", + "**verify_integrity** 想像不到是用來抓 index 重複的,可能也抓別的問題,故然。" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -934,7 +1111,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -1084,7 +1261,7 @@ "3 A3 B3" ] }, - "execution_count": 10, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1104,7 +1281,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -1257,7 +1434,7 @@ " 1 A3 B3" ] }, - "execution_count": 11, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1288,7 +1465,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -1454,7 +1631,7 @@ "4 NaN B4 C4 D4" ] }, - "execution_count": 15, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1467,7 +1644,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -1633,7 +1810,7 @@ "4 NaN B4 C4 D4" ] }, - "execution_count": 16, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1655,7 +1832,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -1811,7 +1988,7 @@ "4 B4 C4" ] }, - "execution_count": 17, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1831,7 +2008,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -1839,6 +2016,19 @@ "text/html": [ "
\n", "

df5

\n", + "\n", "\n", " \n", " \n", @@ -1867,6 +2057,19 @@ " \n", "
\n", "

df6

\n", + "\n", "
\n", " \n", " \n", @@ -1878,10 +2081,10 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1895,6 +2098,19 @@ " \n", "
\n", "

pd.concat([df5, df6], join_axes=[df5.columns])

\n", + "\n", "
3B3C3D32B2C2D2
4
\n", " \n", " \n", @@ -1918,10 +2134,10 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1942,18 +2158,18 @@ "\n", "df6\n", " B C D\n", - "3 B3 C3 D3\n", + "2 B2 C2 D2\n", "4 B4 C4 D4\n", "\n", "pd.concat([df5, df6], join_axes=[df5.columns])\n", " A B C\n", "1 A1 B1 C1\n", "2 A2 B2 C2\n", - "3 NaN B3 C3\n", + "2 NaN B2 C2\n", "4 NaN B4 C4" ] }, - "execution_count": 15, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -1982,7 +2198,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -2132,7 +2348,7 @@ "4 A4 B4" ] }, - "execution_count": 24, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" }, @@ -2141,14 +2357,14 @@ "output_type": "stream", "text": [ "注意看! 經過 df1.append(df2) 之後, df1 並沒有被改掉! 照直覺,應該是 append 到 df1 之後,\n", - "結果 pandas 不是這麼想的。所以效率要緊時,還是用 pd.concat 才好。\n" + "結果 pandas 不是這麼想的。看 df.append? help 就知道它專只沿 row 做 append. 另有 df.join() 才是用來合併 columns 的,下一節會談到。\n" ] } ], "source": [ "display('df1.append(df2)','df1', 'df2')\n", "%f \\ 注意看! 經過 df1.append(df2) 之後, df1 並沒有被改掉! 照直覺,應該是 append 到 df1 之後,\n", - "%f \\ 結果 pandas 不是這麼想的。所以效率要緊時,還是用 pd.concat 才好。\n" + "%f \\ 結果 pandas 不是這麼想的。看 df.append? help 就知道它專只沿 row 做 append. 另有 df.join() 才是用來合併 columns 的,下一節會談到。\n" ] }, { diff --git a/notebooks/03.07-Merge-and-Join.ipynb b/notebooks/03.07-Merge-and-Join.ipynb index c5de2c797..4b929a6df 100644 --- a/notebooks/03.07-Merge-and-Join.ipynb +++ b/notebooks/03.07-Merge-and-Join.ipynb @@ -15,18 +15,7 @@ } ], "source": [ - "import peforth\n", - "peforth.dictate(r'''\n", - "\\ Now we redefine the 'unknown' command that was do-nothing by default\n", - ": unknown ( token -- thing Y|N) // Try to find the unknown token in __main__\n", - " py> getattr(sys.modules['__main__'],pop(),\"Ûnknôwn\") \n", - " py> str(tos())==\"Ûnknôwn\" if drop false else true then ;\n", - " /// here after, when FORTH come accross an unknown token, instead of alerting \n", - " /// it try to find the token in python __main__ module name space.\n", - "\n", - "\\ Redefine \\ command to print the comment line\n", - "code \\ print(nexttoken('\\n')) end-code \n", - "''');\n" + "%run peforth.ipynb" ] }, { @@ -63,7 +52,9 @@ "If you have ever worked with databases, you should be familiar with this type of data interaction.\n", "The main interface for this is the ``pd.merge`` function, and we'll see few examples of how this can work in practice.\n", "\n", - "For convenience, we will start by redefining the ``display()`` functionality from the previous section:" + "For convenience, we will start by redefining the ``display()`` functionality from the previous section:\n", + "\n", + "[My Ynote: '用來在 jupyter notebook 下漂亮顯示多個 DataFrames 的 class 設計'](http://note.youdao.com/noteshare?id=6d20baa6d72e1630338d604c03375bfe)" ] }, { @@ -129,7 +120,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -248,7 +239,7 @@ "3 Sue 2014" ] }, - "execution_count": 6, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -270,7 +261,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -336,7 +327,7 @@ "3 Sue HR 2014" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -374,7 +365,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -557,7 +548,7 @@ "3 Sue HR 2014 Steve" ] }, - "execution_count": 9, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" }, @@ -604,7 +595,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -823,7 +814,7 @@ "7 Sue HR organization" ] }, - "execution_count": 10, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -1039,7 +1030,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -1047,6 +1038,19 @@ "text/html": [ "
\n", "

df1

\n", + "\n", "
C2
32NaNB3C3B2C2
4
\n", " \n", " \n", @@ -1082,6 +1086,19 @@ " \n", "
\n", "

df3

\n", + "\n", "
\n", " \n", " \n", @@ -1117,6 +1134,19 @@ " \n", "
\n", "

pd.merge(df1, df3, left_on=\"employee\", right_on=\"name\")

\n", + "\n", "
\n", " \n", " \n", @@ -1184,7 +1214,7 @@ "3 Sue HR Sue 90000" ] }, - "execution_count": 7, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -1199,18 +1229,31 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The result has a redundant column that we can drop if desired–for example, by using the ``drop()`` method of ``DataFrame``s:" + "The result has a redundant column that we can drop if desired – for example, by using the ``drop()`` method of ``DataFrame``s:" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -1257,7 +1300,7 @@ "3 Sue HR 90000" ] }, - "execution_count": 8, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -1278,14 +1321,131 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 32, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "df1a type tib. \\ ==> ()\n", + ".set_index() 之後還是 data-frame\n" + ] + }, { "data": { "text/html": [ + "
\n", + "

df1

\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
employeegroup
0BobAccounting
1JakeEngineering
2LisaEngineering
3SueHR
\n", + "
\n", + "
\n", + "
\n", + "

df2

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
employeehire_date
0Lisa2004
1Bob2008
2Jake2012
3Sue2014
\n", + "
\n", + "
\n", "
\n", "

df1a

\n", + "\n", "\n", " \n", " \n", @@ -1320,6 +1480,19 @@ " \n", "
\n", "

df2a

\n", + "\n", "
\n", " \n", " \n", @@ -1354,6 +1527,20 @@ " " ], "text/plain": [ + "df1\n", + " employee group\n", + "0 Bob Accounting\n", + "1 Jake Engineering\n", + "2 Lisa Engineering\n", + "3 Sue HR\n", + "\n", + "df2\n", + " employee hire_date\n", + "0 Lisa 2004\n", + "1 Bob 2008\n", + "2 Jake 2012\n", + "3 Sue 2014\n", + "\n", "df1a\n", " group\n", "employee \n", @@ -1371,7 +1558,7 @@ "Sue 2014" ] }, - "execution_count": 9, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -1379,7 +1566,8 @@ "source": [ "df1a = df1.set_index('employee')\n", "df2a = df2.set_index('employee')\n", - "display('df1a', 'df2a')" + "%f df1a type tib. \\ .set_index() 之後還是 data-frame \n", + "display('df1','df2','df1a', 'df2a')" ] }, { @@ -1391,14 +1579,34 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 27, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "因為沒有 common column 所以 pd.merge 要指定用 index 否則 display('df1a', 'df2a', \"pd.merge(df1a, df2a)\") 結果是: MergeError: No common columns to perform merge on\n" + ] + }, { "data": { "text/html": [ "
\n", "

df1a

\n", + "\n", "
\n", " \n", " \n", @@ -1433,6 +1641,19 @@ " \n", "
\n", "

df2a

\n", + "\n", "
\n", " \n", " \n", @@ -1467,9 +1688,22 @@ " \n", "
\n", "

pd.merge(df1a, df2a, left_index=True, right_index=True)

\n", - "
\n", - " \n", - " \n", + "\n", + "
\n", + " \n", + " \n", " \n", " \n", " \n", @@ -1482,11 +1716,6 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", @@ -1497,6 +1726,11 @@ " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1526,20 +1760,109 @@ "pd.merge(df1a, df2a, left_index=True, right_index=True)\n", " group hire_date\n", "employee \n", - "Lisa Engineering 2004\n", "Bob Accounting 2008\n", "Jake Engineering 2012\n", + "Lisa Engineering 2004\n", "Sue HR 2014" ] }, - "execution_count": 10, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "display('df1a', 'df2a',\n", - " \"pd.merge(df1a, df2a, left_index=True, right_index=True)\")" + "%f \\ 因為沒有 common column 所以 pd.merge 要指定用 index 否則 display('df1a', 'df2a', \"pd.merge(df1a, df2a)\") 結果是: MergeError: No common columns to perform merge on\n", + "display('df1a', 'df2a', \"pd.merge(df1a, df2a, left_index=True, right_index=True)\")" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "既然已經用 df.set_index() 弄好了 index, 用 pd.concat() 也可以\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "
grouphire_date
LisaEngineering2004
BobAccounting20082012
LisaEngineering2004
SueHR2014
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
grouphire_date
BobAccounting2008
JakeEngineering2012
LisaEngineering2004
SueHR2014
\n", + "
" + ], + "text/plain": [ + " group hire_date\n", + "Bob Accounting 2008\n", + "Jake Engineering 2012\n", + "Lisa Engineering 2004\n", + "Sue HR 2014" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "_ type tib. \\ ==> ()\n", + "結果很類似,index 的 name 不見了,但還是 data-frame\n" + ] + } + ], + "source": [ + "%f \\ 既然已經用 df.set_index() 弄好了 index, 用 pd.concat() 也可以\n", + "pd.concat([df1a, df2a], axis='columns')\n", + "%f _ type tib. \\ 結果很類似,index 的 name 不見了,但還是 data-frame " ] }, { @@ -1552,7 +1875,9 @@ { "cell_type": "code", "execution_count": 11, - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [ { "data": { @@ -1701,6 +2026,115 @@ "display('df1a', 'df2a', 'df1a.join(df2a)')" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "複習前面介紹的 df.append() 是用來沿 row 延伸的,這裡的 df.join() 則是沿 column 合併。" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
grouphire_date
employee
BobAccountingNaN
JakeEngineeringNaN
LisaEngineeringNaN
SueHRNaN
LisaNaN2004.0
BobNaN2008.0
JakeNaN2012.0
SueNaN2014.0
\n", + "
" + ], + "text/plain": [ + " group hire_date\n", + "employee \n", + "Bob Accounting NaN\n", + "Jake Engineering NaN\n", + "Lisa Engineering NaN\n", + "Sue HR NaN\n", + "Lisa NaN 2004.0\n", + "Bob NaN 2008.0\n", + "Jake NaN 2012.0\n", + "Sue NaN 2014.0" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1a.append(df2a)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1884,7 +2318,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 39, "metadata": {}, "outputs": [ { @@ -1892,6 +2326,19 @@ "text/html": [ "
\n", "

df6

\n", + "\n", "\n", " \n", " \n", @@ -1922,6 +2369,19 @@ " \n", "
\n", "

df7

\n", + "\n", "
\n", " \n", " \n", @@ -1947,6 +2407,19 @@ " \n", "
\n", "

pd.merge(df6, df7)

\n", + "\n", "
\n", " \n", " \n", @@ -1985,7 +2458,7 @@ "0 Mary bread wine" ] }, - "execution_count": 13, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -2354,7 +2827,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 42, "metadata": {}, "outputs": [ { @@ -2362,6 +2835,19 @@ "text/html": [ "
\n", "

df8

\n", + "\n", "
\n", " \n", " \n", @@ -2397,6 +2883,19 @@ " \n", "
\n", "

df9

\n", + "\n", "
\n", " \n", " \n", @@ -2409,22 +2908,22 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
0Bob31
1Jake12
2Lisa43
3Sue24
\n", @@ -2432,6 +2931,19 @@ "
\n", "
\n", "

pd.merge(df8, df9, on=\"name\")

\n", + "\n", "\n", " \n", " \n", @@ -2446,25 +2958,25 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
0Bob131
1Jake212
2Lisa343
3Sue424
\n", @@ -2481,20 +2993,20 @@ "\n", "df9\n", " name rank\n", - "0 Bob 3\n", - "1 Jake 1\n", - "2 Lisa 4\n", - "3 Sue 2\n", + "0 Bob 1\n", + "1 Jake 2\n", + "2 Lisa 3\n", + "3 Sue 4\n", "\n", "pd.merge(df8, df9, on=\"name\")\n", " name rank_x rank_y\n", - "0 Bob 1 3\n", - "1 Jake 2 1\n", - "2 Lisa 3 4\n", - "3 Sue 4 2" + "0 Bob 1 1\n", + "1 Jake 2 2\n", + "2 Lisa 3 3\n", + "3 Sue 4 4" ] }, - "execution_count": 17, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } @@ -2503,7 +3015,7 @@ "df8 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],\n", " 'rank': [1, 2, 3, 4]})\n", "df9 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],\n", - " 'rank': [3, 1, 4, 2]})\n", + " 'rank': [1, 2, 3, 4]}) # 我故意把值改成都一樣,即使如此也應該分開看待...對的!\n", "display('df8', 'df9', 'pd.merge(df8, df9, on=\"name\")')" ] }, @@ -2517,7 +3029,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 47, "metadata": {}, "outputs": [ { @@ -2525,6 +3037,19 @@ "text/html": [ "
\n", "

df8

\n", + "\n", "\n", " \n", " \n", @@ -2560,6 +3085,19 @@ " \n", "
\n", "

df9

\n", + "\n", "
\n", " \n", " \n", @@ -2572,36 +3110,49 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
0Bob31
1Jake12
2Lisa43
3Sue24
\n", "
\n", "
\n", "
\n", - "

pd.merge(df8, df9, on=\"name\", suffixes=[\"_L\", \"_R\"])

\n", + "

pd.merge(df8, df9, on=\"name\", suffixes=[\"_P\", \"_Q\"])

\n", + "\n", "\n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -2609,25 +3160,25 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
namerank_Lrank_Rrank_Prank_Q
0Bob131
1Jake212
2Lisa343
3Sue424
\n", @@ -2644,26 +3195,38 @@ "\n", "df9\n", " name rank\n", - "0 Bob 3\n", - "1 Jake 1\n", - "2 Lisa 4\n", - "3 Sue 2\n", - "\n", - "pd.merge(df8, df9, on=\"name\", suffixes=[\"_L\", \"_R\"])\n", - " name rank_L rank_R\n", - "0 Bob 1 3\n", - "1 Jake 2 1\n", - "2 Lisa 3 4\n", - "3 Sue 4 2" + "0 Bob 1\n", + "1 Jake 2\n", + "2 Lisa 3\n", + "3 Sue 4\n", + "\n", + "pd.merge(df8, df9, on=\"name\", suffixes=[\"_P\", \"_Q\"])\n", + " name rank_P rank_Q\n", + "0 Bob 1 1\n", + "1 Jake 2 2\n", + "2 Lisa 3 3\n", + "3 Sue 4 4" ] }, - "execution_count": 18, + "execution_count": 47, "metadata": {}, "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "suffixes=[\"_L\", \"_R\",\"_P\", \"_Q\"] 多給了還不行!\n", + "會出錯: ValueError: too many values to unpack (expected 2)\n", + "仔細想想也有道哩,此處還是嚴格一點的好。\n" + ] } ], "source": [ - "display('df8', 'df9', 'pd.merge(df8, df9, on=\"name\", suffixes=[\"_L\", \"_R\"])')" + "display('df8', 'df9', 'pd.merge(df8, df9, on=\"name\", suffixes=[\"_P\", \"_Q\"])')\n", + "%f \\ suffixes=[\"_L\", \"_R\",\"_P\", \"_Q\"] 多給了還不行!\n", + "%f \\ 會出錯: ValueError: too many values to unpack (expected 2)\n", + "%f \\ 仔細想想也有道哩,此處還是嚴格一點的好。 \n" ] }, { @@ -2713,7 +3276,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 48, "metadata": {}, "outputs": [ { @@ -2721,6 +3284,19 @@ "text/html": [ "
\n", "

pop.head()

\n", + "\n", "\n", " \n", " \n", @@ -2773,6 +3349,19 @@ " \n", "
\n", "

areas.head()

\n", + "\n", "
\n", " \n", " \n", @@ -2813,6 +3402,19 @@ " \n", "
\n", "

abbrevs.head()

\n", + "\n", "
\n", " \n", " \n", @@ -2878,7 +3480,7 @@ "4 California CA" ] }, - "execution_count": 20, + "execution_count": 48, "metadata": {}, "output_type": "execute_result" } @@ -2985,7 +3587,7 @@ "source": [ "merged = pd.merge(pop, abbrevs, how='outer',\n", " left_on='state/region', right_on='abbreviation')\n", - "merged = merged.drop('abbreviation', 1) # drop duplicate info\n", + "merged = merged.drop('abbreviation', 1) # drop duplicate info, where 1 is axis=1\n", "merged.head()" ] }, diff --git a/notebooks/Index.ipynb b/notebooks/Index.ipynb index a368faa84..5e73aa4e9 100644 --- a/notebooks/Index.ipynb +++ b/notebooks/Index.ipynb @@ -123,9 +123,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.0" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/notebooks/Playground.ipynb b/notebooks/Playground.ipynb index aa5b2c329..313a55734 100644 --- a/notebooks/Playground.ipynb +++ b/notebooks/Playground.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -40,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -49,37 +49,18 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "UsageError: Missing filename, URL, input history range, macro, or element in the user namespace.\n" - ] - } - ], + "outputs": [], "source": [ "%load " ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'a': ['a0', 'a1', 'a2'], 'b': ['b0', 'b1', 'b2'], 'c': ['c0', 'c1', 'c2']}" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "cols = \"abc\"\n", "ind = range(3)\n", @@ -90,40 +71,18 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'a': 11, 'b': 11, 'c': 11}" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "{c:11 for c in cols}" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " {c for c in \"abc\"} tib. \\ ==> {'b', 'a', 'c'} ()\n", - " [c for c in \"abc\"] tib. \\ ==> ['a', 'b', 'c'] ()\n", - " (c for c in \"abc\") tib. \\ ==> . at 0x000001E8DC926048> ()\n", - " (c for c in \"abc\") py> list(pop()) tib. \\ ==> ['a', 'b', 'c'] ()\n" - ] - } - ], + "outputs": [], "source": [ "%f {c for c in \"abc\"} tib.\n", "%f [c for c in \"abc\"] tib.\n", @@ -134,265 +93,707 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "x = [12,34]\n", + "%f x . cr\n", + "[x]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "x = np.array([12,34])\n", + "x\n", + "[x[:,np.newaxis], [91,92]]\n", + "x[:,np.newaxis][:,np.newaxis]\n", + "x[:,np.newaxis][:,np.newaxis][:,np.newaxis]\n", + "%f \\ --------------------------\n", + "[x]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "x = [12,34]\n", + "x\n", + "x[:,np.newaxis]\n", + "x[:,np.newaxis][:,np.newaxis]\n", + "x[:,np.newaxis][:,np.newaxis][:,np.newaxis]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 憑印象 try mask \n", + "%f \\ 普通 array 沒有這種能力,這叫 numpy 的啥能力.... 稱作 ufunc (Universal Functions)\n", + "a = [c for c in \"abcdef\"] \n", + "a\n", + "try:\n", + " a > 'd'\n", + "except TypeError as e:\n", + " print(e)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 憑印象 try numpy array 才有的 mask \n", + "a = np.array([c for c in \"abcdef\"])\n", + "a < 'd'\n", + "a[a < 'd']\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### WKS 主管餐廳菜單\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%run peforth.ipynb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "data = {c: [str(c) + str(i) for i in range(3)]\n", + " for c in \"ABC\"}\n", + "data\n", + "%f data type tib." + ] + }, + { + "cell_type": "code", + "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[12, 34]\n" + "Automatic pdb calling has been turned OFF\n" ] - }, - { - "data": { - "text/plain": [ - "[[12, 34]]" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "x = [12,34]\n", - "%f x . cr\n", - "[x]" + "%pdb off" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 用來漂亮顯示多個 DataFrames 的 class 設計 \n", + "\n", + "這個 utility 有很多特點,非常值得學習。\n", + "\n", + "1. 直接使用 class, 而不先產生 instance 然後才拿該 instance 來用。活脫的地方是,利用 class 有很多約定既有的 method 之特性,把呼叫 calss 時的 arguments 用適當的 HTML 或 plain text 顯示出來,漂亮!\n", + "2. `_repr_html_(self)` method 是用 HTML 格式漂漂亮亮地顯示 DataFrame 這是 pandas 本來就有提供給 df 的 built-in method.\n", + "3. 我沒用過的 class 輸入啟始 arguments 的方式。\n", + " " ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 21, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "array([12, 34])" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "text/plain": [ - "[array([[12],\n", - " [34]]), [91, 92]]" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "text/plain": [ - "array([[[12]],\n", - "\n", - " [[34]]])" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "Automatic pdb calling has been turned OFF\n" + ] }, { "data": { - "text/plain": [ - "array([[[[12]]],\n", + "text/html": [ + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABC
0A0B0C0
1A1B1C1
2A2B2C2
\n", + "
" + ], + "text/plain": [ + " A B C\n", + "0 A0 B0 C0\n", + "1 A1 B1 C1\n", + "2 A2 B2 C2" ] }, - "execution_count": 39, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" - }, + } + ], + "source": [ + "import peforth\n", + "import pandas as pd\n", + "class display(object):\n", + " \"\"\"Display HTML representation of multiple objects\"\"\"\n", + " template = \"\"\"
\n", + "

{0}

{1}\n", + "
\"\"\"\n", + " def __init__(self, *args):\n", + " peforth.ok('11>',loc=locals(),cmd=\":> [0] constant locals // ( -- locals ) in the __init__ method\")\n", + " self.args = args\n", + " peforth.ok('1122>',loc=locals(),cmd=\":> [0] constant locals // ( -- locals ) in the __init__ method\")\n", + " \n", + " def _repr_html_(self):\n", + " peforth.ok('22>',loc=locals(),cmd=\":> [0] constant locals // ( -- locals ) in the _repr_html_ method\")\n", + " return '\\n'.join(self.template.format(a, eval(a)._repr_html_())\n", + " for a in self.args)\n", + " \n", + " def __repr__(self):\n", + " peforth.ok('33>',loc=locals(),cmd=\":> [0] constant locals // ( -- locals ) in the __repr__ method\")\n", + " return '\\n\\n'.join(a + '\\n' + repr(eval(a))\n", + " for a in self.args)\n", + " \n", + "\n", + "def make_df(cols, ind):\n", + " \"\"\"Quickly make a DataFrame\"\"\"\n", + " data = {c: [str(c) + str(i) for i in ind]\n", + " for c in cols}\n", + " return pd.DataFrame(data, ind) # ind 即 index, 可用 range, list, tuple, or set <-- 自動排序 \n", + "\n", + "# example DataFrame\n", + "make_df('ABC', range(3))" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--------------------------\n" + "11>\n", + "11>\n", + "11>\n", + "11>words\n", + "code end-code \\ // bye /// unknown immediate stop compyle trim indent -indent words . cr help interpret-only compile-only literal reveal privacy (create) : ; ( BL CR word ' , [compile] py: py> py:~ py>~ 0branch here! here swap ! @ ? >r r> r@ drop dup over 0< + * - / 1+ 2+ 1- 2- compile if then compiling char last version execute cls private nonprivate (space) exit ret rescan-word-hash (') branch bool and or not (forget) AND OR NOT XOR true false \"\" [] {} none >> << 0= 0> 0<> 0<= 0>= = == > < != >= <= abs max min doVar doNext depth pick roll space [ ] colon-word create (marker) marker next abort alias <> public nip rot -rot 2drop 2dup invert negate within ['] allot for begin until again ahead never repeat aft else while ?stop ?dup variable +! chars spaces .( .\" .' s\" s' s` does> count accept nop \u0004 case of endof endcase refill [else] [if] [then] (::) (:>) :: :> ::~ :>~ \"msg\"abort abort\" \"msg\"?abort ?abort\" ' () (constant) constant value to tib. >t t@ t> [begin] [again] [until] [for] [next] __main__ import module modules int float drops dropall char>ASCII ASCII>char CRLF ASCII .s (*debug*) *debug* readTextFile writeTextFile tib.insert dictate sinclude include break-include type obj>keys obj2dict stringify toString .literal .function (dump) dump dump2ret d (see) see slice screen-buffer display-off display-on WshShell inport harry_port OK _dir_ dir keys (pyclude) pyclude .members .source dos cd --- locals \n", + "11>locals keys . cr\n", + "dict_keys(['args', 'self'])\n", + "11>locals :> ['args'] . cr\n", + "('df1', 'df2', 'pd.concat([df1, df2])')\n", + "11>locals :> ['self'] type . cr\n", + "\n", + "11>locals :> ['self'] (see)\n", + "{\n", + " \"__class__\": \"display\",\n", + " \"__module__\": \"__main__\",\n", + " \"__doc__\": \"Display HTML representation of multiple objects\"\n", + "}\n", + "11>locals :> ['self'] dir . cr\n", + "['template']\n", + "11>locals :> ['self']['template'] . cr\n", + "\n", + "Failed in (compiling=False): 'display' object is not subscriptable\n", + "Body:\n", + "push(pop()['self']['template'])\n", + "11>locals :> ['self'].template . cr\n", + "
\n", + "

{0}

{1}\n", + "
\n", + "11>exit\n", + "11>reDef locals\n", + "1122>\n", + "1122>\n", + "1122>exit\n", + "1122>reDef locals\n", + "33>\n", + "33>\n", + "33>\n", + "33>exit\n", + "33>reDef locals\n", + "22>exit\n", + "22>" ] }, { "data": { + "text/html": [ + "
\n", + "

df1

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AB
1A1B1
2A2B2
\n", + "
\n", + "
\n", + "
\n", + "

df2

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AB
3A3B3
4A4B4
\n", + "
\n", + "
\n", + "
\n", + "

pd.concat([df1, df2])

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AB
1A1B1
2A2B2
3A3B3
4A4B4
\n", + "
\n", + "
" + ], "text/plain": [ - "[array([12, 34])]" + "df1\n", + " A B\n", + "1 A1 B1\n", + "2 A2 B2\n", + "\n", + "df2\n", + " A B\n", + "3 A3 B3\n", + "4 A4 B4\n", + "\n", + "pd.concat([df1, df2])\n", + " A B\n", + "1 A1 B1\n", + "2 A2 B2\n", + "3 A3 B3\n", + "4 A4 B4" ] }, - "execution_count": 39, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "x = np.array([12,34])\n", - "x\n", - "[x[:,np.newaxis], [91,92]]\n", - "x[:,np.newaxis][:,np.newaxis]\n", - "x[:,np.newaxis][:,np.newaxis][:,np.newaxis]\n", - "%f \\ --------------------------\n", - "[x]\n" + "df1 = make_df('AB', [1, 2])\n", + "df2 = make_df('AB', [3, 4])\n", + "display('df1', 'df2', 'pd.concat([df1, df2])') # default axis=0 or 'rows'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "我看懂了,這個 `
` 的 style 就是讓這些 DIV 橫向並著放,而非平常的往下長。這 DIV 裡面有一 P 就是 title 其下就是 data-frame 的表格。\n", + "\n", + " \n", + "

{0}

{1}
\n", + "

{2}

{3}
\n", + "

{4}

{5}
\n", + "
\n", + "\n", + "結果如下:\n", + "\n", + " {0} {2} {4}\n", + " \n", + " {1} {3} {5}\n", + "\n", + "這只解釋了大體的結構,沒有解釋 DataFrame 怎麼印得這麼漂亮?原因是 DataFrame 本身就有個 `._repr_html_()` method,如下顯示:" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 8, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "[12, 34]" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - }, - { - "ename": "TypeError", - "evalue": "list indices must be integers or slices, not tuple", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[0mx\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;36m12\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;36m34\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mx\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnewaxis\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnewaxis\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnewaxis\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnewaxis\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnewaxis\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnewaxis\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mTypeError\u001b[0m: list indices must be integers or slices, not tuple" + "name": "stdout", + "output_type": "stream", + "text": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AB
1A1B1
2A2B2
\n", + "
\n" ] } ], "source": [ - "x = [12,34]\n", - "x\n", - "x[:,np.newaxis]\n", - "x[:,np.newaxis][:,np.newaxis]\n", - "x[:,np.newaxis][:,np.newaxis][:,np.newaxis]" + "%f df1 :> _repr_html_() . cr" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "普通 array 沒有這種能力,這叫 numpy 的啥能力.... 稱作 ufunc (Universal Functions)\n" + " A B\n", + "1 A1 B1\n", + "2 A2 B2\n", + "用 text mode 印出來用的是 __repr__\n", + "\n", + "\n", + "就像這樣\n" ] }, { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AB
1A1B1
2A2B2
\n", + "
" + ], "text/plain": [ - "['a', 'b', 'c', 'd', 'e', 'f']" + " A B\n", + "1 A1 B1\n", + "2 A2 B2" ] }, - "execution_count": 29, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "'>' not supported between instances of 'list' and 'str'\n" - ] } ], "source": [ - "# 憑印象 try mask \n", - "%f \\ 普通 array 沒有這種能力,這叫 numpy 的啥能力.... 稱作 ufunc (Universal Functions)\n", - "a = [c for c in \"abcdef\"] \n", - "a\n", - "try:\n", - " a > 'd'\n", - "except TypeError as e:\n", - " print(e)" + "%f df1 . cr \\ 用 text mode 印出來用的是 __repr__ \n", + "%f cr df1 :> __repr__ . cr \\ 就像這樣\n", + "df1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 下面這種寫法是我比較熟悉的\n", + "\n", + "Initialize my_class 時的 arguments 是從 `__init__()` 送進去的。而且是用 variable name 送進去。相對於上面的寫法,初始 arguments 定義在 `my_class()` 處而非 `__init__()` 而且似乎只能接受 type 如 object, int, float 等, 到了 `__init__()` 處則用 `*args` 來取得實際給進來的 arguments. " ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 49, "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "

~ 4 ~

" + ], "text/plain": [ - "array([ True, True, True, False, False, False], dtype=bool)" + "4" ] }, - "execution_count": 23, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" }, { - "data": { - "text/plain": [ - "array(['a', 'b', 'c'],\n", - " dtype=' ~ \" + str(self.args) + \" ~ \"\n", + " \n", + " def __repr__(self):\n", + " return str(self.args)\n", + "\n", + "my_class(3)\n", + "print(my_class(7))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "" + "### 重複一下這個新方法" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 50, "metadata": {}, "outputs": [ + { + "data": { + "text/html": [ + "

~ 4 ~

" + ], + "text/plain": [ + "4" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + }, { "name": "stdout", "output_type": "stream", "text": [ - "Now we redefine the 'unknown' command that was do-nothing by default\n", - "reDef unknown\n", - "Redefine \\ command to print the comment line\n", - "reDef \\\n" + "8\n" ] } ], "source": [ - "%run peforth.ipynb" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [], - "source": [ - "%run?" + "class my_class(object):\n", + " def __init__(self, *args):\n", + " self.args = args[0]+1\n", + " \n", + " def _repr_html_(self):\n", + " return \"

~ \" + str(self.args) + \" ~

\"\n", + " \n", + " def __repr__(self):\n", + " return str(self.args)\n", + "\n", + "my_class(3)\n", + "print(my_class(7))" ] } ], From 2ce43c191bde1b25a50e92ef9e34acad410c02e2 Mon Sep 17 00:00:00 2001 From: "H.C.Chen" Date: Sat, 4 Aug 2018 16:53:59 +0800 Subject: [PATCH 11/13] 8/4 --- notebooks/03.07-Merge-and-Join.ipynb | 463 ++++++++++++++++++++++++--- 1 file changed, 427 insertions(+), 36 deletions(-) diff --git a/notebooks/03.07-Merge-and-Join.ipynb b/notebooks/03.07-Merge-and-Join.ipynb index 4b929a6df..416dbdf0e 100644 --- a/notebooks/03.07-Merge-and-Join.ipynb +++ b/notebooks/03.07-Merge-and-Join.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 64, "metadata": {}, "outputs": [ { @@ -60,7 +60,9 @@ { "cell_type": "code", "execution_count": 3, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "import pandas as pd\n", @@ -256,7 +258,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To combine this information into a single ``DataFrame``, we can use the ``pd.merge()`` function:" + "To combine this information into a single ``DataFrame``, we can use the ``pd.merge()`` function:\n", + "\n", + "這樣看來 pd.merge(a,b) 一定只 merge 兩個 data-frame, 有別於 pd.concat(list) 收一個 list 串接多個 pd.Series 或 data-frame. Merge 的好處是它比較聰明,會自動找出 key column 或用 on, left_on, right_on 指定 key column 或 left_index, right_index 也可以直接用 index; 還可以指定聯集、交集等 how='outer' 'inner' 'left' 'right'; 兩邊有同名的 column 可以用 suffixes=[\"_P\", \"_Q\"] 區分開; " ] }, { @@ -3226,7 +3230,7 @@ "display('df8', 'df9', 'pd.merge(df8, df9, on=\"name\", suffixes=[\"_P\", \"_Q\"])')\n", "%f \\ suffixes=[\"_L\", \"_R\",\"_P\", \"_Q\"] 多給了還不行!\n", "%f \\ 會出錯: ValueError: too many values to unpack (expected 2)\n", - "%f \\ 仔細想想也有道哩,此處還是嚴格一點的好。 \n" + "%f \\ 也有道哩,此處還是不要自作主張亂放行的好,因為資料龐大時全靠電腦檢查錯誤。 \n" ] }, { @@ -3258,7 +3262,9 @@ { "cell_type": "code", "execution_count": 19, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# Following are shell commands to download the data\n", @@ -3276,7 +3282,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -3480,7 +3486,7 @@ "4 California CA" ] }, - "execution_count": 48, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -3507,13 +3513,26 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 84, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "\n", " \n", " \n", @@ -3579,7 +3598,7 @@ "4 AL under18 2011 1125763.0 Alabama" ] }, - "execution_count": 21, + "execution_count": 84, "metadata": {}, "output_type": "execute_result" } @@ -3587,7 +3606,8 @@ "source": [ "merged = pd.merge(pop, abbrevs, how='outer',\n", " left_on='state/region', right_on='abbreviation')\n", - "merged = merged.drop('abbreviation', 1) # drop duplicate info, where 1 is axis=1\n", + "merged = merged.drop('abbreviation', 1) \n", + " # drop duplicate info, where 1 等效於 axis=1\n", "merged.head()" ] }, @@ -3595,32 +3615,297 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "#### 下面是本章的精華\n", + "用 `df.any()` `df.isnull()` `df.head()` `df.tail()` `df.unique()` `df.dropna()` 等這些 method 來找出 data 裡的問題。然後用 ufunc 來修補它。\n", + "\n", "Let's double-check whether there were any mismatches here, which we can do by looking for rows with nulls:" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "merged :> shape tib. \\ ==> (2544, 5) ()\n" + ] + } + ], + "source": [ + "%f merged :> shape tib. \n", + "# 看一眼 merged 的 shape " + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " state/region ages year population state\n", + "2445 False False False False False\n", + "2446 False False False False False\n", + "2447 False False False False False\n", + "2448 False False False True True\n", + "2449 False False False True True\n" + ] + } + ], + "source": [ + "%f merged :> isnull()[2445:].head() . cr \n", + "# .isnull() method 對整個 df 全都標上 T/F " + ] + }, + { + "cell_type": "code", + "execution_count": 112, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "state/region False\n", - "ages False\n", - "year False\n", - "population True\n", - "state True\n", - "dtype: bool" + "2448 NaN\n", + "2449 NaN\n", + "2450 NaN\n", + "2451 NaN\n", + "2452 NaN\n", + "Name: state, dtype: object" ] }, - "execution_count": 22, + "execution_count": 112, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "merged.isnull().any()" + "merged['state'][merged.isnull()['state']].head()\n", + "# mask 是對 one-dimention array 篩出 True 的。" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
state/regionagesyearpopulationstate
2448PRunder181990NaNNaN
2449PRtotal1990NaNNaN
2450PRtotal1991NaNNaN
2451PRunder181991NaNNaN
2452PRtotal1993NaNNaN
\n", + "
" + ], + "text/plain": [ + " state/region ages year population state\n", + "2448 PR under18 1990 NaN NaN\n", + "2449 PR total 1990 NaN NaN\n", + "2450 PR total 1991 NaN NaN\n", + "2451 PR under18 1991 NaN NaN\n", + "2452 PR total 1993 NaN NaN" + ] + }, + "execution_count": 113, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged[2448:].head()\n", + "# 上面找到的 NaN 真的是資料本身就是這樣" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
state/regionagesyearpopulationstate
0NaNNaNNaNNaNNaN
1NaNNaNNaNNaNNaN
2NaNNaNNaNNaNNaN
3NaNNaNNaNNaNNaN
4NaNNaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " state/region ages year population state\n", + "0 NaN NaN NaN NaN NaN\n", + "1 NaN NaN NaN NaN NaN\n", + "2 NaN NaN NaN NaN NaN\n", + "3 NaN NaN NaN NaN NaN\n", + "4 NaN NaN NaN NaN NaN" + ] + }, + "execution_count": 119, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged[merged.isnull()].head()\n", + "# [ ] mask 的效果對 2D 的 data-frame 就看不懂了,【篩】怎麼會動到其值?" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "state/region False\n", + "ages False\n", + "year False\n", + "population True\n", + "state True\n", + "dtype: bool\n", + "_ type tib. \\ ==> ()\n", + "結果是個 pd.Series\n" + ] + } + ], + "source": [ + "# 對 data-frame 取 .any() 就是看有沒有含 True 的 column.\n", + "%f merged :> isnull().any() . cr\n", + "%f _ type tib. \\ 結果是個 pd.Series " ] }, { @@ -3632,13 +3917,26 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 89, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "\n", " \n", " \n", @@ -3704,7 +4002,7 @@ "2452 PR total 1993 NaN NaN" ] }, - "execution_count": 23, + "execution_count": 89, "metadata": {}, "output_type": "execute_result" } @@ -3743,6 +4041,47 @@ "merged.loc[merged['state'].isnull(), 'state/region'].unique()" ] }, + { + "cell_type": "code", + "execution_count": 133, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " state/region ages year population state\n", + "2448 PR under18 1990 NaN NaN\n", + "2449 PR total 1990 NaN NaN\n", + "2450 PR total 1991 NaN NaN\n", + "2451 PR under18 1991 NaN NaN\n", + "2452 PR total 1993 NaN NaN\n", + "先從整個裡面篩出有問題的來\n", + "\n", + "2448 PR\n", + "2449 PR\n", + "2450 PR\n", + "2451 PR\n", + "2452 PR\n", + "Name: state/region, dtype: object\n", + "只看有問題的 'state/region'\n", + "\n", + "['PR' 'USA']\n", + "確定有哪些 'state/region'\n" + ] + } + ], + "source": [ + "# 要找出是那些 states 含 null 我來做就是一步一步來;\n", + "# 老師上面這樣一行完成太厲害了,對我來說不真實。\n", + "_ = merged[merged['state'].isnull()]\n", + "%f cr _ :> head() . cr \\ 先從整個裡面篩出有問題的來\n", + "_ = _['state/region'] \n", + "%f cr _ :> head() . cr \\ 只看有問題的 'state/region'\n", + "%f cr _ :> unique() . cr \\ 確定是哪些 'state/region'" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -3753,7 +4092,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 141, "metadata": {}, "outputs": [ { @@ -3767,7 +4106,7 @@ "dtype: bool" ] }, - "execution_count": 25, + "execution_count": 141, "metadata": {}, "output_type": "execute_result" } @@ -3790,13 +4129,26 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 142, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -3868,7 +4220,7 @@ "4 AL under18 2011 1125763.0 Alabama 52423.0" ] }, - "execution_count": 26, + "execution_count": 142, "metadata": {}, "output_type": "execute_result" } @@ -3887,7 +4239,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 143, "metadata": {}, "outputs": [ { @@ -3902,7 +4254,7 @@ "dtype: bool" ] }, - "execution_count": 27, + "execution_count": 143, "metadata": {}, "output_type": "execute_result" } @@ -3920,7 +4272,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 144, "metadata": {}, "outputs": [ { @@ -3929,7 +4281,7 @@ "array(['United States'], dtype=object)" ] }, - "execution_count": 28, + "execution_count": 144, "metadata": {}, "output_type": "execute_result" } @@ -3948,13 +4300,26 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 145, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -4026,21 +4391,45 @@ "4 AL under18 2011 1125763.0 Alabama 52423.0" ] }, - "execution_count": 29, + "execution_count": 145, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "final.dropna(inplace=True)\n", + "final.dropna(inplace=True) # inplace=True 就是改進原來的 data-frame 裡去。\n", "final.head()" ] }, + { + "cell_type": "code", + "execution_count": 148, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "state/region False\n", + "ages False\n", + "year False\n", + "population False\n", + "state False\n", + "area (sq. mi) False\n", + "dtype: bool\n", + "確定 nulls 都削掉了,final 可用了。\n" + ] + } + ], + "source": [ + "%f final :> isnull().any() . cr \\ 確定 nulls 都削掉了,final 可用了。" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now we have all the data we need. To answer the question of interest, let's first select the portion of the data corresponding with the year 2000, and the total population.\n", + "Now we have all the data we need. To answer the question of interest, let's first select the portion of the data corresponding with the year 2010, and the total population.\n", "We'll use the ``query()`` function to do this quickly (this requires the ``numexpr`` package to be installed; see [High-Performance Pandas: ``eval()`` and ``query()``](03.12-Performance-Eval-and-Query.ipynb)):" ] }, @@ -4145,10 +4534,12 @@ { "cell_type": "code", "execution_count": 31, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ - "data2010.set_index('state', inplace=True)\n", + "data2010.set_index('state', inplace=True) # 又是 inplace=True 改進原 df 裡去。\n", "density = data2010['population'] / data2010['area (sq. mi)']" ] }, @@ -4251,7 +4642,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.0" + "version": "3.6.1" } }, "nbformat": 4, From fa90a0a9ec4e0b062ea6d3b694fcf0f53e4b5c67 Mon Sep 17 00:00:00 2001 From: "H.C.Chen" Date: Sun, 5 Aug 2018 12:21:54 +0800 Subject: [PATCH 12/13] 8/5 --- notebooks/03.07-Merge-and-Join.ipynb | 880 ++++++++---------- .../03.08-Aggregation-and-Grouping.ipynb | 335 +++++-- 2 files changed, 607 insertions(+), 608 deletions(-) diff --git a/notebooks/03.07-Merge-and-Join.ipynb b/notebooks/03.07-Merge-and-Join.ipynb index 416dbdf0e..637162612 100644 --- a/notebooks/03.07-Merge-and-Join.ipynb +++ b/notebooks/03.07-Merge-and-Join.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 64, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -59,7 +59,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": { "collapsed": true }, @@ -122,7 +122,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -241,7 +241,7 @@ "3 Sue 2014" ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -265,7 +265,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -331,7 +331,7 @@ "3 Sue HR 2014" ] }, - "execution_count": 6, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -369,193 +369,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "

df3

\n", - "\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
employeegrouphire_date
0BobAccounting2008
1JakeEngineering2012
2LisaEngineering2004
3SueHR2014
\n", - "
\n", - "
\n", - "
\n", - "

df4

\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
groupsupervisor
0AccountingCarly
1EngineeringGuido
2HRSteve
\n", - "
\n", - "
\n", - "
\n", - "

pd.merge(df3, df4)

\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
employeegrouphire_datesupervisor
0BobAccounting2008Carly
1JakeEngineering2012Guido
2LisaEngineering2004Guido
3SueHR2014Steve
\n", - "
\n", - "
" - ], - "text/plain": [ - "df3\n", - " employee group hire_date\n", - "0 Bob Accounting 2008\n", - "1 Jake Engineering 2012\n", - "2 Lisa Engineering 2004\n", - "3 Sue HR 2014\n", - "\n", - "df4\n", - " group supervisor\n", - "0 Accounting Carly\n", - "1 Engineering Guido\n", - "2 HR Steve\n", - "\n", - "pd.merge(df3, df4)\n", - " employee group hire_date supervisor\n", - "0 Bob Accounting 2008 Carly\n", - "1 Jake Engineering 2012 Guido\n", - "2 Lisa Engineering 2004 Guido\n", - "3 Sue HR 2014 Steve" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - }, { "name": "stdout", "output_type": "stream", @@ -599,7 +415,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -818,7 +634,7 @@ "7 Sue HR organization" ] }, - "execution_count": 8, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -866,7 +682,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -874,6 +690,19 @@ "text/html": [ "
\n", "

df1

\n", + "\n", "\n", " \n", " \n", @@ -909,6 +738,19 @@ " \n", "
\n", "

df2

\n", + "\n", "
\n", " \n", " \n", @@ -944,6 +786,19 @@ " \n", "
\n", "

pd.merge(df1, df2, on='employee')

\n", + "\n", "
\n", " \n", " \n", @@ -1006,7 +861,7 @@ "3 Sue HR 2014" ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -1034,7 +889,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -1218,7 +1073,7 @@ "3 Sue HR Sue 90000" ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -1238,7 +1093,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -1304,7 +1159,7 @@ "3 Sue HR 90000" ] }, - "execution_count": 11, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -1325,7 +1180,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -1562,7 +1417,7 @@ "Sue 2014" ] }, - "execution_count": 32, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -1583,7 +1438,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -1770,7 +1625,7 @@ "Sue HR 2014" ] }, - "execution_count": 27, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -1782,7 +1637,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -1850,10 +1705,21 @@ "Sue HR 2014" ] }, - "execution_count": 33, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" - }, + } + ], + "source": [ + "%f \\ 既然已經用 df.set_index() 弄好了 index, 用 pd.concat() 也可以\n", + "pd.concat([df1a, df2a], axis='columns')" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ { "name": "stdout", "output_type": "stream", @@ -1864,8 +1730,6 @@ } ], "source": [ - "%f \\ 既然已經用 df.set_index() 弄好了 index, 用 pd.concat() 也可以\n", - "pd.concat([df1a, df2a], axis='columns')\n", "%f _ type tib. \\ 結果很類似,index 的 name 不見了,但還是 data-frame " ] }, @@ -1878,7 +1742,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 15, "metadata": { "scrolled": true }, @@ -1888,6 +1752,19 @@ "text/html": [ "
\n", "

df1a

\n", + "\n", "
\n", " \n", " \n", @@ -1922,6 +1799,19 @@ " \n", "
\n", "

df2a

\n", + "\n", "
\n", " \n", " \n", @@ -1956,6 +1846,19 @@ " \n", "
\n", "

df1a.join(df2a)

\n", + "\n", "
\n", " \n", " \n", @@ -2021,7 +1924,7 @@ "Sue HR 2014" ] }, - "execution_count": 11, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -2039,7 +1942,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -2130,7 +2033,7 @@ "Sue NaN 2014.0" ] }, - "execution_count": 38, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -2148,7 +2051,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -2156,6 +2059,19 @@ "text/html": [ "
\n", "

df1a

\n", + "\n", "
\n", " \n", " \n", @@ -2190,6 +2106,19 @@ " \n", "
\n", "

df3

\n", + "\n", "
\n", " \n", " \n", @@ -2225,6 +2154,19 @@ " \n", "
\n", "

pd.merge(df1a, df3, left_index=True, right_on='name')

\n", + "\n", "
\n", " \n", " \n", @@ -2288,7 +2230,7 @@ "3 HR Sue 90000" ] }, - "execution_count": 12, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -2322,7 +2264,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -2462,7 +2404,7 @@ "0 Mary bread wine" ] }, - "execution_count": 39, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -2488,13 +2430,26 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -2520,7 +2475,7 @@ "0 Mary bread wine" ] }, - "execution_count": 14, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -2539,7 +2494,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -2547,147 +2502,19 @@ "text/html": [ "
\n", "

df6

\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namefood
0Peterfish
1Paulbeans
2Marybread
\n", - "
\n", - "
\n", - "
\n", - "

df7

\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namedrink
0Marywine
1Josephbeer
\n", - "
\n", - "
\n", - "
\n", - "

pd.merge(df6, df7, how='outer')

\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namefooddrink
0PeterfishNaN
1PaulbeansNaN
2Marybreadwine
3JosephNaNbeer
\n", - "
\n", - "
" - ], - "text/plain": [ - "df6\n", - " name food\n", - "0 Peter fish\n", - "1 Paul beans\n", - "2 Mary bread\n", + "\n", "\n", " \n", " \n", @@ -2718,6 +2545,19 @@ " \n", "
\n", "

df7

\n", + "\n", "
\n", " \n", " \n", @@ -2742,7 +2582,20 @@ "\n", " \n", "
\n", - "

pd.merge(df6, df7, how='left')

\n", + "

pd.merge(df6, df7, how='outer')

\n", + "\n", "
\n", " \n", " \n", @@ -2771,6 +2624,12 @@ " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", "
breadwine
3JosephNaNbeer
\n", "
\n", @@ -2788,57 +2647,41 @@ "0 Mary wine\n", "1 Joseph beer\n", "\n", - "pd.merge(df6, df7, how='left')\n", - " name food drink\n", - "0 Peter fish NaN\n", - "1 Paul beans NaN\n", - "2 Mary bread wine" + "pd.merge(df6, df7, how='outer')\n", + " name food drink\n", + "0 Peter fish NaN\n", + "1 Paul beans NaN\n", + "2 Mary bread wine\n", + "3 Joseph NaN beer" ] }, - "execution_count": 16, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "display('df6', 'df7', \"pd.merge(df6, df7, how='left')\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The output rows now correspond to the entries in the left input. Using\n", - "``how='right'`` works in a similar manner.\n", - "\n", - "All of these options can be applied straightforwardly to any of the preceding join types." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Overlapping Column Names: The ``suffixes`` Keyword" + "display('df6', 'df7', \"pd.merge(df6, df7, how='outer')\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Finally, you may end up in a case where your two input ``DataFrame``s have conflicting column names.\n", - "Consider this example:" + "The *left join* and *right join* return joins over the left entries and right entries, respectively.\n", + "For example:" ] }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", - "

df8

\n", + "

df6

\n", "\n", "\n", " \n", " \n", @@ -4513,7 +4379,7 @@ "197 CA total 2010 37333601.0 California 163707.0" ] }, - "execution_count": 30, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } @@ -4533,7 +4399,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 42, "metadata": { "collapsed": true }, @@ -4545,7 +4411,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 43, "metadata": {}, "outputs": [ { @@ -4560,7 +4426,7 @@ "dtype: float64" ] }, - "execution_count": 32, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } @@ -4582,7 +4448,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 44, "metadata": {}, "outputs": [ { @@ -4597,7 +4463,7 @@ "dtype: float64" ] }, - "execution_count": 33, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } diff --git a/notebooks/03.08-Aggregation-and-Grouping.ipynb b/notebooks/03.08-Aggregation-and-Grouping.ipynb index b30bcab93..22e04cc11 100644 --- a/notebooks/03.08-Aggregation-and-Grouping.ipynb +++ b/notebooks/03.08-Aggregation-and-Grouping.ipynb @@ -1,5 +1,23 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "reDef unknown\n", + "reDef \\\n" + ] + } + ], + "source": [ + "%run peforth.ipynb" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -23,7 +41,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Aggregation and Grouping" + "# Aggregation 集合 and Grouping" ] }, { @@ -33,7 +51,9 @@ }, "source": [ "An essential piece of analysis of large data is efficient summarization: computing aggregations like ``sum()``, ``mean()``, ``median()``, ``min()``, and ``max()``, in which a single number gives insight into the nature of a potentially large dataset.\n", - "In this section, we'll explore aggregations in Pandas, from simple operations akin to what we've seen on NumPy arrays, to more sophisticated operations based on the concept of a ``groupby``." + "In this section, we'll explore aggregations in Pandas, from simple operations akin to 類似於 what we've seen on NumPy arrays, to more sophisticated operations based on the concept of a ``groupby``.\n", + "\n", + "從下面的說明看來,所謂 Aggregation 就是 ufunc 的 Data-Frame 版,除了一般的加減乘除,還加上 .describe(), .std(), .var(), 更不用說 .count(), .min(), .max() 等等,這些統計上的 Universal Functions or Aggregations 了。" ] }, { @@ -45,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": { "collapsed": true }, @@ -83,10 +103,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false - }, + "execution_count": 5, + "metadata": {}, "outputs": [ { "data": { @@ -94,7 +112,7 @@ "(1035, 6)" ] }, - "execution_count": 2, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -107,15 +125,26 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, + "execution_count": 11, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -187,7 +216,7 @@ "4 Radial Velocity 1 516.220 10.50 119.47 2009" ] }, - "execution_count": 3, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -196,6 +225,151 @@ "planets.head()" ] }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
methodnumberorbital_periodmassdistanceyear
1030Transit13.941507NaN172.02006
1031Transit12.615864NaN148.02007
1032Transit13.191524NaN174.02007
1033Transit14.125083NaN293.02008
1034Transit14.187757NaN260.02008
\n", + "
" + ], + "text/plain": [ + " method number orbital_period mass distance year\n", + "1030 Transit 1 3.941507 NaN 172.0 2006\n", + "1031 Transit 1 2.615864 NaN 148.0 2007\n", + "1032 Transit 1 3.191524 NaN 174.0 2007\n", + "1033 Transit 1 4.125083 NaN 293.0 2008\n", + "1034 Transit 1 4.187757 NaN 260.0 2008" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "planets.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "planets type tib. \\ ==> ()\n", + "這是 data-frame\n", + "\n", + "method False\n", + "number False\n", + "orbital_period True\n", + "mass True\n", + "distance True\n", + "year False\n", + "dtype: bool\n", + "3 個 column 有問題。\n", + "\n", + "method False\n", + "number False\n", + "orbital_period False\n", + "mass False\n", + "distance False\n", + "year False\n", + "dtype: bool\n", + "pd.dropna() 的效果\n", + "\n", + "cr planets :> dropna() :> shape tib. \\ ==> (498, 6) ()\n", + "本來是 (1035, 6) 所剩無幾\n" + ] + } + ], + "source": [ + "# 現學現賣,我想想怎麼列出有問題的 columns? pd.isnull\n", + "%f planets type tib. \\ 這是 data-frame\n", + "%f cr planets :> isnull().any() . cr \\ 3 個 column 有問題。\n", + "%f cr planets :> dropna().isnull().any() . cr \\ pd.dropna() 的效果\n", + "%f cr planets :> dropna() :> shape tib. \\ 本來是 (1035, 6) 所剩無幾\n" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -221,9 +395,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -250,9 +422,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -272,9 +442,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -301,9 +469,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -370,9 +536,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -401,9 +565,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -436,9 +598,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -550,7 +710,7 @@ "metadata": {}, "source": [ "This can be a useful way to begin understanding the overall properties of a dataset.\n", - "For example, we see in the ``year`` column that although exoplanets were discovered as far back as 1989, half of all known expolanets were not discovered until 2010 or after.\n", + "For example, we see in the ``year`` column that although exoplanets were discovered as far back as 1989, half of all known exoplanets were not discovered until 2010 or after.\n", "This is largely thanks to the *Kepler* mission, which is a space-based telescope specifically designed for finding eclipsing planets around other stars." ] }, @@ -596,9 +756,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Split, apply, combine\n", + "### Split, apply, combine 這個好玩,看圖就知道\n", "\n", - "A canonical example of this split-apply-combine operation, where the \"apply\" is a summation aggregation, is illustrated in this figure:" + "A canonical 依教规的 example of this split-apply-combine operation, where the \"apply\" is a summation aggregation, is illustrated in this figure:" ] }, { @@ -628,15 +788,26 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": false - }, + "execution_count": 21, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "\n", " \n", " \n", @@ -690,7 +861,7 @@ "5 C 5" ] }, - "execution_count": 11, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -711,9 +882,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -744,9 +913,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -830,9 +997,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -852,9 +1017,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -882,9 +1045,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -931,9 +1092,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -977,9 +1136,7 @@ { "cell_type": "code", "execution_count": 18, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1189,9 +1346,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1285,9 +1440,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1379,9 +1532,7 @@ { "cell_type": "code", "execution_count": 21, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1451,9 +1602,7 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1644,9 +1793,7 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1729,9 +1876,7 @@ { "cell_type": "code", "execution_count": 24, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1904,9 +2049,7 @@ { "cell_type": "code", "execution_count": 25, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -2031,9 +2174,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -2165,9 +2306,7 @@ { "cell_type": "code", "execution_count": 27, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -2288,9 +2427,7 @@ { "cell_type": "code", "execution_count": 28, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -2415,9 +2552,7 @@ { "cell_type": "code", "execution_count": 29, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -2483,9 +2618,7 @@ { "cell_type": "code", "execution_count": 30, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -2647,9 +2780,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.1" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } From bf07b4eec34546c0c3cdeeb6170d15eedcd3414e Mon Sep 17 00:00:00 2001 From: "H.C. Chen" Date: Sun, 5 Aug 2018 17:07:59 +0800 Subject: [PATCH 13/13] =?UTF-8?q?8/5=20=EF=BC=A0=20=E5=87=B1=E6=82=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- notebooks/03.07-Merge-and-Join.ipynb | 2 +- .../03.08-Aggregation-and-Grouping.ipynb | 1528 +++++++++++++++-- 2 files changed, 1425 insertions(+), 105 deletions(-) diff --git a/notebooks/03.07-Merge-and-Join.ipynb b/notebooks/03.07-Merge-and-Join.ipynb index 637162612..10901bf98 100644 --- a/notebooks/03.07-Merge-and-Join.ipynb +++ b/notebooks/03.07-Merge-and-Join.ipynb @@ -4508,7 +4508,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.1" + "version": "3.6.0" } }, "nbformat": 4, diff --git a/notebooks/03.08-Aggregation-and-Grouping.ipynb b/notebooks/03.08-Aggregation-and-Grouping.ipynb index 22e04cc11..cda328feb 100644 --- a/notebooks/03.08-Aggregation-and-Grouping.ipynb +++ b/notebooks/03.08-Aggregation-and-Grouping.ipynb @@ -3,7 +3,9 @@ { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [ { "name": "stdout", @@ -66,9 +68,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -103,7 +103,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -112,7 +112,7 @@ "(1035, 6)" ] }, - "execution_count": 5, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -125,7 +125,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -216,7 +216,7 @@ "4 Radial Velocity 1 516.220 10.50 119.47 2009" ] }, - "execution_count": 11, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -227,7 +227,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -318,7 +318,7 @@ "1034 Transit 1 4.187757 NaN 260.0 2008" ] }, - "execution_count": 10, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -329,7 +329,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -394,7 +394,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -408,7 +408,7 @@ "dtype: float64" ] }, - "execution_count": 4, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -421,16 +421,16 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "2.8119254917081569" + "2.811925491708157" ] }, - "execution_count": 5, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -441,16 +441,16 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.56238509834163142" + "0.5623850983416314" ] }, - "execution_count": 6, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -468,13 +468,26 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -522,7 +535,7 @@ "4 0.708073 0.181825" ] }, - "execution_count": 7, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -535,7 +548,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -546,7 +559,7 @@ "dtype: float64" ] }, - "execution_count": 8, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -564,7 +577,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -578,7 +591,7 @@ "dtype: float64" ] }, - "execution_count": 9, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -597,13 +610,26 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -696,7 +722,7 @@ "max 6.00000 17337.500000 25.000000 354.000000 2014.000000" ] }, - "execution_count": 10, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -788,7 +814,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -861,7 +887,7 @@ "5 C 5" ] }, - "execution_count": 21, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -881,21 +907,22 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 12, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "# 結果是個 pd.DataFrameGroupBy object \n", "df.groupby('key')" ] }, @@ -912,13 +939,26 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -955,7 +995,7 @@ "C 7" ] }, - "execution_count": 13, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -991,21 +1031,47 @@ "#### Column indexing\n", "\n", "The ``GroupBy`` object supports column indexing in the same way as the ``DataFrame``, and returns a modified ``GroupBy`` object.\n", - "For example:" + "For example:\n", + "\n", + "既然要 GroupBy 那我猜 `df.groupby('key')` 的 key column 必定要有重複的 items, 否則有何意義?" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "array(['Radial Velocity', 'Imaging', 'Eclipse Timing Variations',\n", + " 'Transit', 'Astrometry', 'Transit Timing Variations',\n", + " 'Orbital Brightness Modulation', 'Microlensing', 'Pulsar Timing',\n", + " 'Pulsation Timing Variations'], dtype=object)" ] }, - "execution_count": 14, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 學以致用,查看得知確實 'method' column 下有很多種,每種當然都重複出現很多次在 method 欄下。\n", + "planets['method'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1016,22 +1082,33 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 19, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "planets :> columns tib. \\ ==> Index(['method', 'number', 'orbital_period', 'mass', 'distance', 'year'], dtype='object') ()\n", + "有 'orbital_period' 這個 column\n" + ] + }, { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 15, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "planets.groupby('method')['orbital_period']" + "# 到這裡,瞭解了前述 \"lazy evaluation\" 的意思了, groupby object 指定好對象的 statements \n", + "# 都只產生 groupby object 而未真的做事\n", + "%f planets :> columns tib. \\ 有 'orbital_period' 這個 column \n", + "planets.groupby('method')['orbital_period']\n" ] }, { @@ -1044,7 +1121,35 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " method number orbital_period mass distance year\n", + "113 Astrometry 1 246.36 NaN 20.77 2013\n", + "537 Astrometry 1 1016.00 NaN 14.98 2010\n", + "\n", + "planets :> query('method==\"Astrometry\"') type tib. \\ ==> ()\n", + " method number orbital_period mass distance year\n", + "958 Pulsation Timing Variations 1 1170.0 NaN NaN 2007\n", + "這方法只發現一顆,特別有助於瞭解 groupby 的應用。\n" + ] + } + ], + "source": [ + "# 我來先看清楚這個局部,下一 cell 的意思就明白了。\n", + "%f planets :> query('method==\"Astrometry\"') . cr cr \n", + "%f planets :> query('method==\"Astrometry\"') type tib.\n", + "%f planets :>~ query('method==\"Pulsation Timing Variations\"')\n", + "%f . cr \\ 這方法只發現一顆,特別有助於瞭解 groupby 的應用。 " + ] + }, + { + "cell_type": "code", + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -1064,13 +1169,15 @@ "Name: orbital_period, dtype: float64" ] }, - "execution_count": 16, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "planets.groupby('method')['orbital_period'].median()" + "planets.groupby('method')['orbital_period'].median() \n", + "# Astrometry 的 min() 就是 246.36 max() 就是 1016.00 所以 groupby on 'method' \n", + "# 但是 Aggregation .median() 是取 'orbital_period', 明白了。" ] }, { @@ -1084,14 +1191,24 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "\n", "#### Iteration over groups\n", "\n", + "想看看 Groupby object 即 planets.groupby('method') 裡面啥東西可以用 groupby.head() .tail() 查看。\n", + "也可以用 `[(key,group) for (key,group) in df.groupby('column')]` 去展開它,這就透露出了 groupby \n", + "object 在 iterating 時吐出來的東西是個 tuple 其中 key group 兩個只是代名詞可以任取。而 key 就是下面例子中的 'Astrometry','Pulsation Timing Variations' 等等 method 的 instances 一個個輪番跑出來; 這些 key 所對應的 group 就是以下例子跑出來的結果,其 type 還是 data-frame, 之前已經研究過了。\n", + "\n", + " %f planets :> query('method==\"Astrometry\"') . cr cr \n", + " %f planets :>~ query('method==\"Pulsation Timing Variations\"')\n", + " ...\n", + "\n", + "\n", "The ``GroupBy`` object supports direct iteration over the groups, returning each group as a ``Series`` or ``DataFrame``:" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -1135,13 +1252,26 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -1310,7 +1440,112 @@ "Transit Timing Variations 2012.5 2013.25 2014.0 " ] }, - "execution_count": 18, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "planets.groupby('method')['year'].describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "_ type tib. \\ ==> ()\n", + "df.describe() 的結果還是個 data-frame\n", + "_ :> shape tib. \\ ==> (10, 8) ()\n", + "當然 shape 是它自己的樣子\n" + ] + } + ], + "source": [ + "%f _ type tib. \\ df.describe() 的結果還是個 data-frame\n", + "%f _ :> shape tib. \\ 當然 shape 是它自己的樣子" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + " method \n", + "count Astrometry 2.000000\n", + " Eclipse Timing Variations 9.000000\n", + " Imaging 38.000000\n", + " Microlensing 23.000000\n", + " Orbital Brightness Modulation 3.000000\n", + " Pulsar Timing 5.000000\n", + " Pulsation Timing Variations 1.000000\n", + " Radial Velocity 553.000000\n", + " Transit 397.000000\n", + " Transit Timing Variations 4.000000\n", + "mean Astrometry 2011.500000\n", + " Eclipse Timing Variations 2010.000000\n", + " Imaging 2009.131579\n", + " Microlensing 2009.782609\n", + " Orbital Brightness Modulation 2011.666667\n", + " Pulsar Timing 1998.400000\n", + " Pulsation Timing Variations 2007.000000\n", + " Radial Velocity 2007.518987\n", + " Transit 2011.236776\n", + " Transit Timing Variations 2012.500000\n", + "std Astrometry 2.121320\n", + " Eclipse Timing Variations 1.414214\n", + " Imaging 2.781901\n", + " Microlensing 2.859697\n", + " Orbital Brightness Modulation 1.154701\n", + " Pulsar Timing 8.384510\n", + " Pulsation Timing Variations NaN\n", + " Radial Velocity 4.249052\n", + " Transit 2.077867\n", + " Transit Timing Variations 1.290994\n", + " ... \n", + "50% Astrometry 2011.500000\n", + " Eclipse Timing Variations 2010.000000\n", + " Imaging 2009.000000\n", + " Microlensing 2010.000000\n", + " Orbital Brightness Modulation 2011.000000\n", + " Pulsar Timing 1994.000000\n", + " Pulsation Timing Variations 2007.000000\n", + " Radial Velocity 2009.000000\n", + " Transit 2012.000000\n", + " Transit Timing Variations 2012.500000\n", + "75% Astrometry 2012.250000\n", + " Eclipse Timing Variations 2011.000000\n", + " Imaging 2011.000000\n", + " Microlensing 2012.000000\n", + " Orbital Brightness Modulation 2012.000000\n", + " Pulsar Timing 2003.000000\n", + " Pulsation Timing Variations 2007.000000\n", + " Radial Velocity 2011.000000\n", + " Transit 2013.000000\n", + " Transit Timing Variations 2013.250000\n", + "max Astrometry 2013.000000\n", + " Eclipse Timing Variations 2012.000000\n", + " Imaging 2013.000000\n", + " Microlensing 2013.000000\n", + " Orbital Brightness Modulation 2013.000000\n", + " Pulsar Timing 2011.000000\n", + " Pulsation Timing Variations 2007.000000\n", + " Radial Velocity 2014.000000\n", + " Transit 2014.000000\n", + " Transit Timing Variations 2014.000000\n", + "Length: 80, dtype: float64" + ] + }, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } @@ -1319,6 +1554,40 @@ "planets.groupby('method')['year'].describe().unstack()" ] }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "_ type tib. \\ ==> ()\n", + "df.unstack() 的結果就變成 pd.Series 了,而且是 Multi-Indexed 往下看就知道。\n", + "\n", + "cr _ :> shape tib. \\ ==> (80,) ()\n", + "shape 變了\n", + "\n", + "cr _ :> index tib. \\ ==> MultiIndex(levels=[['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'], ['Astrometry', 'Eclipse Timing Variations', 'Imaging', 'Microlensing', 'Orbital Brightness Modulation', 'Pulsar Timing', 'Pulsation Timing Variations', 'Radial Velocity', 'Transit', 'Transit Timing Variations']],\n", + " labels=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]],\n", + " names=[None, 'method']) ()\n", + "See! Multi-Indexed\n", + "\n", + "cr _ :> values[:5] tib. \\ ==> [ 2. 9. 38. 23. 3.] ()\n", + "既然看過了 pd.Series.index, 順便也複習一下 ~.values\n" + ] + } + ], + "source": [ + "%f _ type tib. \\ df.unstack() 的結果就變成 pd.Series 了,而且是 Multi-Indexed 往下看就知道。\n", + "%f cr _ :> shape tib. \\ shape 變了\n", + "%f cr _ :> index tib. \\ See! Multi-Indexed \n", + "%f cr _ :> values[:5] tib. \\ 既然看過了 pd.Series.index, 順便也複習一下 ~.values" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1345,13 +1614,26 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -1412,7 +1694,7 @@ "5 C 5 9" ] }, - "execution_count": 19, + "execution_count": 59, "metadata": {}, "output_type": "execute_result" } @@ -1434,18 +1716,33 @@ "\n", "We're now familiar with ``GroupBy`` aggregations with ``sum()``, ``median()``, and the like, but the ``aggregate()`` method allows for even more flexibility.\n", "It can take a string, a function, or a list thereof, and compute all the aggregates at once.\n", - "Here is a quick example combining all these:" + "Here is a quick example combining all these:\n", + "\n", + "groupby.aggregate() 就是一般化的 .describe() 但是他能接受的 ufunc 形式也真是太有彈性了!\n" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -1513,7 +1810,7 @@ "C 2 3.5 5 3 6.0 9" ] }, - "execution_count": 20, + "execution_count": 60, "metadata": {}, "output_type": "execute_result" } @@ -1531,24 +1828,37 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + "\n", + "
data1data2
key
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1579,12 +1889,14 @@ "C 2 9" ] }, - "execution_count": 21, + "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "# groupby 可當成 data-frame 故 data1, data2 視為 column. 倒是 'min', 'max' 被 aggregate()\n", + "# 當成 universal function name 比較厲害。\n", "df.groupby('key').aggregate({'data1': 'min',\n", " 'data2': 'max'})" ] @@ -1601,7 +1913,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 62, "metadata": {}, "outputs": [ { @@ -1609,6 +1921,19 @@ "text/html": [ "
\n", "

df

\n", + "\n", "
data1data2
key
\n", " \n", " \n", @@ -1661,6 +1986,19 @@ " \n", "
\n", "

df.groupby('key').std()

\n", + "\n", "
\n", " \n", " \n", @@ -1696,6 +2034,19 @@ " \n", "
\n", "

df.groupby('key').filter(filter_func)

\n", + "\n", "
\n", " \n", " \n", @@ -1760,7 +2111,7 @@ "5 C 5 9" ] }, - "execution_count": 22, + "execution_count": 62, "metadata": {}, "output_type": "execute_result" } @@ -1792,13 +2143,26 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -1852,7 +2216,7 @@ "5 1.5 3.0" ] }, - "execution_count": 23, + "execution_count": 63, "metadata": {}, "output_type": "execute_result" } @@ -1875,7 +2239,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 64, "metadata": {}, "outputs": [ { @@ -1883,6 +2247,19 @@ "text/html": [ "
\n", "

df

\n", + "\n", "
\n", " \n", " \n", @@ -1935,6 +2312,19 @@ " \n", "
\n", "

df.groupby('key').apply(norm_by_data2)

\n", + "\n", "
\n", " \n", " \n", @@ -2006,7 +2396,7 @@ "5 C 0.416667 9" ] }, - "execution_count": 24, + "execution_count": 64, "metadata": {}, "output_type": "execute_result" } @@ -2048,7 +2438,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 65, "metadata": {}, "outputs": [ { @@ -2056,6 +2446,19 @@ "text/html": [ "
\n", "

df

\n", + "\n", "
\n", " \n", " \n", @@ -2108,6 +2511,19 @@ " \n", "
\n", "

df.groupby(L).sum()

\n", + "\n", "
\n", " \n", " \n", @@ -2154,7 +2570,7 @@ "2 4 7" ] }, - "execution_count": 25, + "execution_count": 65, "metadata": {}, "output_type": "execute_result" } @@ -2173,7 +2589,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 66, "metadata": {}, "outputs": [ { @@ -2181,6 +2597,19 @@ "text/html": [ "
\n", "

df

\n", + "\n", "
\n", " \n", " \n", @@ -2233,6 +2662,19 @@ " \n", "
\n", "

df.groupby(df['key']).sum()

\n", + "\n", "
\n", " \n", " \n", @@ -2285,7 +2727,7 @@ "C 7 12" ] }, - "execution_count": 26, + "execution_count": 66, "metadata": {}, "output_type": "execute_result" } @@ -2305,7 +2747,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 67, "metadata": {}, "outputs": [ { @@ -2313,6 +2755,19 @@ "text/html": [ "
\n", "

df2

\n", + "\n", "
\n", " \n", " \n", @@ -2363,6 +2818,19 @@ " \n", "
\n", "

df2.groupby(mapping).sum()

\n", + "\n", "
\n", " \n", " \n", @@ -2404,7 +2872,7 @@ "vowel 3 8" ] }, - "execution_count": 27, + "execution_count": 67, "metadata": {}, "output_type": "execute_result" } @@ -2426,7 +2894,37 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "py> str.lower tib. \\ ==> ()\n", + "df2 :> index tib. \\ ==> Index(['A', 'B', 'C', 'A', 'B', 'C'], dtype='object', name='key') ()\n" + ] + }, + { + "data": { + "text/plain": [ + "\"index(['a', 'b', 'c', 'a', 'b', 'c'], dtype='object', name='key')\"" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%f py> str.lower tib.\n", + "%f df2 :> index tib.\n", + "str(df2.index).lower()" + ] + }, + { + "cell_type": "code", + "execution_count": 68, "metadata": {}, "outputs": [ { @@ -2434,6 +2932,19 @@ "text/html": [ "
\n", "

df2

\n", + "\n", "
\n", " \n", " \n", @@ -2484,6 +2995,19 @@ " \n", "
\n", "

df2.groupby(str.lower).mean()

\n", + "\n", "
\n", " \n", " \n", @@ -2531,12 +3055,13 @@ "c 3.5 6.0" ] }, - "execution_count": 28, + "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "# 我猜,df2 有 index 因此 str.lower method 對 index 作用\n", "display('df2', 'df2.groupby(str.lower).mean()')" ] }, @@ -2551,13 +3076,26 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 78, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -2597,7 +3135,7 @@ "c consonant 3.5 6.0" ] }, - "execution_count": 29, + "execution_count": 78, "metadata": {}, "output_type": "execute_result" } @@ -2617,25 +3155,810 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 88, "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + "name": "stdout", + "output_type": "stream", + "text": [ + "decade type tib. \\ ==> ()\n", + "這東西一開始就是 pd.Series\n", + "\n", + "cr decade type tib. \\ ==> ()\n", + "尾綴上 's' 的手法我很陌生\n", + "\n", + "cr decade :> head() tib. \\ ==> 0 2000s\n", + "1 2000s\n", + "2 2010s\n", + "3 2000s\n", + "4 2000s\n", + "Name: year, dtype: object ()\n", + ".name 加上之前\n", + "\n", + "cr decade :> head() tib. \\ ==> 0 2000s\n", + "1 2000s\n", + "2 2010s\n", + "3 2000s\n", + "4 2000s\n", + "Name: decade, dtype: object ()\n", + ".name 加上之後\n" + ] + } + ], + "source": [ + "decade = 10 * (planets['year'] // 10)\n", + "%f decade type tib. \\ 這東西一開始就是 pd.Series\n", + "decade = decade.astype(str) + 's' \n", + "%f cr decade type tib. \\ 尾綴上 's' 的手法我很陌生\n", + "%f cr decade :> head() tib. \\ .name 改掉之前,本來是 'year'\n", + "decade.name = 'decade'\n", + "%f cr decade :> head() tib. \\ .name 改掉之後,變成 'decade'" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "
decade1980s1990s2000s2010s
method
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
methodnumberorbital_periodmassdistanceyear
0Radial Velocity1269.3000007.10077.402006
1Radial Velocity1874.7740002.21056.952008
2Radial Velocity1763.0000002.60019.842011
3Radial Velocity1326.03000019.400110.622007
4Radial Velocity1516.22000010.500119.472009
5Radial Velocity1185.8400004.80076.392008
7Radial Velocity1798.500000NaN21.411996
9Radial Velocity2452.8000001.99074.792010
10Radial Velocity2883.0000000.86074.792010
13Radial Velocity31078.0000002.53014.081996
16Radial Velocity14.2307850.47215.361995
17Radial Velocity514.6510000.80012.531996
20Radial Velocity50.736540NaN12.532011
25Radial Velocity1116.688400NaN18.111996
26Radial Velocity1691.900000NaN81.502012
29Imaging1NaNNaN45.522005
30Imaging1NaNNaN165.002007
31Imaging1NaNNaN140.002004
32Eclipse Timing Variations110220.0000006.050NaN2009
33Imaging1NaNNaNNaN2008
34Imaging1NaNNaN145.002013
35Imaging1NaNNaN139.002004
37Eclipse Timing Variations25767.000000NaN130.722008
38Eclipse Timing Variations23321.000000NaN130.722008
39Eclipse Timing Variations25573.550000NaN500.002010
40Eclipse Timing Variations22883.500000NaN500.002010
41Eclipse Timing Variations12900.000000NaNNaN2011
42Eclipse Timing Variations14343.5000004.200NaN2012
43Eclipse Timing Variations25840.000000NaNNaN2011
54Imaging1NaNNaN52.032012
.....................
101Transit113.240600NaN345.002010
102Transit12.994330NaN560.002010
103Transit12.828042NaN1150.002010
104Transit14.035190NaN1060.002010
113Astrometry1246.360000NaN20.772013
441Radial Velocity183.88800011.68040.571989
537Astrometry11016.000000NaN14.982010
680Transit Timing Variations2160.000000NaN2119.002011
736Transit Timing Variations257.011000NaN855.002012
749Transit Timing Variations3NaNNaNNaN2014
787Orbital Brightness Modulation20.240104NaN1180.002011
788Orbital Brightness Modulation20.342887NaN1180.002011
792Orbital Brightness Modulation11.544929NaNNaN2013
813Transit Timing Variations222.339500NaN339.002013
902Microlensing1NaNNaNNaN2008
903Microlensing1NaNNaNNaN2008
904Microlensing1NaNNaNNaN2009
905Microlensing1NaNNaN3600.002013
906Microlensing12780.000000NaNNaN2011
907Microlensing1NaNNaNNaN2010
908Microlensing11970.000000NaNNaN2010
909Microlensing1NaNNaN2300.002012
917Microlensing1NaNNaNNaN2004
918Microlensing13600.000000NaNNaN2005
941Pulsar Timing325.262000NaNNaN1992
942Pulsar Timing366.541900NaNNaN1992
943Pulsar Timing398.211400NaNNaN1994
944Pulsar Timing136525.000000NaNNaN2003
945Pulsar Timing10.090706NaN1200.002011
958Pulsation Timing Variations11170.000000NaNNaN2007
\n", + "

69 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " method number orbital_period mass distance \\\n", + "0 Radial Velocity 1 269.300000 7.100 77.40 \n", + "1 Radial Velocity 1 874.774000 2.210 56.95 \n", + "2 Radial Velocity 1 763.000000 2.600 19.84 \n", + "3 Radial Velocity 1 326.030000 19.400 110.62 \n", + "4 Radial Velocity 1 516.220000 10.500 119.47 \n", + "5 Radial Velocity 1 185.840000 4.800 76.39 \n", + "7 Radial Velocity 1 798.500000 NaN 21.41 \n", + "9 Radial Velocity 2 452.800000 1.990 74.79 \n", + "10 Radial Velocity 2 883.000000 0.860 74.79 \n", + "13 Radial Velocity 3 1078.000000 2.530 14.08 \n", + "16 Radial Velocity 1 4.230785 0.472 15.36 \n", + "17 Radial Velocity 5 14.651000 0.800 12.53 \n", + "20 Radial Velocity 5 0.736540 NaN 12.53 \n", + "25 Radial Velocity 1 116.688400 NaN 18.11 \n", + "26 Radial Velocity 1 691.900000 NaN 81.50 \n", + "29 Imaging 1 NaN NaN 45.52 \n", + "30 Imaging 1 NaN NaN 165.00 \n", + "31 Imaging 1 NaN NaN 140.00 \n", + "32 Eclipse Timing Variations 1 10220.000000 6.050 NaN \n", + "33 Imaging 1 NaN NaN NaN \n", + "34 Imaging 1 NaN NaN 145.00 \n", + "35 Imaging 1 NaN NaN 139.00 \n", + "37 Eclipse Timing Variations 2 5767.000000 NaN 130.72 \n", + "38 Eclipse Timing Variations 2 3321.000000 NaN 130.72 \n", + "39 Eclipse Timing Variations 2 5573.550000 NaN 500.00 \n", + "40 Eclipse Timing Variations 2 2883.500000 NaN 500.00 \n", + "41 Eclipse Timing Variations 1 2900.000000 NaN NaN \n", + "42 Eclipse Timing Variations 1 4343.500000 4.200 NaN \n", + "43 Eclipse Timing Variations 2 5840.000000 NaN NaN \n", + "54 Imaging 1 NaN NaN 52.03 \n", + ".. ... ... ... ... ... \n", + "101 Transit 1 13.240600 NaN 345.00 \n", + "102 Transit 1 2.994330 NaN 560.00 \n", + "103 Transit 1 2.828042 NaN 1150.00 \n", + "104 Transit 1 4.035190 NaN 1060.00 \n", + "113 Astrometry 1 246.360000 NaN 20.77 \n", + "441 Radial Velocity 1 83.888000 11.680 40.57 \n", + "537 Astrometry 1 1016.000000 NaN 14.98 \n", + "680 Transit Timing Variations 2 160.000000 NaN 2119.00 \n", + "736 Transit Timing Variations 2 57.011000 NaN 855.00 \n", + "749 Transit Timing Variations 3 NaN NaN NaN \n", + "787 Orbital Brightness Modulation 2 0.240104 NaN 1180.00 \n", + "788 Orbital Brightness Modulation 2 0.342887 NaN 1180.00 \n", + "792 Orbital Brightness Modulation 1 1.544929 NaN NaN \n", + "813 Transit Timing Variations 2 22.339500 NaN 339.00 \n", + "902 Microlensing 1 NaN NaN NaN \n", + "903 Microlensing 1 NaN NaN NaN \n", + "904 Microlensing 1 NaN NaN NaN \n", + "905 Microlensing 1 NaN NaN 3600.00 \n", + "906 Microlensing 1 2780.000000 NaN NaN \n", + "907 Microlensing 1 NaN NaN NaN \n", + "908 Microlensing 1 1970.000000 NaN NaN \n", + "909 Microlensing 1 NaN NaN 2300.00 \n", + "917 Microlensing 1 NaN NaN NaN \n", + "918 Microlensing 1 3600.000000 NaN NaN \n", + "941 Pulsar Timing 3 25.262000 NaN NaN \n", + "942 Pulsar Timing 3 66.541900 NaN NaN \n", + "943 Pulsar Timing 3 98.211400 NaN NaN \n", + "944 Pulsar Timing 1 36525.000000 NaN NaN \n", + "945 Pulsar Timing 1 0.090706 NaN 1200.00 \n", + "958 Pulsation Timing Variations 1 1170.000000 NaN NaN \n", + "\n", + " year \n", + "0 2006 \n", + "1 2008 \n", + "2 2011 \n", + "3 2007 \n", + "4 2009 \n", + "5 2008 \n", + "7 1996 \n", + "9 2010 \n", + "10 2010 \n", + "13 1996 \n", + "16 1995 \n", + "17 1996 \n", + "20 2011 \n", + "25 1996 \n", + "26 2012 \n", + "29 2005 \n", + "30 2007 \n", + "31 2004 \n", + "32 2009 \n", + "33 2008 \n", + "34 2013 \n", + "35 2004 \n", + "37 2008 \n", + "38 2008 \n", + "39 2010 \n", + "40 2010 \n", + "41 2011 \n", + "42 2012 \n", + "43 2011 \n", + "54 2012 \n", + ".. ... \n", + "101 2010 \n", + "102 2010 \n", + "103 2010 \n", + "104 2010 \n", + "113 2013 \n", + "441 1989 \n", + "537 2010 \n", + "680 2011 \n", + "736 2012 \n", + "749 2014 \n", + "787 2011 \n", + "788 2011 \n", + "792 2013 \n", + "813 2013 \n", + "902 2008 \n", + "903 2008 \n", + "904 2009 \n", + "905 2013 \n", + "906 2011 \n", + "907 2010 \n", + "908 2010 \n", + "909 2012 \n", + "917 2004 \n", + "918 2005 \n", + "941 1992 \n", + "942 1992 \n", + "943 1994 \n", + "944 2003 \n", + "945 2011 \n", + "958 2007 \n", + "\n", + "[69 rows x 6 columns]" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 分解動作查看,這個 groupby 我就已經不熟了。。。前面有見過,不太理解 [ ] \n", + "planets.groupby(['method', decade]).head()" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -2731,15 +4054,12 @@ "Transit Timing Variations 0.0 0.0 0.0 9.0" ] }, - "execution_count": 30, + "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "decade = 10 * (planets['year'] // 10)\n", - "decade = decade.astype(str) + 's'\n", - "decade.name = 'decade'\n", "planets.groupby(['method', decade])['number'].sum().unstack().fillna(0)" ] }, @@ -2748,7 +4068,7 @@ "metadata": {}, "source": [ "This shows the power of combining many of the operations we've discussed up to this point when looking at realistic datasets.\n", - "We immediately gain a coarse understanding of when and how planets have been discovered over the past several decades!\n", + "We immediately gain a coarse 粗糙的 understanding of when and how planets have been discovered over the past several decades!\n", "\n", "Here I would suggest digging into these few lines of code, and evaluating the individual steps to make sure you understand exactly what they are doing to the result.\n", "It's certainly a somewhat complicated example, but understanding these pieces will give you the means to similarly explore your own data." @@ -2780,7 +4100,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.1" + "version": "3.6.0" } }, "nbformat": 4,
decade1980s1990s2000s2010s
method