Commit bec620c
Jake Teo authored and committed
1 parent 7ff6952

File tree: 3 files changed (+149 -1 lines)

assumptions.rst (+17)

Tests for Assumptions
=====================

Normality
---------

.. code:: python

    import scipy.stats as stats
    stats.normaltest(df3['depth'])

    >>> NormaltestResult(statistic=33363.134206705407, pvalue=0.0)


Homogeneity of Variances
------------------------
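This section is empty in the commit. A minimal sketch of a homogeneity-of-variances check using Levene's test from ``scipy.stats`` follows; the synthetic groups are placeholders standing in for slices of the document's ``df3`` (e.g. ``df3['depth']`` split by a categorical column), not its actual data:

```python
import numpy as np
import scipy.stats as stats

# Placeholder groups; in the document's context these would be
# subsets of df3 grouped by a categorical variable.
rng = np.random.default_rng(0)
group1 = rng.normal(loc=0.0, scale=1.0, size=100)
group2 = rng.normal(loc=0.0, scale=3.0, size=100)

# Levene's test: H0 = the groups have equal variances.
# A small p-value means the equal-variance assumption is doubtful.
stat, pvalue = stats.levene(group1, group2)
print(stat, pvalue)
```

With clearly unequal spreads as above, the test rejects equal variances; a non-significant result would support using variance-pooling tests such as the standard t-test or ANOVA.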

difference.rst (+85 -1)

X, Explanatory: ``Categorical``
Y, Response: ``Categorical``
Type: ``Non-Parametric``

.. code:: python

    import scipy.stats as ss

    print('chi-square statistic, p-value, dof, expected counts')
    print(ss.chi2_contingency(ct1))

    chi-square statistic, p-value, dof, expected counts
    (1263.6306705804054, 2.554837585615145e-272, 4,
     array([[7.74251477e+03, 1.71950205e+03, 3.69930718e+02,
             4.25495413e+01, 2.50291420e+00],
            [7.72448523e+03, 1.71549795e+03, 3.69069282e+02,
             4.24504587e+01, 2.49708580e+00]]))
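``ct1`` above is a contingency table built earlier in the document. A self-contained sketch of how such a table can be formed with ``pd.crosstab`` and passed to ``chi2_contingency`` follows; the toy ``group``/``category`` data is illustrative, not the document's:

```python
import pandas as pd
import scipy.stats as ss

# Toy data standing in for the document's df3 / ct1
df = pd.DataFrame({
    'group':    ['a', 'a', 'a', 'b', 'b', 'b', 'b', 'a'],
    'category': ['x', 'y', 'x', 'x', 'y', 'y', 'y', 'x'],
})

# Cross-tabulate counts: rows = group, columns = category
ct = pd.crosstab(df['group'], df['category'])

# chi2_contingency returns (statistic, p-value, dof, expected counts);
# dof = (n_rows - 1) * (n_cols - 1)
chi2, p, dof, expected = ss.chi2_contingency(ct)
print(dof)
```

The expected-counts array has the same shape as the crosstab, which is why the output in the text prints a 2 x 5 array for a 2 x 5 table.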


Student's T-Test
----------------
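The body of this section lies outside the diff context shown here. A minimal sketch of a two-sample Student's t-test with ``scipy.stats.ttest_ind`` follows; the synthetic samples are placeholders, not the document's ``df3`` groups:

```python
import numpy as np
import scipy.stats as stats

# Synthetic samples standing in for two groups of observations
rng = np.random.default_rng(1)
a = rng.normal(loc=0.0, scale=1.0, size=50)
b = rng.normal(loc=1.0, scale=1.0, size=50)

# Two-sample t-test: H0 = equal means (assumes equal variances,
# which is why the homogeneity check above matters)
tstat, pvalue = stats.ttest_ind(a, b)
print(tstat, pvalue)
```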
ANOVA
-----
Type: ``Parametric``

Analysis of Variance (ANOVA).


.. code:: python

    #### IMPORT MODULES ####
    import numpy as np
    import pandas as pd
    import statsmodels.formula.api as smf
    import statsmodels.stats.multicomp as multi



    #### FIT MODEL ####
    # formula is response ~ explanatory; C() marks a categorical variable
    # ANOVA for multiple factors
    model = smf.ols(formula='diameter ~ C(layers)', data=df3)
    results = model.fit()
    print(results.summary())


                                OLS Regression Results
    ==============================================================================
    Dep. Variable:               diameter   R-squared:                       0.219
    Model:                            OLS   Adj. R-squared:                  0.219
    Method:                 Least Squares   F-statistic:                     1383.
    Date:                Tue, 02 Aug 2016   Prob (F-statistic):               0.00
    Time:                        17:04:57   Log-Likelihood:                -60976.
    No. Observations:               19731   AIC:                         1.220e+05
    Df Residuals:                   19726   BIC:                         1.220e+05
    Df Model:                           4
    Covariance Type:            nonrobust
    ==================================================================================
                         coef    std err          t      P>|t|      [95.0% Conf. Int.]
    ----------------------------------------------------------------------------------
    Intercept          6.7217      0.043    157.125      0.000         6.638     6.806
    C(layers)[T.2]     3.3941      0.100     33.822      0.000         3.197     3.591
    C(layers)[T.3]    12.2841      0.200     61.319      0.000        11.891    12.677
    C(layers)[T.4]    18.3139      0.579     31.649      0.000        17.180    19.448
    C(layers)[T.5]    21.8123      2.380      9.166      0.000        17.148    26.477
    ==============================================================================
    Omnibus:                    14916.319   Durbin-Watson:                   0.529
    Prob(Omnibus):                  0.000   Jarque-Bera (JB):           577157.627
    Skew:                           3.262   Prob(JB):                         0.00
    Kurtosis:                      28.680   Cond. No.                         64.0
    ==============================================================================

    Warnings:
    [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.




    #### POST-HOC TEST ####
    mc = multi.MultiComparison(df3['diameter'], df3['layers'])
    result1 = mc.tukeyhsd()
    print(result1)


    Multiple Comparison of Means - Tukey HSD, FWER=0.05
    =============================================
    group1 group2 meandiff  lower    upper   reject
    ---------------------------------------------
      1      2     3.3941   3.1204   3.6679   True
      1      3    12.2841  11.7376  12.8306   True
      1      4    18.3139  16.7353  19.8925   True
      1      5    21.8123  15.3204  28.3041   True
      2      3     8.89     8.3015   9.4785   True
      2      4    14.9198  13.3262  16.5134   True
      2      5    18.4181  11.9226  24.9137   True
      3      4     6.0298   4.3675   7.6921   True
      3      5     9.5281   3.0154  16.0409   True
      4      5     3.4984  -3.1806  10.1773   False
    ---------------------------------------------

supervised.rst (+47)

An ensemble of decision trees.


Logistic Regression
**************************
Binary output.

.. code:: python

    #### IMPORT MODULES ####
    import numpy as np
    import pandas as pd
    import statsmodels.api as sm



    #### FIT MODEL ####
    lreg = sm.Logit(df3['diameter_cut'], df3[trainC]).fit()
    print(lreg.summary())


    Optimization terminated successfully.
             Current function value: 0.518121
             Iterations 6
                           Logit Regression Results
    ==============================================================================
    Dep. Variable:           diameter_cut   No. Observations:                18067
    Model:                          Logit   Df Residuals:                    18065
    Method:                           MLE   Df Model:                            1
    Date:                Thu, 04 Aug 2016   Pseudo R-squ.:                  0.2525
    Time:                        14:13:14   Log-Likelihood:                -9360.9
    converged:                       True   LL-Null:                       -12523.
                                            LLR p-value:                     0.000
    ================================================================================
                       coef    std err          z      P>|z|      [95.0% Conf. Int.]
    --------------------------------------------------------------------------------
    depth            4.2529      0.067     63.250      0.000         4.121     4.385
    layers_YESNO    -2.1102      0.037    -57.679      0.000        -2.182    -2.039
    ================================================================================



    #### CONFIDENCE INTERVALS ####
    # exponentiate the coefficients and their confidence intervals
    # to express them as odds ratios
    params = lreg.params
    conf = lreg.conf_int()
    conf['OR'] = params
    conf.columns = ['Lower CI', 'Upper CI', 'OR']
    print(np.exp(conf))

                   Lower CI   Upper CI         OR
    depth         61.625434  80.209893  70.306255
    layers_YESNO   0.112824   0.130223   0.121212


Support Vector Machine
***********************