
Commit d1c9f2e (0 parents)

add ipynb for linear regression and logistic regression

8 files changed, +366 −0 lines changed
1.linear_regreesion_v1.ipynb

+1
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"1.linear_regreesion_v1.ipynb","version":"0.3.2","provenance":[],"collapsed_sections":[],"toc_visible":true},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"}},"cells":[{"metadata":{"id":"ax1r8W0rU8d1","colab_type":"text"},"cell_type":"markdown","source":["# 线性回归 - Linear Regreesion\n","注意:python版本为3.6\n"]},{"metadata":{"id":"kgTnKLvOU8d2","colab_type":"code","colab":{}},"cell_type":"code","source":["import pandas as pd\n","import seaborn as sns\n","sns.set(context=\"notebook\", style=\"whitegrid\", palette=\"dark\")\n","import matplotlib.pyplot as plt\n","import numpy as np"],"execution_count":0,"outputs":[]},{"metadata":{"id":"Nje4hmN0U8d6","colab_type":"code","colab":{}},"cell_type":"code","source":["df = pd.read_csv('ex1data1.txt', names=['population', 'profit']) # 读取数据并赋予列名"],"execution_count":0,"outputs":[]},{"metadata":{"id":"a9p1GK2iU8d8","colab_type":"code","colab":{}},"cell_type":"code","source":["df.head() # 显示数据前五行"],"execution_count":0,"outputs":[]},{"metadata":{"id":"Ly0Fs3unU8eA","colab_type":"code","colab":{}},"cell_type":"code","source":["df.info() # 打印df的class信息"],"execution_count":0,"outputs":[]},{"metadata":{"id":"WsQ2uPs6muT7","colab_type":"code","colab":{}},"cell_type":"code","source":["df.describe() # 打印df的统计信息"],"execution_count":0,"outputs":[]},{"metadata":{"id":"dtj0pJAOU8eE","colab_type":"text"},"cell_type":"markdown","source":["***\n","# 看下原始数据"]},{"metadata":{"id":"ON7EiaK7U8eE","colab_type":"code","colab":{}},"cell_type":"code","source":["sns.lmplot('population', 'profit', df, size=6, fit_reg=False)\n","plt.show()"],"execution_count":0,"outputs":[]},{"metadata":{"id":"wRWRxgtAU8eH","colab_type":"code","colab":{}},"cell_type":"code","source":["def get_X(df): # 读取特征\n","# \"\"\"\n","# use concat to add intersect feature to avoid side effect\n","# not efficient for big dataset though\n","# \"\"\"\n"," ones = pd.DataFrame({'ones': np.ones(len(df))})#ones是m行1列的dataframe\n"," data = pd.concat([ones, df], axis=1) # 合并数据,根据列合并\n"," return data.iloc[:, :-1].as_matrix() # 这个操作返回 ndarray,不是矩阵\n","\n","\n","def get_y(df):#读取标签\n","# '''assume the last column is the target'''\n"," return np.array(df.iloc[:, -1])#df.iloc[:, -1]是指df的最后一列\n","\n","\n","def normalize_feature(df):\n","# \"\"\"Applies function along input axis(default 0) of DataFrame.\"\"\"\n"," return df.apply(lambda column: (column - column.mean()) / column.std())#特征缩放"],"execution_count":0,"outputs":[]},{"metadata":{"id":"GPFqxv_zU8eJ","colab_type":"text"},"cell_type":"markdown","source":["多变量的假设 h 表示为:${{h}_{\\theta }}\\left( x \\right)={{\\theta }_{0}}+{{\\theta }_{1}}{{x}_{1}}+{{\\theta }_{2}}{{x}_{2}}+...+{{\\theta }_{n}}{{x}_{n}}$。\n","\n","这个公式中有n+1个参数和n个变量,为了使得公式能够简化一些,引入${{x}_{0}}=1$,则公式转化为: ${{h}_{\\theta }}\\left( x \\right)={{\\theta }_{0}x_0}+{{\\theta }_{1}}{{x}_{1}}+{{\\theta }_{2}}{{x}_{2}}+...+{{\\theta }_{n}}{{x}_{n}}$。\n","\n","此时模型中的参数是一个n+1维的向量,任何一个训练实例也都是n+1维的向量,特征矩阵X的维度是 m*(n+1)。 因此公式可以简化为:${{h}_{\\theta }}\\left( x \\right)={{\\theta }^{T}}X$,其中上标T代表矩阵转置。\n"]},{"metadata":{"id":"on8_khfsU8eQ","colab_type":"text"},"cell_type":"markdown","source":["# 计算代价函数\n","$$J\\left( \\theta \\right)=\\frac{1}{2m}\\sum\\limits_{i=1}^{m}{{{\\left( {{h}_{\\theta }}\\left( {{x}^{(i)}} \\right)-{{y}^{(i)}} \\right)}^{2}}}$$\n","\n","其中:\n","\n","$${{h}_{\\theta }}\\left( x \\right)={{\\theta }^{T}}X={{\\theta }_{0}}{{x}_{0}}+{{\\theta }_{1}}{{x}_{1}}+{{\\theta }_{2}}{{x}_{2}}+...+{{\\theta 
```python
# check the data dimensions
data = df
X = get_X(data)
print(X.shape, type(X))

y = get_y(data)
print(y.shape, type(y))
```

```python
theta = np.zeros(X.shape[1])  # X.shape[1] = 2: the intercept column plus one feature
print(theta)
```

```python
def lr_cost(theta, X, y):
    """Compute the cost function.
    X: R(m*n), m = number of samples, n = number of features
    y: R(m)
    theta: R(n), the linear-regression parameters
    """
    m = X.shape[0]  # number of samples

    inner = X @ theta - y  # R(m*1); X @ theta is equivalent to X.dot(theta)

    # 1*m @ m*1 = 1*1 in matrix multiplication, but numpy does not
    # transpose 1-d arrays, so this is simply the inner product of
    # the error vector with itself
    square_sum = inner.T @ inner
    cost = square_sum / (2 * m)

    return cost
```

```python
lr_cost(theta, X, y)  # return the value of the cost
```

# Batch Gradient Descent

$$\begin{aligned}\theta_j &:= \theta_j - \alpha\frac{\partial}{\partial\theta_j}J(\theta) \\ &:= \theta_j - \alpha\frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)x_j^{(i)}\end{aligned}$$

Note: all $\theta_j$ must be updated simultaneously.

```python
def gradient(theta, X, y):
    """Compute the gradient, i.e. the partial derivatives of J(θ)."""
    m = X.shape[0]

    inner = X.T @ (X @ theta - y)  # (m,n).T @ (m,1) -> (n,1); X @ theta is equivalent to X.dot(theta)

    return inner / m
```

```python
def batch_gradient_decent(theta, X, y, epoch, alpha=0.01):
    """Batch gradient descent: fit the linear regression and return
    the parameters together with the cost history.
    epoch: number of passes over the data
    """
    cost_data = [lr_cost(theta, X, y)]
    _theta = theta.copy()  # work on a copy so the caller's theta is untouched

    for _ in range(epoch):
        _theta = _theta - alpha * gradient(_theta, X, y)
        cost_data.append(lr_cost(_theta, X, y))

    return _theta, cost_data
```

```python
epoch = 500
final_theta, cost_data = batch_gradient_decent(theta, X, y, epoch)
```

```python
final_theta  # the fitted theta
```

```python
cost_data  # inspect the cost history
```

```python
# compute the final cost
lr_cost(final_theta, X, y)
```
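As a sanity check on the analytic gradient (not in the original notebook), it can be compared against a central-difference approximation; `numerical_gradient` is a hypothetical helper built on the `lr_cost` and `gradient` defined above:

```python
def numerical_gradient(theta, X, y, eps=1e-6):
    """Central-difference approximation of dJ/dθ, for checking `gradient`."""
    grad = np.zeros_like(theta)
    for j in range(len(theta)):
        e = np.zeros_like(theta)
        e[j] = eps
        grad[j] = (lr_cost(theta + e, X, y) - lr_cost(theta - e, X, y)) / (2 * eps)
    return grad

# The two gradients should agree to within roughly 1e-6
print(np.max(np.abs(numerical_gradient(theta, X, y) - gradient(theta, X, y))))
```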
Prediction performance of a scikit-learn model:

```python
from sklearn import linear_model
model = linear_model.LinearRegression()
model.fit(X, y)

x = X[:, 1]
f = model.predict(X).flatten()

plt.scatter(X[:, 1], y, label='Training Data')
plt.plot(x, f, 'r', label='Prediction')
plt.legend(loc=2)
plt.show()
```

# Visualizing the cost history

```python
# sns.tsplot has been removed from seaborn; plain matplotlib does the same job
plt.plot(np.arange(epoch + 1), cost_data)
plt.xlabel('epoch')
plt.ylabel('cost')
plt.show()
# the cost drops sharply over the first couple of epochs, then levels off
```

```python
b = final_theta[0]  # intercept
m = final_theta[1]  # slope

plt.scatter(data.population, data.profit, label="Training data")
plt.plot(data.population, data.population * m + b, 'r', label="Prediction")
plt.legend(loc=2)
plt.show()
```

# Optional: multi-variable linear regression

```python
raw_data = pd.read_csv('ex1data2.txt', names=['square', 'bedrooms', 'price'])
raw_data.head()
```

# 1. Normalizing the data

The simplest approach is

$$x_n := \frac{x_n - \mu_n}{s_n}$$

where $\mu_n$ is the mean and $s_n$ is the standard deviation.

```python
def normalize_feature(df):
    """Apply feature scaling along the columns (axis 0) of the DataFrame."""
    return df.apply(lambda column: (column - column.mean()) / column.std())
```

```python
data = normalize_feature(raw_data)
data.head()
```
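For comparison (not in the original notebook), scikit-learn's `StandardScaler` performs the same scaling; note that it divides by the population standard deviation (ddof=0) while `DataFrame.std()` uses the sample standard deviation (ddof=1), so the two results differ slightly:

```python
from sklearn.preprocessing import StandardScaler

scaled = StandardScaler().fit_transform(raw_data)
print(scaled.mean(axis=0))  # approximately 0 in every column
print(scaled.std(axis=0))   # exactly 1 (population std, ddof=0)
```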
# 2. Multi-variable batch gradient descent

```python
X = get_X(data)
print(X.shape, type(X))

y = get_y(data)
print(y.shape, type(y))  # check the dimensions and types
```

```python
alpha = 0.01                   # learning rate
theta = np.zeros(X.shape[1])   # X.shape[1]: the intercept column plus the features
epoch = 500                    # number of iterations
```

```python
final_theta, cost_data = batch_gradient_decent(theta, X, y, epoch, alpha=alpha)
```

```python
plt.plot(np.arange(len(cost_data)), cost_data)
plt.xlabel('epoch', fontsize=18)
plt.ylabel('cost', fontsize=18)
plt.show()
```

```python
final_theta
```

Prediction with scikit-learn:

```python
from mpl_toolkits.mplot3d import Axes3D
from sklearn import linear_model

model = linear_model.LinearRegression()
model.fit(X, y)

f = model.predict(X).flatten()

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.plot(X[:, 1], X[:, 2], f, 'r', label='Prediction')
ax.scatter(X[:, 1], X[:, 2], y, label='Training Data')
ax.legend(loc=2)
ax.set_xlabel('square')
ax.set_ylabel('bedrooms')
ax.set_zlabel('price')
ax.set_title('square & bedrooms vs. price')
# ax.view_init(30, 10)
plt.show()
```

# 3. Learning rate

```python
base = np.logspace(-1, -5, num=4)
candidate = np.sort(np.concatenate((base, base * 3)))
print(candidate)
```

```python
epoch = 50

fig, ax = plt.subplots(figsize=(8, 8))

for alpha in candidate:
    _, cost_data = batch_gradient_decent(theta, X, y, epoch, alpha=alpha)
    ax.plot(np.arange(epoch + 1), cost_data, label=alpha)

ax.set_xlabel('epoch', fontsize=12)
ax.set_ylabel('cost', fontsize=12)
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
ax.set_title('learning rate', fontsize=12)
plt.show()
```

From the plot, the most suitable learning rate here is 0.3.
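Rather than fixing the number of epochs, one could also stop once the cost improvement falls below a tolerance; `gd_until_converged` below is a hypothetical variant of `batch_gradient_decent`, shown only as a sketch:

```python
def gd_until_converged(theta, X, y, alpha=0.01, tol=1e-9, max_epoch=10_000):
    """Run gradient descent until the per-epoch cost improvement drops below tol."""
    _theta = theta.copy()
    prev_cost = lr_cost(_theta, X, y)
    for i in range(max_epoch):
        _theta = _theta - alpha * gradient(_theta, X, y)
        cost = lr_cost(_theta, X, y)
        if prev_cost - cost < tol:
            return _theta, i + 1
        prev_cost = cost
    return _theta, max_epoch

final, n_epochs = gd_until_converged(theta, X, y, alpha=0.3)
print(n_epochs, final)
```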
# 4. Normal equation

The normal equation finds the parameters that minimize the cost function by solving $\frac{\partial}{\partial\theta_j}J(\theta) = 0$ for every $j$. If the training feature matrix is $X$ (including $x_0 = 1$) and the target vector is $y$, the normal equation gives

$$\theta = (X^TX)^{-1}X^Ty$$

where the superscript $T$ denotes the transpose and the superscript $-1$ the matrix inverse. Writing $A = X^TX$, this is $\theta = A^{-1}X^Ty$.

Gradient descent vs. the normal equation:

Gradient descent requires choosing a learning rate $\alpha$ and many iterations, but it scales well when the number of features $n$ is large and applies to many types of model.

The normal equation needs no learning rate and no iteration, but it must compute $(X^TX)^{-1}$; inverting that matrix costs roughly $O(n^3)$, so it becomes expensive for large $n$ (usually still acceptable for $n$ below about 10,000), and it only applies to linear models, not to logistic regression or other models.

```python
# normal equation
def normalEqn(X, y):
    theta = np.linalg.inv(X.T @ X) @ X.T @ y  # X.T @ X is equivalent to X.T.dot(X)
    return theta
```

```python
final_theta2 = normalEqn(X, y)  # noticeably different from the theta found by batch gradient descent
final_theta2
```

```python
f = final_theta2[0] + final_theta2[1] * X[:, 1] + final_theta2[2] * X[:, 2]

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.plot(X[:, 1], X[:, 2], f, 'r', label='Prediction')
ax.scatter(X[:, 1], X[:, 2], y, label='Training Data')
ax.legend(loc=2)
ax.set_xlabel('square')
ax.set_ylabel('bedrooms')
ax.set_zlabel('price')
ax.set_title('square & bedrooms vs. price')
# ax.view_init(30, 10)
plt.show()
```
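A side note not in the original notebook: in practice it is more numerically stable to solve the least-squares system directly than to form and invert $X^TX$; numpy's `lstsq` does exactly that:

```python
# Solves min ||X @ theta - y||^2 without explicitly inverting X.T @ X
theta_lstsq, *_ = np.linalg.lstsq(X, y, rcond=None)
print(theta_lstsq)  # should closely match normalEqn(X, y)
```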

1.linear_regression/ex1data1.txt

+97
6.1101,17.592
5.5277,9.1302
8.5186,13.662
7.0032,11.854
5.8598,6.8233
8.3829,11.886
7.4764,4.3483
8.5781,12
6.4862,6.5987
5.0546,3.8166
5.7107,3.2522
14.164,15.505
5.734,3.1551
8.4084,7.2258
5.6407,0.71618
5.3794,3.5129
6.3654,5.3048
5.1301,0.56077
6.4296,3.6518
7.0708,5.3893
6.1891,3.1386
20.27,21.767
5.4901,4.263
6.3261,5.1875
5.5649,3.0825
18.945,22.638
12.828,13.501
10.957,7.0467
13.176,14.692
22.203,24.147
5.2524,-1.22
6.5894,5.9966
9.2482,12.134
5.8918,1.8495
8.2111,6.5426
7.9334,4.5623
8.0959,4.1164
5.6063,3.3928
12.836,10.117
6.3534,5.4974
5.4069,0.55657
6.8825,3.9115
11.708,5.3854
5.7737,2.4406
7.8247,6.7318
7.0931,1.0463
5.0702,5.1337
5.8014,1.844
11.7,8.0043
5.5416,1.0179
7.5402,6.7504
5.3077,1.8396
7.4239,4.2885
7.6031,4.9981
6.3328,1.4233
6.3589,-1.4211
6.2742,2.4756
5.6397,4.6042
9.3102,3.9624
9.4536,5.4141
8.8254,5.1694
5.1793,-0.74279
21.279,17.929
14.908,12.054
18.959,17.054
7.2182,4.8852
8.2951,5.7442
10.236,7.7754
5.4994,1.0173
20.341,20.992
10.136,6.6799
7.3345,4.0259
6.0062,1.2784
7.2259,3.3411
5.0269,-2.6807
6.5479,0.29678
7.5386,3.8845
5.0365,5.7014
10.274,6.7526
5.1077,2.0576
5.7292,0.47953
5.1884,0.20421
6.3557,0.67861
9.7687,7.5435
6.5159,5.3436
8.5172,4.2415
9.1802,6.7981
6.002,0.92695
5.5204,0.152
5.0594,2.8214
5.7077,1.8451
7.6366,4.2959
5.8707,7.2029
5.3054,1.9869
8.2934,0.14454
13.394,9.0551
5.4369,0.61705

1.linear_regression/ex1data2.txt

+47
2104,3,399900
1600,3,329900
2400,3,369000
1416,2,232000
3000,4,539900
1985,4,299900
1534,3,314900
1427,3,198999
1380,3,212000
1494,3,242500
1940,4,239999
2000,3,347000
1890,3,329999
4478,5,699900
1268,3,259900
2300,4,449900
1320,2,299900
1236,3,199900
2609,4,499998
3031,4,599000
1767,3,252900
1888,2,255000
1604,3,242900
1962,4,259900
3890,3,573900
1100,3,249900
1458,3,464500
2526,3,469000
2200,3,475000
2637,3,299900
1839,2,349900
1000,1,169900
2040,4,314900
3137,3,579900
1811,4,285900
1437,3,249900
1239,3,229900
2132,4,345000
4215,4,549000
2162,4,287000
1664,2,368500
2238,3,329900
2567,4,314000
1200,3,299000
852,2,179900
1852,4,299900
1203,3,239500

2.logistic_regression/ex2.logistic_regression.ipynb

+1
Large diffs are not rendered by default.
