import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the data: comma-separated, no header row
path = 'ex1data1.txt'
data = pd.read_csv(path, header=None, names=['Population', 'Profit'])
print(data.head())
print(data.describe())

# Quick scatter plot of the raw data
data.plot(kind='scatter', x='Population', y='Profit', figsize=(12, 8))
plt.show()
# Cost function: squared error with the conventional 1/2 factor
def computeCost(X, y, theta):
    inner = np.power(((X * theta.T) - y), 2)
    return np.sum(inner) / (2 * len(X))
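For reference, computeCost implements the standard squared-error cost for linear regression, where m = len(X) is the number of training examples:

J(\theta) = \frac{1}{2m} \sum_{i=1}^{m} \left( \theta_0 + \theta_1 x^{(i)} - y^{(i)} \right)^2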
# Add a column of ones to data so that theta_0 acts as the intercept term
data.insert(0, 'Ones', 1)

# Variable initialization: set X (training data) and y (target variable)
cols = data.shape[1]
X = data.iloc[:, 0:cols-1]     # X is all rows, every column except the last
y = data.iloc[:, cols-1:cols]  # y is all rows, the last column only
print(X.head())
print(y.head())

# Convert to matrices and initialize theta to zeros
X = np.matrix(X.values)
y = np.matrix(y.values)
theta = np.matrix(np.array([0, 0]))

# Check the array dimensions
print(X.shape, theta.shape, y.shape)

# Cost for the initial theta
print(computeCost(X, y, theta))
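Sanity check: with theta initialized to all zeros, every prediction is zero, so this first printed cost reduces to

J(0) = \frac{1}{2m} \sum_{i=1}^{m} \left( y^{(i)} \right)^2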
# Batch gradient descent
def gradientDescent(X, y, theta, alpha, iters):
    temp = np.matrix(np.zeros(theta.shape))
    parameters = int(theta.ravel().shape[1])
    cost = np.zeros(iters)

    for i in range(iters):
        error = (X * theta.T) - y

        # Update all parameters simultaneously via the temp matrix
        for j in range(parameters):
            term = np.multiply(error, X[:, j])
            temp[0, j] = theta[0, j] - ((alpha / len(X)) * np.sum(term))

        theta = temp
        cost[i] = computeCost(X, y, theta)

    return theta, cost
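Each pass of the outer loop applies the usual batch update rule for every parameter j:

\theta_j := \theta_j - \frac{\alpha}{m} \sum_{i=1}^{m} \left( \theta_0 + \theta_1 x^{(i)} - y^{(i)} \right) x_j^{(i)}

The inner loop over j can also be vectorized away. A minimal sketch, assuming the same np.matrix types used above (gradientDescentVectorized is a hypothetical name, not part of the original code):

def gradientDescentVectorized(X, y, theta, alpha, iters):
    # Hypothetical drop-in replacement for gradientDescent above
    cost = np.zeros(iters)
    for i in range(iters):
        error = (X * theta.T) - y                          # (m, 1) residuals
        theta = theta - (alpha / len(X)) * (error.T * X)   # (1, n) parameter update
        cost[i] = computeCost(X, y, theta)
    return theta, cost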
# Initialize the remaining hyperparameters
alpha = 0.01   # learning rate
iters = 1000   # maximum number of iterations

# Run gradient descent and inspect the fitted parameters
theta, cost = gradientDescent(X, y, theta, alpha, iters)
print(theta)

# Finally, evaluate the cost (error) of the trained model using the fitted parameters
print(computeCost(X, y, theta))
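Since gradientDescent also returns the cost at every iteration, one optional check (not in the original snippet) is to plot cost against iteration number; the curve should fall steadily and flatten out, and if it grows instead, alpha is too large:

fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(np.arange(iters), cost, 'r')
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
ax.set_title('Error vs. Training Iterations')
plt.show()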
# Plot the fitted line over the training data
x = np.linspace(data.Population.min(), data.Population.max(), 100)
f = theta[0, 0] + (theta[0, 1] * x)

fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(x, f, 'r', label='Prediction')
ax.scatter(data.Population, data.Profit, label='Training Data')
ax.legend(loc=2)
ax.set_xlabel('Population')
ax.set_ylabel('Profit')
ax.set_title('Predicted Profit vs. Population Size')
plt.show()
This is just a linear regression problem: write your X and y values as two columns in a file named ex1data1.txt, then run the code above.
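Concretely, ex1data1.txt is expected to contain plain comma-separated values with no header line, one training example per row (the numbers below are placeholders, not real data):

6.1,17.5
5.5,9.1
8.5,13.6
...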