import pandas as pd
import warnings 
# 用来忽略seaborn绘图库产生的warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="white", color_codes=True)
%pylab inline

Populating the interactive namespace from numpy and matplotlib

# Load the diabetes dataset that ships with scikit-learn.
from sklearn import datasets  
diabetes = datasets.load_diabetes()

# List the attributes exposed by the loaded dataset object.
dir(diabetes)

['DESCR', 'data', 'feature_names', 'target']

diabetes.feature_names
# Ten features: age, sex, bmi (body mass index), bp (average blood pressure)
# and s1-s6, which the scikit-learn dataset description lists as six blood
# serum measurements.
# The target is a quantitative measure of disease progression one year after
# baseline.

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

# Combine the feature matrix and the target into a single DataFrame.
df = pd.DataFrame(data=diabetes.data, columns=diabetes.feature_names).assign(
    target=pd.Series(diabetes.target)
)
df.head()

# Scatter every feature against the target, one panel per feature.
# `height` replaces the former `size` keyword, which seaborn renamed in v0.9.
sns.pairplot(df, x_vars=diabetes.feature_names, y_vars='target', height=5, aspect=0.8)

<seaborn.axisgrid.PairGrid at 0x11bdbc198>

# 从散点图上看，sex、s1、s2、s4 与 target 之间没有明显的线性关系

# With kind='reg' seaborn adds a best-fit line and a 95% confidence band to
# each panel.
# `height` replaces the former `size` keyword, which seaborn renamed in v0.9.
sns.pairplot(df, x_vars=['age', 'bmi', 'bp', 's3', 's5', 's6'], y_vars='target',
             height=5, aspect=0.8, kind='reg')

<seaborn.axisgrid.PairGrid at 0x11f389550>

import pandas as pd
import warnings 
# 用来忽略seaborn绘图库产生的warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="white", color_codes=True)
%pylab inline

Populating the interactive namespace from numpy and matplotlib

# Load the diabetes data and combine features and target into one DataFrame.
from sklearn import datasets
diabetes = datasets.load_diabetes()
df = pd.DataFrame(data=diabetes.data, columns=diabetes.feature_names).assign(
    target=pd.Series(diabetes.target)
)
df.head()

# Split the data into training and test sets.
# train_test_split is imported from sklearn.model_selection: the older
# sklearn.cross_validation module was deprecated in 0.18 and removed in 0.20.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    df[diabetes.feature_names], df['target'], random_state=1)
len(x_train), len(x_test), len(y_train), len(y_test)

(331, 111, 331, 111)

from sklearn import linear_model
linear = linear_model.LinearRegression()


# Train the model on the training split.
linear.fit(x_train, y_train)

# Fitted parameters.
print('Coefficients :\n' , linear.coef_)
# The label used to read 'Intercept: n': the backslash of '\n' was missing.
print('Intercept: \n', linear.intercept_)

Coefficients :
 [  -7.85951708 -245.05253542  575.11667591  323.85372717 -519.77447335
  250.61132753    0.96367294  180.50891964  614.75959394   52.10619986]
Intercept: n 150.997693786
Residual sum of square: 2903.10
variance score: 0.44

# Model evaluation.
# Linear regression is usually judged by its mean squared error (MSE) or its
# root mean squared error (RMSE) on the test set.

y_pred = linear.predict(x_test)

# MSE computed by hand, followed by the model's own score on the test split.
# The first label used to say "Residual sum of square", but the expression
# takes the mean of the squared residuals, i.e. it is the MSE.
print("MSE: %.2f" % np.mean((y_pred - y_test) ** 2))
print("variance score: %.2f" % linear.score(x_test, y_test))

from sklearn import metrics
# MSE via scikit-learn, computed once and reused for the RMSE below.
mse = metrics.mean_squared_error(y_test, y_pred)
print("MSE:", mse)
# RMSE is the square root of the MSE.
print("RMSE:", np.sqrt(mse))

Residual sum of square: 2903.10
variance score: 0.44
MSE: 2903.10000132
RMSE: 53.8804231732

# Plotting: scatter the test targets against the 'age' feature.

plt.scatter(x_test['age'], y_test, color = 'black')  
# Prediction line (disabled).
# NOTE(review): the disabled call passes y_test to linear.predict(), whereas
# the model above was fitted on the feature frame (x_train) -- confirm the
# intended argument before re-enabling.
#plt.plot(x_test['age'], linear.predict(y_test), color='blue', linewidth = 3)  
#plt.show()

<matplotlib.collections.PathCollection at 0x119944f60>

# Cross-validation: every sample is predicted by a model fitted on the other folds.
from sklearn.model_selection import cross_val_predict
predicted = cross_val_predict(
    linear,
    df[diabetes.feature_names],
    df['target'],
    cv=10,
)

# MSE via scikit-learn, over all cross-validated predictions.
cv_mse = metrics.mean_squared_error(df['target'], predicted)
print("MSE:", cv_mse)
# RMSE is the square root of that MSE.
print("RMSE:", np.sqrt(cv_mse))

MSE: 2999.03228568
RMSE: 54.7634210553

# 可以看出，交叉验证得到的 MSE（2999.03）比前面留出法得到的 MSE（2903.10）略大。
# 主要原因是：交叉验证中每个样本都会在某一折里作为测试样本被预测，MSE 是在全部样本上计算的；
# 而前面只在随机划分出的 25% 测试集上计算了 MSE。两者的评估条件不同，数值不能直接比较。

import pandas as pd
import warnings 
# 用来忽略seaborn绘图库产生的warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="white", color_codes=True)
%pylab inline

Populating the interactive namespace from numpy and matplotlib

# Load the diabetes data and combine features and target into one DataFrame.
from sklearn import datasets
diabetes = datasets.load_diabetes()
df = pd.DataFrame(data=diabetes.data, columns=diabetes.feature_names).assign(
    target=pd.Series(diabetes.target)
)
df.head()

# train_test_split is imported from sklearn.model_selection: the older
# sklearn.cross_validation module was deprecated in 0.18 and removed in 0.20.
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics

# Regression on all features.
x_train, x_test, y_train, y_test = train_test_split(
    df[diabetes.feature_names], df['target'], random_state=1)
linear = linear_model.LinearRegression()
linear.fit(x_train, y_train)

# Fitted parameters.
print('Coefficients :\n' , linear.coef_)
# The label used to read 'Intercept: n': the backslash of '\n' was missing.
print('Intercept: \n', linear.intercept_)

# Evaluation on the test split.
y_pred = linear.predict(x_test)

print("得分: %.2f" % linear.score(x_test, y_test))
print("MAE:", metrics.mean_absolute_error(y_test, y_pred))
# The MSE is computed once and reused for the RMSE.
mse = metrics.mean_squared_error(y_test, y_pred)
print("MSE:", mse)
print("RMSE:", np.sqrt(mse))

Coefficients :
 [  -7.85951708 -245.05253542  575.11667591  323.85372717 -519.77447335
  250.61132753    0.96367294  180.50891964  614.75959394   52.10619986]
Intercept: n 150.997693786
得分: 0.44
MAE: 41.982920292
MSE: 2903.10000132
RMSE: 53.8804231732

# Regression on a hand-picked subset of the features.
x_train, x_test, y_train, y_test = train_test_split(
    df[['age', 'bmi', 'bp', 's3', 's5', 's6']], df['target'], random_state=1)
linear = linear_model.LinearRegression()
linear.fit(x_train, y_train)

# Fitted parameters.
print('Coefficients :\n' , linear.coef_)
# The label used to read 'Intercept: n': the backslash of '\n' was missing.
print('Intercept: \n', linear.intercept_)

# Evaluation on the test split.
y_pred = linear.predict(x_test)

print("得分: %.2f" % linear.score(x_test, y_test))
print("MAE:", metrics.mean_absolute_error(y_test, y_pred))
# The MSE is computed once and reused for the RMSE.
mse = metrics.mean_squared_error(y_test, y_pred)
print("MSE:", mse)
print("RMSE:", np.sqrt(mse))

Coefficients :
 [ -46.89764846  615.70776642  264.87024712 -197.76826798  452.41426038
   22.20090217]
Intercept: n 150.95907108
得分: 0.41
MAE: 43.6662007739
MSE: 3093.0656796
RMSE: 55.61533673

# 可见，人工筛选后的结果更差，说明特征的筛选不合适

import pandas as pd
import warnings 
# 用来忽略seaborn绘图库产生的warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="white", color_codes=True)
%pylab inline

Populating the interactive namespace from numpy and matplotlib

# Generate synthetic data: y = x**5 plus normally distributed noise.
import scipy as sp
from scipy.stats import norm

x = np.arange(0, 1, 0.002)
# Draw exactly one noise value per x instead of hard-coding the count, so the
# two arrays cannot fall out of step if the range or step above is changed.
y = norm.rvs(0, size=x.size, scale=0.1)
y = y + x**5
df = pd.DataFrame({'x': x, 'y': y})
df.head()
# Joint scatter plot with a fitted regression line.
# `height` replaces the former `size` keyword, which seaborn renamed in v0.9.
sns.jointplot(x="x", y="y", data=df, kind='reg', height=9)

<seaborn.axisgrid.JointGrid at 0x103f20b38>

# 评价函数
def rmse(y_test, y):
    """Return the root mean squared error between predictions and observations.

    Args:
        y_test: Predicted values (array-like).
        y: Observed values (array-like), compared element-wise with ``y_test``.

    Returns:
        The square root of the mean of the squared element-wise differences.
    """
    # NumPy is called directly: the top-level SciPy aliases (sp.sqrt, sp.mean)
    # have been deprecated since SciPy 1.4, with removal announced for 2.0.
    # Converting to float arrays also lets plain Python lists be passed in.
    residuals = np.asarray(y_test, dtype=float) - np.asarray(y, dtype=float)
    return np.sqrt(np.mean(residuals ** 2))
  
def R2(y_test, y_true):
    """Return the coefficient of determination of predictions against truth.

    Follows the definition in the scikit-learn documentation:
    1 - (residual sum of squares) / (total sum of squares about the mean).
    A value of 1 is a perfect prediction and 0 is no better than always
    predicting the mean of ``y_true``; predictions worse than that baseline
    give a negative value.

    Args:
        y_test: Predicted values (array-like).
        y_true: Observed values (array-like).

    Returns:
        The R^2 score as a float.
    """
    # Float arrays let plain Python lists be passed in as well as ndarrays.
    y_pred = np.asarray(y_test, dtype=float)
    y_obs = np.asarray(y_true, dtype=float)
    ss_res = ((y_pred - y_obs) ** 2).sum()
    ss_tot = ((y_obs - y_obs.mean()) ** 2).sum()
    return 1 - ss_res / ss_tot
  
  
def R22(y_test, y_true):
    """Return 1 - RMSE(predictions) / RMSE(mean baseline).

    This is the R^2 variant attributed to Conway & White ("Machine Learning
    for Hackers"): the model's RMSE is compared with the RMSE obtained by
    always predicting the mean of ``y_true``.

    Args:
        y_test: Predicted values (array-like).
        y_true: Observed values (array-like).

    Returns:
        1 for a perfect prediction, 0 when the model is no better than the
        mean baseline.
    """
    # Work in float throughout. The baseline used to be built by writing the
    # mean into a copy of y_true, which kept an integer dtype and therefore
    # truncated the mean whenever y_true held integers.
    y_pred = np.asarray(y_test, dtype=float)
    y_obs = np.asarray(y_true, dtype=float)
    # Both RMSEs average over the same samples, so their ratio reduces to the
    # square root of the ratio of the two sums of squares.
    ss_res = ((y_pred - y_obs) ** 2).sum()
    ss_tot = ((y_obs - y_obs.mean()) ** 2).sum()
    return 1 - np.sqrt(ss_res / ss_tot)

plt.figure(figsize=(12,9))
# Polynomial degrees to fit.
degree = [1,2,5,10,20,50,100]
plt.scatter(x, y, s=5)

# Plain linear regression estimator.
# NOTE(review): `regressor` is not referenced again in this cell.
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()

# Polynomial regression: expand x into polynomial terms, then fit a linear
# model on the expanded features.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model

# The estimators are fed a 2-D column of x; build it once instead of
# re-slicing it three times per degree.
x_column = x[:, np.newaxis]
for d in degree:
    # PolynomialFeatures emits a constant (bias) column by default, so the
    # linear step is told not to fit a separate intercept.
    clf = Pipeline([('poly', PolynomialFeatures(degree=d)),
                    ('linear', LinearRegression(fit_intercept=False))])
    clf.fit(x_column, y)
    # Predictions on the same x the model was fitted on (no held-out data).
    y_test = clf.predict(x_column)

    # Report the fit quality for this degree.
    print('level=%d,rmse=%.2f, R2=%.2f, R22=%.2f, clf.score=%.2f' %
       (d,
        rmse(y_test, y),
        R2(y_test, y),
        R22(y_test, y),
        clf.score(x_column, y)))

    # Label each curve with its degree so the legend entry is bound to the
    # line itself rather than to the order in which artists were drawn.
    plt.plot(x, y_test, linewidth=2, label=str(d))

plt.grid()
plt.legend(loc='upper left')

level=1,rmse=0.18, R2=0.57, R22=0.34, clf.score=0.57
level=2,rmse=0.11, R2=0.83, R22=0.58, clf.score=0.83
level=5,rmse=0.10, R2=0.86, R22=0.62, clf.score=0.86
level=10,rmse=0.10, R2=0.86, R22=0.62, clf.score=0.86
level=20,rmse=0.10, R2=0.86, R22=0.63, clf.score=0.86
level=50,rmse=0.10, R2=0.87, R22=0.63, clf.score=0.87
level=100,rmse=0.10, R2=0.87, R22=0.63, clf.score=0.87

<matplotlib.legend.Legend at 0x10f46f908>

# 结果评价
# 注意，次数过高，容易发生过拟合

	age	sex	bmi	bp	s1	s2	s3	s4	s5	s6	target
0	0.038076	0.050680	0.061696	0.021872	-0.044223	-0.034821	-0.043401	-0.002592	0.019908	-0.017646	151.0
1	-0.001882	-0.044642	-0.051474	-0.026328	-0.008449	-0.019163	0.074412	-0.039493	-0.068330	-0.092204	75.0
2	0.085299	0.050680	0.044451	-0.005671	-0.045599	-0.034194	-0.032356	-0.002592	0.002864	-0.025930	141.0
3	-0.089063	-0.044642	-0.011595	-0.036656	0.012191	0.024991	-0.036038	0.034309	0.022692	-0.009362	206.0
4	0.005383	-0.044642	-0.036385	0.021872	0.003935	0.015596	0.008142	-0.002592	-0.031991	-0.046641	135.0

	age	sex	bmi	bp	s1	s2	s3	s4	s5	s6	target
0	0.038076	0.050680	0.061696	0.021872	-0.044223	-0.034821	-0.043401	-0.002592	0.019908	-0.017646	151.0
1	-0.001882	-0.044642	-0.051474	-0.026328	-0.008449	-0.019163	0.074412	-0.039493	-0.068330	-0.092204	75.0
2	0.085299	0.050680	0.044451	-0.005671	-0.045599	-0.034194	-0.032356	-0.002592	0.002864	-0.025930	141.0
3	-0.089063	-0.044642	-0.011595	-0.036656	0.012191	0.024991	-0.036038	0.034309	0.022692	-0.009362	206.0
4	0.005383	-0.044642	-0.036385	0.021872	0.003935	0.015596	0.008142	-0.002592	-0.031991	-0.046641	135.0

	age	sex	bmi	bp	s1	s2	s3	s4	s5	s6	target
0	0.038076	0.050680	0.061696	0.021872	-0.044223	-0.034821	-0.043401	-0.002592	0.019908	-0.017646	151.0
1	-0.001882	-0.044642	-0.051474	-0.026328	-0.008449	-0.019163	0.074412	-0.039493	-0.068330	-0.092204	75.0
2	0.085299	0.050680	0.044451	-0.005671	-0.045599	-0.034194	-0.032356	-0.002592	0.002864	-0.025930	141.0
3	-0.089063	-0.044642	-0.011595	-0.036656	0.012191	0.024991	-0.036038	0.034309	0.022692	-0.009362	206.0
4	0.005383	-0.044642	-0.036385	0.021872	0.003935	0.015596	0.008142	-0.002592	-0.031991	-0.046641	135.0

心内求法

线性回归

回顾线性回归

机器学习角度的线性回归

scikit-learn糖尿病数据集

用 scikit-learn处理线性回归问题

模型评价和特征选择

多项式回归

参考资料