-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathEx_linearreg.py
105 lines (63 loc) · 2.53 KB
/
Ex_linearreg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 1 15:08:19 2018
@author: suvasama
"""
#------------------------------------------------------------------------------
# IMPORT LIBRARIES
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression

# `sklearn.cross_validation` was removed in scikit-learn 0.20; the same helpers
# live in `sklearn.model_selection`. The legacy import is kept only as a
# fallback for installations that predate the new module.
try:
    from sklearn.model_selection import train_test_split
    # k-fold cross validation
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import train_test_split
    # k-fold cross validation
    from sklearn.cross_validation import cross_val_score
#------------------------------------------------------------------------------
# IMPORT DATA
# Load the advertising data set; the first CSV column becomes the row index.
# NOTE(review): this file is fetched over the network on every run -- confirm
# the URL still resolves, or point it at a local copy of Advertising.csv.
data = pd.read_csv(
    'http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv',
    index_col=0,
)

# Display the first and last 5 rows, then the shape of the data frame.
print(data.head())
print(data.tail())
print(data.shape)

# Draw one scatter plot per advertising channel against sales, each with a
# fitted regression line (kind='reg').
# `height` is the current name of the per-facet size keyword: seaborn renamed
# `size` to `height` in 0.9 and only accepts the old spelling with a warning.
sns.pairplot(
    data,
    x_vars=['TV', 'radio', 'newspaper'],
    y_vars='sales',
    height=7,
    aspect=0.7,
    kind='reg',
)
plt.show()  # render the figure at this point in the script
#------------------------------------------------------------------------------
# PREPARE THE DATA
# Feature matrix: the three advertising channels. Response: the sales column.
X = data[['TV', 'radio', 'newspaper']]
y = data['sales']

# Sanity-check both objects: leading rows first, then types, then shapes.
print(X.head())
print(y.head())
print(type(X))
print(type(y))
print(X.shape)
print(y.shape)

# Split features and response into training and test parts; the fixed
# random_state keeps the split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
#------------------------------------------------------------------------------
# RUN A LINEAR REGRESSION ON DATA
# Create the estimator, then fit it on the training split only.
linreg = LinearRegression()
linreg.fit(X_train, y_train)

# Show the fitted intercept, followed by every coefficient paired with the
# name of the feature it belongs to.
print(linreg.intercept_)
coef_names = ['TV', 'radio', 'newspaper']
coefs = list(zip(coef_names, linreg.coef_))
print(coefs)
print("")
#------------------------------------------------------------------------------
# FEATURE SELECTION USING CROSS-VALIDATION
# 10-fold cross-validation of the linear model on all three features.
# The 'neg_mean_squared_error' scorer reports the MSE with its sign flipped,
# so the raw scores printed here are negative.
scores = cross_val_score(
    linreg,
    X,
    y,
    cv=10,
    scoring='neg_mean_squared_error',
)
print("scores:")
print(scores)
print("")

# Flip the sign back to obtain ordinary (positive) MSE values.
print("MSE scores:")
mse_scores = -scores
print(mse_scores)
print("")

# Take the square root of every fold's MSE to get its RMSE.
print("RMSE scores:")
rmse_scores = np.sqrt(mse_scores)
print(rmse_scores)
print("")

# Average the per-fold RMSE values into a single figure.
print("Mean of RMSE scores:")
print(rmse_scores.mean())
print("")

# Repeat the 10-fold cross-validation with only two features (newspaper left
# out) and print the mean RMSE for comparison with the figure above.
# Note that X is rebound here, so from this point on it no longer contains
# the newspaper column.
print("Mean of RMSE scores in the model without newspaper:")
feature_cols = ['TV', 'radio']
X = data[feature_cols]
print(
    np.sqrt(
        -cross_val_score(
            linreg,
            X,
            y,
            cv=10,
            scoring='neg_mean_squared_error',
        )
    ).mean()
)
print("")