# coding: utf-8
# # 6. Regression (Linear and Nonlinear)
#
#
# In[2]:
import numpy as np
from matplotlib import pyplot as plt
get_ipython().magic('matplotlib inline')
# ## Linear regression with scipy.stats.linregress
# Check the documentation
# http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.linregress.html
#
# `scipy.stats.linregress(x, y=None)`
#
# Calculate a linear least-squares regression for two sets of measurements.
#
# Parameters:
# * x, y : array_like
# Two sets of measurements. Both arrays should have the same length. If only x is given (and y=None), then it must be a two-dimensional array where one dimension has length 2. The two sets of measurements are then found by splitting the array along the length-2 dimension.
#
# Returns:
#
# * slope : float
# slope of the regression line
# * intercept : float
# intercept of the regression line
# * rvalue : float
# correlation coefficient
# * pvalue : float
# two-sided p-value for a hypothesis test whose null hypothesis is that the slope is zero.
# * stderr : float
# Standard error of the estimated slope
# In[4]:
import scipy.stats
x_data = np.array([0.5, 1.0, 2.0, 3.0, 5.0, 7.5])
y_data = np.array([1.2, 2.8, 5.2, 7.2, 10.6, 20.0])
slope, intercept, rvalue, pvalue, stderr = scipy.stats.linregress(x_data, y_data)
print(slope, intercept, rvalue, pvalue, stderr)
print("Slope: {}".format(slope))
print("Intercept: {}".format(intercept))
print("Coefficient of determination (r squared): {}".format(rvalue*rvalue))
print("p-value (probability that the slope is zero): {}".format(pvalue))
print("Standard error in slope: {}".format(stderr))
plt.plot(x_data, y_data, 'o')
plt.plot(x_data, x_data*slope+intercept)
plt.show()
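# As noted in the documentation above, `linregress` also accepts a single
# two-dimensional array in place of separate x and y arrays. A quick sketch,
# reusing the same x_data and y_data (so it should reproduce the fit above):
xy = np.vstack((x_data, y_data))  # shape (2, N): one row of x values, one of y
print(scipy.stats.linregress(xy))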
# Notice that the documentation has a "see also" section that points to optimize.curve_fit. Let's check that out next
# ## Nonlinear regression with scipy.optimize.curve_fit
#
# `scipy.optimize.curve_fit(f, xdata, ydata, p0=None, sigma=None, absolute_sigma=False, check_finite=True, bounds=(-inf, inf), method=None, **kwargs)`
#
# This is quite a powerful function, giving access to several methods, bounds on the parameters, rescaling options, etc., so it's worth reading the documentation:
# http://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.curve_fit.html
#
# However, for a simple case it's quite easy to use.
# We define a model function `f(x, *params)` such that `ydata = f(xdata, *params) + eps`, and `curve_fit` will optimize the parameters `params` to minimize the sum of squared errors `eps`, i.e. make the function `f` fit the data (`xdata, ydata`).
#
# For the simplest example, we can use it to fit a linear function like we did above. (Though if we *did* just want a linear fit, we'd probably use `linregress` as above.)
# In[5]:
import scipy.optimize
def linear_function(x, slope, intercept):
return x*slope + intercept
x_data = np.array([0.5, 1.0, 2.0, 3.0, 5.0, 7.5])
y_data = np.array([1.2, 2.8, 5.2, 7.2, 10.6, 20.0])
optimal_parameters, covariance = scipy.optimize.curve_fit(linear_function, x_data, y_data)
# That's it! In this example we didn't even need to give it a starting guess!
print("Slope: {}".format(optimal_parameters[0]))
print("Intercept: {}".format(optimal_parameters[1]))
# Use the covariance matrix to find the standard errors.
# The diagonals give the variance, so...
parameter_errors = np.sqrt(np.diag(covariance))
print("Slope: {} +/- {} (1 st. dev.)".format(optimal_parameters[0],parameter_errors[0]))
print("Intercept: {} +/- {} (1 st. dev.)".format(optimal_parameters[1],parameter_errors[1]))
plt.plot(x_data, y_data, 'o')
plt.plot(x_data, linear_function(x_data, *optimal_parameters))
plt.show()
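# The signature above also shows a `p0` argument for supplying a starting guess,
# which matters for harder problems with multiple local minima. A minimal sketch,
# with an arbitrary guess; the result should match the fit we just did:
guess = [1.0, 0.0]  # initial guess for [slope, intercept]
optimal_parameters, covariance = scipy.optimize.curve_fit(linear_function,
                                                          x_data,
                                                          y_data,
                                                          p0=guess)
print("Slope: {}, Intercept: {}".format(*optimal_parameters))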
# Now we will try a more complicated function with three parameters, $a, b, c$:
#
# $$y = f(x) = \frac{a x^2}{(1 + b x + c x^2)}$$
#
# I generated some pretend data by using the function with $(a,b,c) = (20, 0.5, 20)$ then adding some random noise and tweaking one of the points slightly. We'll see if we can retrieve the parameters by fitting the function to my pretend data.
#
# In[7]:
def complicated_function(x, a, b, c):
"A more complicated function with 3 parameters"
return a*x*x / (1 + b*x + c*x*x)
# These data were generated by adding random noise to the original function like this:
#x_data = np.linspace(0,1,11)
#y_data = complicated_function(x_data,20,0.5,20)*(1+np.random.normal(scale=0.05,size=x_data.size))+np.random.normal(scale=0.05,size=x_data.size)
# (then manually tweaked a bit)
x_data = np.array([ 0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])
y_data = np.array([ 0.01717205, 0.1005844 , 0.43536816, 0.59862244, 0.76359374,
0.77385322, 0.89904593, 0.86883376, 0.87039673, .95,
0.95870025])
# Fit the parameters to the imperfect x_data and y_data
optimal_parameters, covariance = scipy.optimize.curve_fit(complicated_function,
x_data,
y_data)
def report(optimal_parameters, covariance):
"Make this a function so we can reuse it in cells below"
parameter_errors = np.sqrt(np.diag(covariance))
for i in range(len(optimal_parameters)):
print("Parameter {}: {} +/- {} (1 st. dev.)".format(i,
optimal_parameters[i],
parameter_errors[i]))
# Plot the data
plt.plot(x_data, y_data, 'o', label='data')
# Make a new x array with 50 points for smoother lines
x_many_points = np.linspace(x_data.min(),x_data.max(),50)
# Plot the fitted curve
plt.plot(x_many_points, complicated_function(x_many_points, *optimal_parameters), label='fitted')
# Plot the original curve used to generate the data in the first place
plt.plot(x_many_points, complicated_function(x_many_points,20,0.5,20), ':',label='original')
# Add the legend, in the "best" location to avoid hiding the data
plt.legend(loc='best')
plt.show()
report(optimal_parameters, covariance)
# If we want to enforce all the parameters are positive, we can supply a `bounds` parameter to the `scipy.optimize.curve_fit` function. We use `numpy.inf` for $\infty$.
# In[9]:
# Set the bounds on all the parameters to be 0 and infinity.
optimal_parameters, covariance = scipy.optimize.curve_fit(complicated_function,
x_data,
y_data,
bounds = (0, np.inf)) # BOUNDS!
report(optimal_parameters, covariance)
# We notice the uncertainty on the $b$ parameter is bigger than the parameter itself, i.e. it's possible, given the data, that $b$ is zero. This suggests we can simplify our model and still fit the given data.
#
# One option would just be to enforce the $b$ parameter to be zero (or very close to zero) by specifying different bounds for each parameter:
# In[10]:
# Set the lower and upper bounds on each parameter separately
bounds = ([0,0,0], [np.inf, 1e-6, np.inf])
optimal_parameters, covariance = scipy.optimize.curve_fit(complicated_function,
x_data,
y_data,
bounds = bounds)
report(optimal_parameters, covariance)
# But notice it assigns a big uncertainty to $b$ even though we know (or are assuming) that it is zero. This then changes the uncertainty in $a$ and $c$. It is better to define a new function with only 2 parameters and fit that:
# In[11]:
def simplified_function(x, a, c):
"A slightly simplified function with only 2 parameters: a and c"
return a*x*x / (1 + c*x*x)
optimal_parameters, covariance = scipy.optimize.curve_fit(simplified_function,
x_data,
y_data)
# We could reuse the 'report' function here (it loops over however many parameters there are), but repeat the loop for clarity
parameter_errors = np.sqrt(np.diag(covariance))
for i in range(len(optimal_parameters)):
print("Parameter {}: {} +/- {} (1 st. dev.)".format(i,
optimal_parameters[i],
parameter_errors[i]))
plt.plot(x_data, y_data, 'o', label='data')
x_many_points = np.linspace(x_data.min(),x_data.max(),50)
plt.plot(x_many_points, simplified_function(x_many_points, *optimal_parameters), label='fitted')
plt.plot(x_many_points, complicated_function(x_many_points,20,0.5,20), ':',label='original')
plt.legend(loc='best')
plt.show()
# The current data are not sufficient to tell the two-parameter and three-parameter models apart.
#
# Note that the function $f(x)$ could be a function of several variables, i.e. $x$ could be a vector, and `x_data` would be a 2-dimensional array.
# In[ ]:
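# As a sketch of that multi-variable case (the function and numbers here are made
# up purely for illustration): `curve_fit` is happy as long as the model's first
# argument matches whatever we pass as `xdata`, e.g. a (2, N) array.
def plane_function(X, a, b, c):
    "y = a*x1 + b*x2 + c, where X is a (2, N) array holding x1 and x2"
    x1, x2 = X
    return a*x1 + b*x2 + c
x1, x2 = np.meshgrid(np.linspace(0, 1, 5), np.linspace(0, 2, 4))
X = np.vstack((x1.ravel(), x2.ravel()))        # shape (2, 20)
y_plane = plane_function(X, 3.0, -1.5, 0.5)    # pretend noise-free measurements
optimal_parameters, covariance = scipy.optimize.curve_fit(plane_function, X, y_plane)
print(optimal_parameters)  # should recover approximately [3.0, -1.5, 0.5]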