#CV random forest vs gbm.R
#(forked from campbwa/R-videos)
#If you don't cross validate, you are not obtaining a realistic measure
#of your model's predictive accuracy!
#Anyone can make a model that fits the data perfectly but has terrible
#out-of-sample accuracy (overfitting).
#At least two measures of classification performance can be used:
#1. area under the ROC curve (AUC)
#2. misclassification rate
#A reliable and valid AUC estimate can be interpreted as the
#probability that the classifier will assign
#a higher score to a randomly chosen positive
#example than to a randomly chosen negative example.
#load the package to determine AUC:
#library() stops with an error when a package is missing, whereas require()
#only returns FALSE and lets the script fail later at the first missing function
library(verification)
?roc.area #help page for the AUC function used further down
#packages for the models that I'm going to use
library(randomForest)
library(gbm)
#NOTE: this path is specific to the original author's machine;
#point it at the folder that contains train.csv before running
setwd("~/Old Kaggle comps/amazon")
train <- read.csv("train.csv")
head(train)
#for unbalanced datasets, the "most useful" measure is AUC
mean(train$ACTION) #94% of ACTION is equal to 1
#################### partition the data #####################
#plyr ships a helper for this, but writing your own is straightforward
#k-fold CV splits the data into k different partitions
#the rows are assumed to already be in a random order
k <- 10
n <- nrow(train) %/% k #rows per fold; integer division rounds down,
#so k folds of n rows never run past the end of the data
err.vect <- rep(NA, times = k) #one slot per fold for the error measure
#worked example: build the first fold
i <- 1
first_row <- (i - 1) * n + 1 #where the held-out rows start
last_row <- i * n #where the held-out rows end
held_out <- first_row:last_row #row indices of the held-out fold
#rounding down means up to k - 1 trailing rows are never held out
cv.test <- train[held_out, ] #evaluate the model on these rows
cv.train <- train[-held_out, ] #fit the model on everything else
#"standard" (single split) validation would stop here: fit on cv.train,
#score on cv.test
#k-fold CV instead rotates the held-out fold, so all of the data can go into
#the final model while the performance estimates stay realistic
#the second fold would start with:
i <- 2
#...
##############################################################
########################### CV for random forest ############################
#repeat the fold construction once per fold
for (i in 1:k) {
  #rows (i - 1) * n + 1 through i * n make up the ith fold
  held_out <- ((i - 1) * n + 1):(i * n)
  cv.test <- train[held_out, ] #score the model on this fold
  cv.train <- train[-held_out, ] #fit the model on the remaining rows

  #fit the random forest: column 1 is the response (as a factor),
  #every other column is a predictor
  fit <- randomForest(
    x = cv.train[, -1],
    y = as.factor(cv.train[, 1])
  )

  #class probabilities for the held-out rows; keep the second column
  #(presumably the probability of ACTION == 1 - confirm the factor level order)
  class_probs <- predict(fit, newdata = cv.test[, -1], type = "prob")
  prediction <- class_probs[, 2]

  #AUC of the ith fold
  err.vect[i] <- roc.area(cv.test[, 1], prediction)[["A"]]
  print(paste("AUC for fold", i, ":", err.vect[i]))
}
print(paste("Average AUC:", mean(err.vect)))
#the AUC differs from fold to fold,
#which is exactly why the estimate is averaged over k folds!
##############################################################################
########################### CV for gbm ############################
ntrees <- 1000 #the default is only 100
for (i in 1:k) {
  #rows (i - 1) * n + 1 through i * n make up the ith fold
  held_out <- ((i - 1) * n + 1):(i * n)
  cv.test <- train[held_out, ] #score the model on this fold
  cv.train <- train[-held_out, ] #fit the model on the remaining rows

  #fit the gbm on the training rows
  #bernoulli (or adaboost) is the distribution to use for classification problems
  fit <- gbm.fit(
    x = cv.train[, -1],
    y = cv.train[, 1],
    distribution = "bernoulli",
    n.trees = ntrees,
    shrinkage = 0.005,
    interaction.depth = 20,
    n.minobsinnode = 5,
    verbose = FALSE
  )

  #score the held-out rows using all ntrees trees
  #NOTE(review): no type argument is passed, so the scores are on predict's
  #default scale - confirm that is what roc.area should receive
  prediction <- predict(fit, newdata = cv.test[, -1], n.trees = ntrees)

  #AUC of the ith fold
  err.vect[i] <- roc.area(cv.test[, 1], prediction)[["A"]]
  print(paste("AUC for fold", i, ":", err.vect[i]))
}
print(paste("Average AUC:", mean(err.vect)))
#conclusion: a random forest is better for this data set!
#(and for these parameters)
#compared with the random forest loop, only the model fit and
#prediction lines had to change!
##################################################################
################ What's next? ###################
#you can use this method to optimize the parameters of each model
#GBM has several parameters that must be specified
#compare more types of models
#a better measure of your model's out of sample accuracy
#################################################