-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstacked_gen.py
executable file
·416 lines (345 loc) · 18.6 KB
/
stacked_gen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
# -------------------------------------------------------------------------------
# Name: Stacked Generalization
# Purpose: This program implements Stacked Generalization (explained in
# Wolpert, D.H., 1992). Instead of using a specified classifier
# for 'Level Zero' and 'Level One' generalization, though, we use
# Bucket of Models to get the optimal classifier.
#
# This program also implements Cross Validation to test the
# performance of the classifier system.
#
# Author: Garrett Reynolds
#
# -------------------------------------------------------------------------------
from __future__ import print_function
from sklearn import svm
from sklearn.cross_validation import (StratifiedKFold, cross_val_score,
LeaveOneOut)
from math import log10
import numpy as np
# Import two functions from the file extra_clf_tools.py
from extra_clf_tools import find_best_clf, equalize_training_set
def test_stack_gen_perf(data_sets, target, cv_eval=None,
                        clf_list_L0=None, par_n_folds_L0=4,
                        iterations_L0=1,
                        clf_list_L1=None,
                        par_n_folds_L1=20, iterations_L1=4,
                        equalize_training=False, verbose=False):
    """Uses cross-validation to test performance of stacked generalization.

    For every (train, test) split produced by cv_eval, the training part is
    passed to predict_stack_gen together with two BucketOfModels objects (one
    for Level 0, one for Level 1) and the predictions for the held-out part
    are compared with the true classes.

    INPUT:
    > data_sets: list of arrays. Each array represents a feature set and
        should have N rows, where N is the number of datapoints. Arrays may
        be NxD (D may differ between elements of the list) or 1-D of
        length N for a single feature.
    > target: array of N class labels (an Nx1 column is flattened).
    > cv_eval: iterable yielding (train_indices, test_indices) pairs that is
        used to evaluate performance. If None, StratifiedKFold(target, 10)
        is used; the number of folds is reduced (with a printed warning)
        when the smallest class has fewer than 10 datapoints.
    > clf_list_L0: list of the names of classifiers that should be
        considered for Level 0 generalization (default: ['svm_rbf']).
        Possible names: 'all', 'svm_rbf', 'svm_linear', 'svm_poly',
        'svm_sigmoid', 'log_reg', 'random_forest'.
        Note: L0 generalization will happen many times, so don't pick
        something too computationally expensive.
    > par_n_folds_L0: number of folds handed to the Level 0 BucketOfModels
        for its parameter search (default=4). It is lowered, with a printed
        warning, when it exceeds (smallest training class size - 1).
    > iterations_L0: how many times the parameter search grid should be
        redrawn for Level 0 (default=1).
    > clf_list_L1, par_n_folds_L1, iterations_L1: same as above, but for
        Level 1 generalization, which only happens once per fold, so these
        may be more computationally intensive
        (defaults: ['svm_rbf', 'svm_linear', 'svm_poly'], 20, 4).
        par_n_folds_L1 is lowered when it exceeds the smallest training
        class size.
    > equalize_training: if true, the training set of each fold is reduced
        to the indices returned by equalize_training_set (default=False).
    > verbose: print per-fold progress and accuracy (default=False).

    OUTPUT:
    > results: a dictionary with the key 'accuracy' (mean over all held-out
        datapoints) plus one key per original class label holding the mean
        score of that class (NaN if the class was never held out).
    """
    # Build the default name lists per call: these lists are stored on the
    # BucketOfModels objects, so a signature-level default would be shared
    # between every call of this function.
    if clf_list_L0 is None:
        clf_list_L0 = ['svm_rbf']
    if clf_list_L1 is None:
        clf_list_L1 = ['svm_rbf', 'svm_linear', 'svm_poly']

    # Turn targets into 0..K-1 and remember the original labels for the
    # output. np.bincount below needs a 1-D vector, so flatten first.
    class_labels, target = np.unique(np.ravel(target), return_inverse=True)
    num_classes = len(class_labels)

    if cv_eval is None:
        default_num_folds = 10
        # Stratified K-fold needs at least one datapoint of every class in
        # every fold, so cap the fold count at the smallest class size.
        smallest_cl_size = np.min(np.bincount(target))
        if smallest_cl_size < default_num_folds:
            print("Warning: the smallest class of your data has just " +
                  str(smallest_cl_size) + " datapoints which is insufficient "
                  "for the default cross-validation (which requires "
                  "at least " + str(default_num_folds) + " datapoints).")
            print("I've reduced the number of folds in the cross-validation "
                  "from " + str(default_num_folds) + " to " +
                  str(smallest_cl_size) + ", so it should be fine.")
            default_num_folds = smallest_cl_size
        cv_eval = StratifiedKFold(target, default_num_folds)

    # One list of 0/1 outcomes per class.
    class_scores = [[] for _ in range(num_classes)]

    # Level 0 must be probabilistic (see Wolpert '92): its class
    # probabilities become the Level 1 features.
    BOM_L0 = BucketOfModels(clf_list=clf_list_L0, iterations=iterations_L0,
                            par_n_folds=par_n_folds_L0, probability=True)
    BOM_L1 = BucketOfModels(clf_list=clf_list_L1, iterations=iterations_L1,
                            par_n_folds=par_n_folds_L1, probability=False)

    for fold_count, (train_indices, test_indices) in enumerate(cv_eval, 1):
        # Index rows only (no trailing ', :') so 1-D feature sets work too.
        train_data_sets = [data_set_i[train_indices]
                           for data_set_i in data_sets]
        train_target = target[train_indices]
        test_data_sets = [data_set_i[test_indices]
                          for data_set_i in data_sets]
        test_target = target[test_indices]

        if equalize_training:
            eq_train_indices = equalize_training_set(train_target)
            train_data_sets = [train_data_set_i[eq_train_indices]
                               for train_data_set_i in train_data_sets]
            train_target = train_target[eq_train_indices]

        # Make sure the parameter-search fold counts do not exceed the
        # smallest class of this training set. Re-index the labels first so
        # classes absent from this fold are not counted as size 0.
        _, tr_targ_sorted = np.unique(train_target, return_inverse=True)
        smallest_cl_size = np.min(np.bincount(tr_targ_sorted))
        # The '-1' is because L0 generalization leaves one datapoint out of
        # the training set while converting it to the L1 training set.
        if par_n_folds_L0 > smallest_cl_size - 1:
            par_n_folds_L0 = smallest_cl_size - 1
            BOM_L0.par_n_folds = par_n_folds_L0
            print("Warning: the parameter par_n_folds_L0 was too big, "
                  "so I reduced it to be: " + str(par_n_folds_L0) +
                  ". No problem though.")
        if par_n_folds_L1 > smallest_cl_size:
            par_n_folds_L1 = smallest_cl_size
            BOM_L1.par_n_folds = par_n_folds_L1
            print("Warning: the parameter par_n_folds_L1 was too big, "
                  "so I reduced it to be: " + str(par_n_folds_L1) +
                  ". No problem though.")

        if verbose:
            print("\nStarting on fold number: " + str(fold_count))

        predictions = predict_stack_gen(train_data_sets, train_target,
                                        test_data_sets, BOM_L0, BOM_L1,
                                        verbose)

        # 1 for a correct prediction, 0 otherwise; filed under the true class.
        fold_scores = []
        for predicted, actual in zip(predictions, test_target):
            success = int(predicted == actual)
            class_scores[actual].append(success)
            fold_scores.append(success)

        if verbose:
            print("The performance for the current " +
                  str(len(test_target)) + " holdout datapoints was: " +
                  str(round(np.mean(fold_scores) * 100, 1)) + "%\n")

    # Overall accuracy is the mean over every held-out datapoint.
    all_scores = [score for per_class in class_scores for score in per_class]
    results = {'accuracy': np.mean(np.array(all_scores))}
    for label, per_class in zip(class_labels, class_scores):
        # A class that never appeared in a test fold has no score; report
        # NaN directly instead of triggering NumPy's empty-mean warning.
        results[label] = np.mean(per_class) if per_class else np.nan
    return results
############################
# End of test_stack_gen_perf
############################
def predict_stack_gen(train_data_sets, train_target, test_data_sets,
                      BOM_L0, BOM_L1, verbose=False):
    """Predict the test datapoints with stacked generalization.

    Every feature set is converted to a single Level 1 feature by
    convert_train_and_test_to_L1; the resulting columns form the Level 1
    training and test sets. A Level 1 classifier is then trained with BOM_L1
    and applied to the Level 1 test set.

    INPUT:
    > train_data_sets: list of arrays, one per feature set, each with N rows
        (N = number of training datapoints). The number of features may
        differ between elements of the list.
    > train_target: the N training targets.
    > test_data_sets: list of arrays, one per feature set, each with M rows
        (M = number of test datapoints).
    > BOM_L0: the BucketOfModels object for Level 0 generalization.
    > BOM_L1: the BucketOfModels object for Level 1 generalization. It is
        used once per call, so it can afford more expensive settings.
    > verbose: verbose output if set to True (default: False).

    OUTPUT:
    > predictions: array of M predictions. If the trained Level 1 classifier
        has an attribute 'probability' that is True, the second column of
        its predict_proba output is returned instead of class predictions.
    """
    n_train_rows = train_data_sets[0].shape[0]
    n_test_rows = test_data_sets[0].shape[0]
    # One Level 0 generalizer, hence one Level 1 feature, per feature set.
    n_feature_sets = len(train_data_sets)

    # Start from -1 everywhere so a column that is never filled stands out.
    train_data_L1 = np.full((n_train_rows, n_feature_sets), -1.0)
    test_data_L1 = np.full((n_test_rows, n_feature_sets), -1.0)

    for column, train_set in enumerate(train_data_sets):
        train_column, test_column = convert_train_and_test_to_L1(
            train_set, train_target, test_data_sets[column], BOM_L0, verbose
        )
        train_data_L1[:, column] = train_column
        test_data_L1[:, column] = test_column

    # Level 1 generalization on the stacked Level 0 outputs.
    clf_L1 = BOM_L1.train_using_BOM(train_data_L1, train_target)
    if verbose:
        print("The classifier used on the Level 1 test set to get "
              "final predictions was: " + str(clf_L1))

    if getattr(clf_L1, 'probability', None) is True:
        # predict_proba returns a 2D array; keep only its second column.
        return clf_L1.predict_proba(test_data_L1)[:, 1]
    return clf_L1.predict(test_data_L1)
def convert_train_and_test_to_L1(train_data_L0, train_target, test_data_L0,
                                 BOM_L0, verbose=False):
    """Convert one feature set, already split into training and testing
    data, to a single Level 1 feature.

    Each training datapoint is left out in turn; a classifier obtained from
    BOM_L0 is trained on the remaining points and its probabilistic
    prediction for the left-out point becomes that point's Level 1 value.
    The test points are converted by a classifier trained on the full
    training set.

    INPUT:
    > train_data_L0: training data, NxD array, where N is the number of
        datapoints in the training set and D is the number of features of
        this feature set. A 1-D array of length N is treated as Nx1.
    > train_target: the N targets corresponding to the training datapoints.
        There should only be two unique elements in this array, since only
        the second column of predict_proba is kept.
    > test_data_L0: testing data, MxD array (or 1-D of length M), where M is
        the number of datapoints in the testing set.
    > BOM_L0: object whose train_using_BOM(data, target) returns a trained
        classifier offering predict_proba; normally the BucketOfModels
        object used for Level 0 generalization.
    > verbose: verbose output.

    OUTPUT:
    > partial_train_data_L1: array of length N. This is only one of the
        features of the Level 1 training data; the caller combines it with
        the results of the other feature sets.
    > partial_test_data_L1: array of length M.
    """
    # Single-feature data may arrive 1-D; make it a column so every
    # classifier call receives 2-D data.
    if train_data_L0.ndim == 1:
        train_data_L0 = train_data_L0.reshape(-1, 1)
    if test_data_L0.ndim == 1:
        test_data_L0 = test_data_L0.reshape(-1, 1)
    # Index arrays are applied to the targets below, which needs an ndarray.
    train_target = np.asarray(train_target)

    num_train_points = train_data_L0.shape[0]

    # Filled with -1 to more easily see an error down the line.
    train_data_partial_L1 = np.full(num_train_points, -1.0)

    # Leave-one-out partitioning done directly on the row indices, so this
    # step does not depend on any particular cross-validation iterator API.
    all_rows = np.arange(num_train_points)
    for held_out in range(num_train_points):
        kept_rows = all_rows[all_rows != held_out]
        temp_clf = BOM_L0.train_using_BOM(train_data_L0[kept_rows],
                                          train_target[kept_rows])
        # The one-row slice keeps the input 2-D; predict_proba returns a 2D
        # array, of which only the second column is saved.
        train_data_partial_L1[held_out] = temp_clf.predict_proba(
            train_data_L0[held_out:held_out + 1]
        )[0, 1]

    # Train on the full training set to convert the test set to Level 1.
    temp_clf = BOM_L0.train_using_BOM(train_data_L0, train_target)
    if verbose:
        print("The classifier chosen for the converting this set's test data "
              "to Level 1 was: " + str(temp_clf))
    # Again only the second column of the probabilities is kept.
    test_data_partial_L1 = temp_clf.predict_proba(test_data_L0)[:, 1]
    return train_data_partial_L1, test_data_partial_L1
class BucketOfModels(object):
    """This class implements Bucket of Models.

    The settings are stored on the object once instead of being passed every
    time Bucket of Models is used (this was the motivation for making it an
    object and not just a function). You may specify the following variables
    upon creation:

    > clf_list: a list of the names of classifiers that should be considered
        during the Bucket of Models (default: ['svm_rbf']).
        Possible names: 'all', 'svm_rbf', 'svm_linear', 'svm_poly',
        'svm_sigmoid', 'random_forest'
    > iterations: how many times the parameter grid should be redrawn
        (default: 2).
    > cv: cross-validation to be used in determining best parameters
        (if this argument is not passed, then StratifiedKFold CV is used
        with number of folds set to par_n_folds).
    > par_n_folds: number of folds for StratifiedKFold CV (default: 10).
        This argument is only considered if the cv argument is not passed
        (or cv=None is passed).
    > probability: set to True if you want a probabilistic classifier
        returned (default: False).

    All of these are forwarded unchanged to find_best_clf on every call of
    train_using_BOM.
    """

    def __init__(self, clf_list=None, iterations=2, cv=None,
                 par_n_folds=10, probability=False):
        # A fresh default list per instance: a list in the signature would be
        # one shared object, so editing one instance's clf_list would
        # silently change every other instance created with the default.
        self.clf_list = ['svm_rbf'] if clf_list is None else clf_list
        self.iterations = iterations
        self.cv = cv
        self.par_n_folds = par_n_folds
        self.probability = probability

    def train_using_BOM(self, train_data, train_target):
        """Pick the best classifier for this training set and train it.

        The classifier and its parameters are chosen by find_best_clf using
        the settings stored on this object.

        RETURNS: Trained classifier (an SKLearn classifier.
                 See scikit-learn.org)
        """
        # find the best classifier and parameters for this training set
        clf = find_best_clf(train_data, train_target, clf_list=self.clf_list,
                            iterations=self.iterations, cv=self.cv,
                            par_n_folds=self.par_n_folds,
                            probability=self.probability)
        # train this classifier on the data
        clf.fit(train_data, train_target)
        return clf
#############################
# End of BucketOfModels class
#############################