import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from typing import List, Tuple, Callable
from sklearn.metrics import roc_auc_score
# Notes:
# 1. As of March 2022, I still manually rename and move the oof and sub files into the data/oof_and_subs folder; consider automating this.
# 2. Take y_trues from the oof dataframe rather than from the raw data: the raw data may have been shuffled, so its row order may not match the oof predictions.
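# A possible automation sketch for note 1 (the run-directory layout below is hypothetical, not part of this pipeline):
#   import glob, shutil
#   for src in glob.glob("./runs/**/*oof*.csv", recursive=True) + glob.glob("./runs/**/*sub*.csv", recursive=True):
#       shutil.copy(src, "./data/oof_and_subs/")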
# pylint: disable=missing-docstring
def stack_oofs(
oof_dfs: List[pd.DataFrame], pred_column_names: List[str]
) -> np.ndarray:
"""Stack all oof predictions horziontally.
Args:
oof_dfs (List[pd.DataFrame]): The list of oof predictions in dataframes.
pred_column_names (List[str]): The list of prediction column names.
Returns:
all_oof_preds (np.ndarray): The stacked oof predictions of shape (num_samples, num_oofs * num_pred_columns).
Example:
>>> oof_1 = pd.DataFrame([1,2,3], columns=['class_1_oof'])
>>> oof_2 = pd.DataFrame([4,5,6], columns=['class_1_oof'])
>>> all_oof_preds = stack_oofs([oof_1, oof_2], ['class_1_oof'])
>>> all_oof_preds = np.array([1, 4], [2, 5], [3, 6])
"""
num_oofs = len(oof_dfs)
num_samples = len(oof_dfs[0])
num_target_cols = len(pred_column_names)
all_oof_preds = np.zeros((num_samples, num_oofs * num_target_cols))
if num_target_cols == 1:
for index, oof_df in enumerate(oof_dfs):
all_oof_preds[:, index : index + 1] = oof_df[
pred_column_names
].values
elif num_target_cols > 1:
# Used in RANZCR where there are 11 target columns
for index, oof_df in enumerate(oof_dfs):
all_oof_preds[
:, index * num_target_cols : (index + 1) * num_target_cols
] = oof_df[pred_column_names].values
return all_oof_preds
def macro_multilabel_auc(
label, pred, num_target_cols: int = 1, multilabel: bool = False
):
"""Also works for binary AUC like Melanoma"""
if not multilabel:
return roc_auc_score(label, pred)
else:
aucs = []
for i in range(num_target_cols):
print(label[:, i])
print()
print(pred[:, i])
print(roc_auc_score(label[:, i], pred[:, i]))
aucs.append(roc_auc_score(label, pred))
return np.mean(aucs)
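# Illustrative multilabel usage (values are hypothetical):
#   label = np.array([[1, 0], [0, 1], [1, 1]])
#   pred = np.array([[0.9, 0.2], [0.1, 0.8], [0.7, 0.6]])
#   macro_multilabel_auc(label, pred, num_target_cols=2, multilabel=True)
# averages the per-column AUCs (1.0 and 1.0 here) and returns 1.0.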
def compute_best_oof(
all_oof_preds: np.ndarray,
y_trues: np.ndarray,
num_oofs: int,
performance_metric: Callable,
) -> Tuple[float, int]:
"""Compute the oof score of all models using a performance metric and return the best model index and score.
Args:
all_oof_preds (np.ndarray): The stacked oof predictions of shape (num_samples, num_oofs * num_pred_columns). Taken from stack_oofs.
y_trues (np.ndarray): The true labels of shape (num_samples, num_target_cols).
num_oofs (int): The number of oof predictions.
performance_metric (Callable): The performance metric to use, this is a function.
Returns:
best_oof_metric_score (float): The best oof score.
        best_oof_index (int): The index of the best-scoring model's oof.
"""
all_oof_scores = []
for k in range(num_oofs):
metric_score = performance_metric(
y_trues,
all_oof_preds[:, k],
num_target_cols=1,
multilabel=False,
)
all_oof_scores.append(metric_score)
print(f"Model {k} has OOF AUC = {metric_score}")
best_oof_metric_score, best_oof_index = np.max(all_oof_scores), np.argmax(
all_oof_scores
)
return best_oof_metric_score, best_oof_index
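# Minimal usage sketch (the scores below are hypothetical): with three stacked single-column oofs,
#   best_score, best_index = compute_best_oof(all_oof_preds, y_trues, num_oofs=3, performance_metric=macro_multilabel_auc)
# prints each model's OOF AUC and might return, e.g., (0.91, 2) if the third model scores best.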
def calculate_best_score_over_weight_interval(
    weight_interval: int,
model_i_oof: np.ndarray,
model_j_oof: np.ndarray,
y_trues: np.ndarray,
performance_metric: Callable,
running_best_score: float,
running_best_weight: float,
patience: int,
) -> Tuple[float, float]:
"""Calculate the best score over a weight interval.
Args:
weight_interval (float): _description_
model_i_oof (np.ndarray): _description_
model_j_oof (np.ndarray): _description_
y_trues (np.ndarray): _description_
performance_metric (Callable): _description_
running_best_score (float): _description_
running_best_weight (float): _description_
patience (int): _description_
Returns:
Tuple[float, float]: _description_
"""
patience_counter = 0
for weight in range(weight_interval):
temp_weight = weight / weight_interval
temp_ensemble_oof_preds = (
temp_weight * model_j_oof + (1 - temp_weight) * model_i_oof
)
temp_ensemble_oof_score = performance_metric(
y_trues,
temp_ensemble_oof_preds,
num_target_cols=1,
multilabel=False,
)
        # If this blend beats the running best, keep its score and weight; otherwise count it towards patience.
if temp_ensemble_oof_score > running_best_score:
running_best_score = temp_ensemble_oof_score
running_best_weight = temp_weight
else:
patience_counter += 1
if patience_counter > patience:
break
return running_best_score, running_best_weight
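# A minimal sketch of how this grid search is typically called (oof_a and oof_b are hypothetical (n, 1) arrays):
#   best_score, best_w = calculate_best_score_over_weight_interval(
#       weight_interval=100, model_i_oof=oof_a, model_j_oof=oof_b, y_trues=y_trues,
#       performance_metric=macro_multilabel_auc, running_best_score=0.0,
#       running_best_weight=0.0, patience=10,
#   )
# At step w, the blend scored is (w / 100) * oof_b + (1 - w / 100) * oof_a.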
def get_blended_oof(
    initial_best_model_oof, best_oof_index_list, best_weights_list
):
    """Rebuild the running blend by re-applying the tracked indices and weights.
    Can be used on both oof and sub predictions. Note: reads the module-level
    global all_oof_preds defined in the __main__ block below.
    """
    curr_model_oof = initial_best_model_oof
for index, _ in enumerate(best_oof_index_list[1:]):
model_j_index = best_oof_index_list[index + 1]
curr_model_oof = (1 - best_weights_list[index]) * curr_model_oof + (
best_weights_list[index]
) * all_oof_preds[:, model_j_index].reshape(-1, 1)
return curr_model_oof
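# Worked example (hypothetical numbers): with best_oof_index_list == [2, 0] and best_weights_list == [0.3],
# the rebuilt blend is 0.7 * initial_best_model_oof + 0.3 * all_oof_preds[:, 0].reshape(-1, 1).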
def return_list_of_dataframes(
path: str, is_oof: bool = True
) -> Tuple[List[pd.DataFrame], int]:
"""Return a list of dataframes from a directory of files.
The boolean is_oof is used to determine whether the list of dataframes contains oof or subs.
Args:
path (str): The path to the directory containing the files.
is_oof (bool, optional): Determine whether the list of dataframes contains oof or subs. Defaults to True.
Returns:
List[pd.DataFrame]: The list of dataframes for either oof or subs.
int: The number of files in the directory.
"""
oof_and_subs_files = os.listdir(path)
if is_oof:
oof_files_sorted = np.sort(
[f for f in oof_and_subs_files if "oof" in f]
)
return [
pd.read_csv(os.path.join(path, k)) for k in oof_files_sorted
], len(oof_files_sorted)
else:
sub_files_sorted = np.sort(
[f for f in oof_and_subs_files if "sub" in f]
)
return [
pd.read_csv(os.path.join(path, k)) for k in sub_files_sorted
], len(sub_files_sorted)
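# Example usage (assuming ./data/oof_and_subs holds files with "oof" or "sub" in their names):
#   oof_dfs, num_oof_files = return_list_of_dataframes("./data/oof_and_subs", is_oof=True)
#   sub_dfs, num_sub_files = return_list_of_dataframes("./data/oof_and_subs", is_oof=False)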
if __name__ == "__main__":
oof_and_subs_path = "./data/oof_and_subs"
# ["oof_1.csv", "sub_1.csv", "oof_2.csv", "sub_2.csv"]
oof_and_subs_files = os.listdir(oof_and_subs_path)
# ["oof_1.csv", "oof_2.csv", "sub_1.csv", "sub_2.csv"] sorted
oof_files_sorted = np.sort([f for f in oof_and_subs_files if "oof" in f])
    # [oof_1_df, oof_2_df] as dataframes (only the oof files, since subs were filtered out above)
oof_dfs_list = [
pd.read_csv(os.path.join(oof_and_subs_path, k))
for k in oof_files_sorted
]
num_oofs = len(oof_dfs_list)
    # Each oof file also stores the corresponding y_trues, so we take them from the first oof_df,
    # assuming the labels (and their row order) are identical across all oof files.
    # Caution: if you use different resampling methods per model, adjust the y_trues accordingly.
y_trues_df = oof_dfs_list[0][["oof_trues"]]
y_trues = y_trues_df.values
print(f"We have {len(oof_files_sorted)} oof files.\n")
target_cols = ["oof_trues"]
pred_cols = ["class_1_oof"]
num_target_cols = len(target_cols)
all_oof_preds = stack_oofs(
oof_dfs=oof_dfs_list, pred_column_names=pred_cols
)
print(
f"all_oof_preds shape: {all_oof_preds.shape}\nThis variable is global and holds all oof predictions stacked horizontally.\n"
)
best_oof_metric_score, best_oof_index = compute_best_oof(
all_oof_preds=all_oof_preds,
y_trues=y_trues,
num_oofs=num_oofs,
performance_metric=macro_multilabel_auc,
)
print(
f"\n### Computing Best OOF scores among all models ###\nThe best OOF AUC score is {best_oof_metric_score} and the best model index is {best_oof_index} corresponding to the oof file {oof_files_sorted[best_oof_index]}"
)
    weight_interval = 1000  # alternative: 200
    patience = 20  # alternative: 10
    min_increase = 0.0003  # alternative: 0.00003
print(
f"\n### HyperParameters ###\nweight_interval = {weight_interval}\npatience = {patience}\nmin_increase = {min_increase}\n"
)
# keep track of oof index that are blended
best_oof_index_list = [best_oof_index]
best_weights_list = []
print(f"Current Tracked Model List: {best_oof_index_list}")
print(f"Current Weights List: {best_weights_list}")
counter = 0
# Initially, this curr_model_oof is the single best model that we got above from [oof_1, oof_2,...]
initial_best_model_oof = all_oof_preds[:, best_oof_index].reshape(-1, 1)
old_best_score = best_oof_metric_score
model_i_best_score, model_i_index, model_i_weights = 0, 0, 0
print(
"Denote model i as the current model, and model j as the model that we are blending with."
)
for outer_oof_index in range(num_oofs):
        # In the first outer iteration the current model is simply the single best model found above;
        # afterwards it is the blend accumulated so far (rebuilt by get_blended_oof).
curr_model_oof = initial_best_model_oof
if counter > 0:
curr_model_oof = get_blended_oof(
initial_best_model_oof, best_oof_index_list, best_weights_list
)
print(curr_model_oof)
for inner_oof_index in range(num_oofs):
# If we have [oof_1, oof_2] and best_oof_index = 1 (oof_2), then we do not need to blend oof_2 and itself.
if inner_oof_index in best_oof_index_list:
continue
            # Reset the running search state for this candidate model; whether the blend is kept
            # is decided later by comparing model_i_best_score against old_best_score and min_increase.
            running_best_score, running_best_weight = 0, 0
# what we are doing here is to find the best oof score among all models that we have not blended yet.
# for example, if we have [oof_1, oof_2, oof_3], and we know oof_2 is our initial_best_model_oof,
# then we need to blend oof_2 with oof_1, then oof_2 with oof_3 to find out which of them yields the best overall oof when blended.
(
running_best_score,
running_best_weight,
) = calculate_best_score_over_weight_interval(
weight_interval,
curr_model_oof,
all_oof_preds[:, inner_oof_index].reshape(-1, 1),
y_trues,
macro_multilabel_auc,
running_best_score,
running_best_weight,
patience,
)
if running_best_score > model_i_best_score:
model_i_index = inner_oof_index
model_i_best_score = running_best_score
model_i_weights = running_best_weight
increment = model_i_best_score - old_best_score
if increment <= min_increase:
print("Increment is too small, stop blending")
break
# DISPLAY RESULTS
print()
print(
"Ensemble AUC = %.4f after adding model %i with weight %.3f. Increase of %.4f"
% (
model_i_best_score,
model_i_index,
model_i_weights,
increment,
)
)
print()
old_best_score = model_i_best_score
best_oof_index_list.append(model_i_index)
best_weights_list.append(model_i_weights)
print(f"Current Tracked Model List: {best_oof_index_list}")
print(f"Current Weights List: {best_weights_list}")
print(f"Current Best Score: {model_i_best_score}")
counter += 1
plt.hist(curr_model_oof, bins=100)
plt.title("Ensemble OOF predictions")
plt.show()
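    # Worked example of the final blend (hypothetical numbers): if best_oof_index_list == [2, 0, 1]
    # and best_weights_list == [0.3, 0.1], the ensemble is
    #   0.9 * (0.7 * oof_2 + 0.3 * oof_0) + 0.1 * oof_1,
    # i.e. each newly added model's weight is applied against the blend accumulated so far.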
# apply on submission
sub_files_sorted = np.sort([f for f in oof_and_subs_files if "sub" in f])
sub_dfs_list = [
pd.read_csv(os.path.join(oof_and_subs_path, k))
for k in sub_files_sorted
]
print(f"\nWe have {len(sub_files_sorted)} submission files...")
print()
print(sub_files_sorted)
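    # Stack the submission predictions horizontally, mirroring stack_oofs (a single pred column is assumed here).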
y = np.zeros((len(sub_dfs_list[0]), len(sub_files_sorted) * len(pred_cols)))
print(y.shape)
for k in range(len(sub_files_sorted)):
y[
:, int(k * len(pred_cols)) : int((k + 1) * len(pred_cols))
] = sub_dfs_list[k]["target"].values.reshape(-1, 1)
print(y)
md2 = y[
:,
int(best_oof_index_list[0] * len(pred_cols)) : int(
(best_oof_index_list[0] + 1) * len(pred_cols)
),
]
print(md2)
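    # Fold in the remaining tracked models with their oof-derived weights,
    # mirroring get_blended_oof but applied to the submission predictions.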
for i, k in enumerate(best_oof_index_list[1:]):
md2 = (
best_weights_list[i]
* y[:, int(k * len(pred_cols)) : int((k + 1) * len(pred_cols))]
+ (1 - best_weights_list[i]) * md2
)
plt.hist(md2, bins=100)
plt.show()
df = sub_dfs_list[0].copy()
df[["target"]] = md2
df.to_csv("submission.csv", index=False)
    print(df.head())