# datasets.py
import os
import pickle
import datetime
import gc
import subprocess
import time
import io
import csv
import kaggle
from math import sqrt
import math
from math import ceil
import numpy as np
import pandas as pd
from itertools import combinations
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, QuantileTransformer, OrdinalEncoder, StandardScaler, LabelEncoder, MinMaxScaler
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders import TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold, StratifiedGroupKFold
from sklearn.metrics import mean_squared_error
from utils import set_seed, get_metric, get_memory_usage, sizeof_fmt, reduce_mem_usage, merge_by_concat
from models import get_model
import xgboost as xgb
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans
import openfe
def get_dataset(dataset_name, toy_example=False):
if dataset_name=="santander-value-prediction-challenge":
return SantanderValueDataset(toy_example)
elif dataset_name=="mercedes-benz-greener-manufacturing":
return MercedesBenzDataset(toy_example)
elif dataset_name=="m5-forecasting-accuracy":
return M5ForecastDataset(toy_example)
elif dataset_name=="santander-customer-transaction-prediction":
return SantanderTransactionDataset(toy_example)
elif dataset_name=="ieee-fraud-detection":
return IEEEFraudDetectionDataset(toy_example)
elif dataset_name=="amazon-employee-access-challenge":
return AmazonEmployeeAccessDataset(toy_example)
elif dataset_name=="higgs-boson":
return HiggsBosonDataset(toy_example)
elif dataset_name=="santander-customer-satisfaction":
return SantanderSatisfactionDataset(toy_example)
elif dataset_name=="porto-seguro-safe-driver-prediction":
return PortoSeguroDriverDataset(toy_example)
elif dataset_name=="sberbank-russian-housing-market":
return SberbankHousingDataset(toy_example)
elif dataset_name == "walmart-recruiting-trip-type-classification":
return WalmartRecruitingTripType(toy_example)
elif dataset_name == "allstate-claims-severity":
return AllstateClaimsSeverity(toy_example)
elif dataset_name == "bnp-paribas-cardif-claims-management":
return BNPParibasCardifClaimsManagement(toy_example)
elif dataset_name == "restaurant-revenue-prediction":
return RestaurantRevenuePrediction(toy_example)
elif dataset_name == "home-credit-default-risk":
return HomeCreditDefaultRisk(toy_example)
elif dataset_name == "icr-identify-age-related-conditions":
return ICRIdentifyAgeRelatedConditions(toy_example)
elif dataset_name == "lish-moa":
return MoAPrediction(toy_example)
elif dataset_name == "zillow-prize-1":
return ZillowPrice(toy_example)
elif dataset_name == "otto-group-product-classification-challenge":
return OttoGroupProductClassification(toy_example)
elif dataset_name == "springleaf-marketing-response":
return SpringleafMarketingResponse(toy_example)
elif dataset_name == "prudential-life-insurance-assessment":
return PrudentialLifeInsuranceAssessment(toy_example)
elif dataset_name == "microsoft-malware-prediction":
return MicrosoftMalwarePrediction(toy_example)
elif dataset_name == "homesite-quote-conversion":
return HomesiteQuoteConversion(toy_example)
elif dataset_name == "predicting-red-hat-business-value":
return PredictingRedHatBusinessValue(toy_example)
elif dataset_name == "talkingdata-mobile-user-demographics":
return TalkingdataMobileUserDemographics(toy_example)
else:
raise ValueError(f"Dataset '{dataset_name}' not implemented.")
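# Illustrative usage sketch (not part of the original pipeline): shows how the factory
# above is typically combined with the BaseDataset methods defined below. The helper
# name and the chosen competition are examples only; it assumes the raw Kaggle files
# have already been downloaded to ./datasets/<competition>/raw/.
def _example_dataset_usage(dataset_name="mercedes-benz-greener-manufacturing"):
    dataset = get_dataset(dataset_name, toy_example=True)
    dataset.load_data()
    dataset.minimalistic_preprocessing(dataset.X_train, dataset.X_test, dataset.y_train)
    folds = dataset.get_cv_folds(dataset.X_train, dataset.y_train, seed=42)
    print(f"{dataset.dataset_name}: {dataset.X_train.shape[0]} train rows, "
          f"{len(folds)} folds, metric={dataset.eval_metric_name}")
    return dataset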
class BaseDataset:
''' All implemented datasets should inherit from this base dataset to maintain the same structure.
All new datasets should define their own __init__ methods, which set the following dataset-specific attributes:
- dataset_name: name of the dataset (defined by the name of the Kaggle competition)
- task_type: "regression", "binary", or "classification"
- eval_metric_name: name of the evaluation metric (needs to be a metric included in utils.py)
- cat_indices: list of indices of the categorical features
- y_col: name of the target column
Whenever the logic differs from the BaseDataset defaults, new datasets must override the following methods:
- load_data: all steps required to obtain a single table that is as raw as possible
- pred_to_submission: transform predicted test values into the correct format for submission
- get_cv_folds: CV procedure reproducing the dataset-specific expert CV procedure
Ideally, new datasets also implement dataset-specific expert preprocessing:
- expert_preprocessing: all preprocessing and feature engineering steps in the pipeline of a high-ranked expert solution
(An illustrative minimal subclass sketch follows the class definition below.)
'''
def __init__(self, toy_example=False):
self.toy_example = toy_example
self.dataset_name = ""
self.cat_indices = []
self.y_col = ""
self.heavy_tailed = False
self.preprocess_states = []
# experimental
self.trial_budget = 100
self.batch_size = 128
self.x_scaled = False
self.large_dataset = False
def load_data(self):
data = pd.read_csv(f'./datasets/{self.dataset_name}/raw/train.csv', engine="pyarrow")
X_test = pd.read_csv(f'./datasets/{self.dataset_name}/raw/test.csv', engine="pyarrow")
if self.toy_example:
data = data.iloc[:1000]
X_test = X_test.iloc[:1000]
y_train = data[self.y_col]
X_train = data.drop(self.y_col,axis=1)
if self.task_type== "classification":
self.target_label_enc = LabelEncoder()
y_train = pd.Series(self.target_label_enc.fit_transform(y_train),index=y_train.index, name=y_train.name)
self.num_classes = y_train.nunique()
self.X_train, self.X_test, self.y_train = X_train, X_test, y_train
def minimalistic_preprocessing(self, X_train, X_test, y_train,
scaler=None, one_hot_encode=False, use_test=True):
'''Preprocessing based on McElfresh et al. 2023
- Define categorical feature types
- Fill missing values with mean
- Optionally: scale numeric features or apply OHE to categoricals
'''
print("Apply minimalistic preprocessing")
# Encode binary cat features as numeric
for col in X_train.columns[X_train.nunique()==2]:
if X_train[col].dtype in [str, "O", "category", "object"]:
le = LabelEncoder()
mode = X_train[col].mode()[0]
X_train[col] = le.fit_transform(X_train[col])
if len(X_test[col].unique())==2:
X_test[col] = le.transform(X_test[col])
else:
X_test[col] = X_test[col].fillna(mode)
X_test[col] = le.transform(X_test[col])
# Define categorical feature types
self.cat_indices += list(np.where(X_train.dtypes=="O")[0])
self.cat_indices += list(np.where(X_train.dtypes=="object")[0])
self.cat_indices += list(np.where(X_train.dtypes=="category")[0])
self.cat_indices = np.unique(self.cat_indices).tolist()
for num, col in list(zip(self.cat_indices,X_train.columns[self.cat_indices])):
# Encode binary categorical features
if X_train[col].nunique()==2:
value_1 = X_train[col].dropna().unique()[0]
X_train[col] = (X_train[col]==value_1).astype(float)
X_test[col] = (X_test[col]==value_1).astype(float)
self.cat_indices.remove(num)
else:
# Note: The category dtype needs to include all train categories when applying .astype("category") to the test data
dtype = pd.CategoricalDtype(categories=list(X_train[col].astype(str).fillna("nan").unique()))
X_train[col] = X_train[col].astype(str).fillna("nan").astype(dtype)
X_test[col] = X_test[col].astype(str).fillna("nan").astype(dtype)
cont_indices = np.array([i for i in range(X_train.shape[1]) if i not in self.cat_indices])
cont_col_names = X_train.iloc[:,cont_indices].columns
X_concat = pd.concat([X_train, X_test])
# Fill missing values of continuous columns with mean
if X_train.isna().sum().sum()>0:
if use_test:
X_test[cont_col_names] = X_test[cont_col_names].fillna(X_concat[cont_col_names].mean())
X_train[cont_col_names] = X_train[cont_col_names].fillna(X_concat[cont_col_names].mean())
else:
X_test[cont_col_names] = X_test[cont_col_names].fillna(X_train[cont_col_names].mean())
X_train[cont_col_names] = X_train[cont_col_names].fillna(X_train[cont_col_names].mean())
# if scaler is not None:
# X_train[cont_col_names] = scaler_function.fit_transform(X_train[cont_col_names])
# X_test[cont_col_names] = scaler_function.transform(X_test[cont_col_names])
# if one_hot_encode:
# ohe = OneHotEncoder(sparse=False, handle_unknown="ignore")
# new_x1 = ohe.fit_transform(X_train[:, self.cat_indices])
# X_train = np.concatenate([new_x1, X_train[:, num_mask]], axis=1)
# new_x1_test = ohe.transform(X_test[:, self.cat_indices])
# X_test = np.concatenate([new_x1_test, X_test[:, num_mask]], axis=1)
# self.cat_indices = []
# Drop constant columns
# drop_cols = X_train.columns[X_train.nunique()==X_train.shape[0]].values.tolist()
drop_cols = X_train.columns[X_train.nunique()==1].values.tolist()
if len(drop_cols)>0:
print(f"Drop {len(drop_cols)} constant/unique features")
original_categorical_names = X_train.columns[self.cat_indices]
X_train.drop(drop_cols,axis=1,inplace=True)
X_test.drop(drop_cols,axis=1,inplace=True)
self.cat_indices = [np.where(X_train.columns==i)[0][0] for i in original_categorical_names if i in X_train.columns]
if self.heavy_tailed: # Todo: Might move to minimalistic
y_train = np.log1p(y_train)
self.preprocess_states.append("minimalistic")
self.X_train, self.X_test, self.y_train = X_train, X_test, y_train
def minimalistic_postprocessing(self, X_train, y, **kwargs):
if self.task_type=="regression":
if self.heavy_tailed:
y = np.expm1(y)
return y
def openfe_preprocessing(self, X_train, X_test, y_train, overwrite_existing=False):
if not os.path.exists(f"./datasets/{self.dataset_name}/processed/X_train_openfe.pickle") or overwrite_existing:
print("Apply OpenFE preprocessing")
import warnings
warnings.filterwarnings("ignore")
task = "regression" if self.task_type == "regression" else "classification"
cont_indices = np.array([i for i in range(X_train.shape[1]) if i not in self.cat_indices])
cont_col_names = X_train.iloc[:,cont_indices].columns.values.tolist()
if len(self.cat_indices)>0:
cat_col_names = X_train.columns[self.cat_indices]
else:
cat_col_names = None
candidate_features_list = openfe.get_candidate_features(numerical_features=cont_col_names, categorical_features=cat_col_names, order=1)
ofe = openfe.OpenFE()
features = ofe.fit(data=X_train, label=y_train, n_jobs=os.cpu_count(), task=task, n_data_blocks=8,
candidate_features_list=candidate_features_list,
stage2_params={"verbose": -1},
verbose=True, tmp_save_path=f'./openfe_tmp_data_{self.dataset_name}.feather')
X_train_new, X_test_new = openfe.transform(X_train, X_test, features, n_jobs=os.cpu_count())
is_combined = [f.name=='Combine' for f in features]
if sum(is_combined)>0:
self.cat_indices += list(np.where([f.name=='Combine' for f in features])[0]+X_train.shape[1])
self.X_train, self.X_test = X_train_new, X_test_new
os.makedirs(f'./datasets/{self.dataset_name}/processed/', exist_ok=True)
pickle.dump(self.X_train, open(f'./datasets/{self.dataset_name}/processed/X_train_openfe.pickle', 'wb'))
pickle.dump(self.X_test, open(f'./datasets/{self.dataset_name}/processed/X_test_openfe.pickle', 'wb'))
pickle.dump(self.cat_indices, open(f'./datasets/{self.dataset_name}/processed/cat_indices_openfe.pickle', 'wb'))
else:
print(f"Load existing openFE-preprocessed data")
X_train = pickle.load(open(f'./datasets/{self.dataset_name}/processed/X_train_openfe.pickle', 'rb'))
X_test = pickle.load(open(f'./datasets/{self.dataset_name}/processed/X_test_openfe.pickle', 'rb'))
self.cat_indices = pickle.load(open(f'./datasets/{self.dataset_name}/processed/cat_indices_openfe.pickle', 'rb'))
self.X_train, self.X_test = X_train, X_test
self.preprocess_states.append("openfe")
def automated_feature_engineering(self, X_train, X_test, y_train):
'''Preprocessing with openFE'''
self.preprocess_states.append("autoFE")
self.X_train, self.X_test, self.y_train = X_train, X_test, y_train
def expert_preprocessing(self, X_train, X_test, y_train, overwrite_existing=False, **kwargs):
print("Expert preprocessing not implemented yet")
self.X_train, self.X_test, self.y_train = X_train, X_test, y_train
def neuralnet_preprocessing(self, X_train, X_test, y_train, use_test=True):
if self.task_type=="regression":
# if self.heavy_tailed: # Todo: Might move to minimalistic
# y_train = np.log1p(y_train)
self.target_scaler = StandardScaler()
y_train = pd.Series(self.target_scaler.fit_transform(y_train.values.reshape(-1,1)).ravel(),
name=self.y_col, index = X_train.index)
# Drop constant columns
# drop_cols = X_train.columns[X_train.nunique()==X_train.shape[0]].values.tolist()
drop_cols = X_train.columns[X_train.nunique()==1].values.tolist()
if len(drop_cols)>0:
print(f"Drop {len(drop_cols)} constant/unique features")
original_categorical_names = X_train.columns[self.cat_indices]
X_train.drop(drop_cols,axis=1,inplace=True)
X_test.drop(drop_cols,axis=1,inplace=True)
self.cat_indices = [np.where(X_train.columns==i)[0][0] for i in original_categorical_names if i in X_train.columns]
# Drop constant nan cols
if len(self.cat_indices)==0: #
nan_cols = X_train.columns[X_train.isna().sum()==X_train.shape[0]].values.tolist()
if len(nan_cols)>0:
print(f"Drop {len(nan_cols)} all-nan features")
# original_categorical_names = X_train.columns[self.cat_indices]
X_train.drop(nan_cols,axis=1,inplace=True)
X_test.drop(nan_cols,axis=1,inplace=True)
# self.cat_indices = [np.where(X_train.columns==i)[0][0] for i in original_categorical_names if i in X_train.columns]
X_concat = pd.concat([X_train, X_test])
cont_indices = np.array([i for i in range(X_train.shape[1]) if i not in self.cat_indices])
cont_col_names = X_train.iloc[:,cont_indices].columns
X_concat[cont_col_names] = X_concat[cont_col_names].astype(np.float32)
X_train[cont_col_names] = X_train[cont_col_names].astype(np.float32)
X_test[cont_col_names] = X_test[cont_col_names].astype(np.float32)
# Apply ordinal encoding to all categorical features
if len(self.cat_indices)>0:
cat_col_names = X_train.iloc[:,self.cat_indices].columns
for col in cat_col_names:
enc = OrdinalEncoder(handle_unknown="use_encoded_value",
unknown_value=X_train[col].nunique(),
encoded_missing_value=X_train[col].nunique()
)
X_train[col] = enc.fit_transform(X_train[col].values.reshape(-1,1)).astype(int)
X_test[col] = enc.transform(X_test[col].values.reshape(-1,1)).astype(int)
# Fill missing values of continuous columns with mean
if X_train.isna().sum().sum()>0 or X_test.isna().sum().sum()>0:
if use_test:
X_test[cont_col_names] = X_test[cont_col_names].fillna(X_concat[cont_col_names].mean())
X_train[cont_col_names] = X_train[cont_col_names].fillna(X_concat[cont_col_names].mean())
X_concat[cont_col_names] = X_concat[cont_col_names].fillna(X_concat[cont_col_names].mean())
else:
X_test[cont_col_names] = X_test[cont_col_names].fillna(X_train[cont_col_names].mean())
X_train[cont_col_names] = X_train[cont_col_names].fillna(X_train[cont_col_names].mean())
if X_train.shape[1]!=len(self.cat_indices):
if not self.x_scaled:
# self.x_scaler = QuantileTransformer(
# n_quantiles= 1000
# )
# X_train[cont_col_names] = self.x_scaler.fit_transform(X_train[cont_col_names])
# X_test[cont_col_names] = self.x_scaler.transform(X_test[cont_col_names])
quantile_noise = 1e-4
if use_test:
quantile_use = np.copy(X_concat[cont_col_names].values).astype(np.float64)
else:
quantile_use = np.copy(X_train[cont_col_names].values).astype(np.float64)
stds = np.std(quantile_use, axis=0, keepdims=True)
noise_std = quantile_noise / np.maximum(stds, quantile_noise)
quantile_use += noise_std * np.random.randn(*quantile_use.shape)
if use_test:
quantile_use = pd.DataFrame(quantile_use, columns=cont_col_names, index=X_concat.index)
else:
quantile_use = pd.DataFrame(quantile_use, columns=cont_col_names, index=X_train.index)
self.x_scaler = QuantileTransformer(
n_quantiles=min(quantile_use.shape[0], 1000),
output_distribution='normal')
self.x_scaler.fit(quantile_use.values.astype(np.float64))
X_train[cont_col_names] = self.x_scaler.transform(X_train[cont_col_names].values.astype(np.float64))
X_test[cont_col_names] = self.x_scaler.transform(X_test[cont_col_names].values.astype(np.float64))
self.x_scaled = True
self.preprocess_states.append("neuralnet")
self.X_train, self.X_test, self.y_train = X_train, X_test, y_train
def expert_postprocessing(self, X_train, y, **kwargs):
return y
def neuralnet_postprocessing(self, X, y):
if self.task_type=="regression":
if isinstance(y, pd.Series):
y = y.values.reshape(-1,1)
y = pd.Series(self.target_scaler.inverse_transform(y.reshape(-1,1)).ravel(),
name=self.y_col, index = X.index)
# if self.heavy_tailed:
# y = np.expm1(y)
return y
def get_cv_folds(self, X_train, y_train, seed=42):
### !! Not the original implementation - the original solution used 30-fold CV, but also discusses 5-fold
ss = KFold(n_splits=10, random_state=seed, shuffle=True)
folds = []
for num, (train,test) in enumerate(ss.split(y_train.copy(), y_train.copy())):
folds.append([train, test])
return folds
def pred_to_submission(self, y_pred):
try:
submission = pd.read_csv(f"datasets/{self.dataset_name}/raw/sample_submission.csv", engine="pyarrow")
except:
submission = pd.read_csv(f"datasets/{self.dataset_name}/raw/sampleSubmission.csv", engine="pyarrow")
if self.toy_example:
submission = submission.iloc[:1000]
submission[self.y_col] = y_pred
return submission
def submit_data(self, file_name):
'''
Submit a predictions file to Kaggle and look up its public/private scores and leaderboard ranks.
Users should be aware of the Kaggle CLI warning:
"Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /home/atschalz/.kaggle/kaggle.json'"
'''
# submit file to kaggle for evaluation
os.system(f"kaggle competitions submit -c {self.dataset_name} -f {file_name} -m 'submitted from python script'")
# get all submissions and their scores the user has made
# poll every 5 seconds (up to 10 times) until the submission has been processed
processed = False
cnt = 0
while not processed and cnt < 10:
time.sleep(5)
try:
command = f"kaggle competitions submissions --csv {self.dataset_name}"
shell_output = subprocess.check_output(command, shell=True, text=True)
# parse the shell output to a dataframe
csv_file = io.StringIO(shell_output)
reader = csv.reader(csv_file)
data = list(reader)
while data[0][0][:7]=="Warning":
print(data[0])
data = data[1:]
if data[0][0][:5]=="Error":
print(data[0])
exit()
# submissions_df = pd.DataFrame(data[1:], columns=data[0])
submissions_df = pd.DataFrame(kaggle.api.competitions_submissions_list(self.dataset_name))
public_score = float(submissions_df.loc[submissions_df.fileNameNullable==file_name.split("/")[-1],"publicScoreNullable"].iloc[0])
private_score = float(submissions_df.loc[submissions_df.fileNameNullable==file_name.split("/")[-1],"privateScoreNullable"].iloc[0])
processed = True
except:
print(f"{cnt} Waited for 5 seconds, but submission was not processed correctly")
cnt += 1
# get the public and private leaderboard to compute the rank of the submission
leaderboard_df = pd.read_csv(f"./datasets/{self.dataset_name}/leaderboard.csv")
# compute the public rank and percentile of the submission
lb_with_own = np.array(sorted(list(leaderboard_df.PublicScore.astype(float))+[public_score],reverse=True))
if self.eval_metric_direction=="minimize":
lb_with_own = lb_with_own[::-1]
public_rank = np.where(lb_with_own==public_score)[0][0]+1
public_percentile = (public_rank / len(leaderboard_df))
# compute the private rank and percentile of the submission
lb_with_own = np.array(sorted(list(leaderboard_df.PrivateScore.astype(float))+[private_score],reverse=True))
if self.eval_metric_direction=="minimize":
lb_with_own = lb_with_own[::-1]
private_rank = np.where(lb_with_own==private_score)[0][0]+1
private_percentile = (private_rank / len(leaderboard_df))
return public_score, private_score, public_rank, public_percentile, private_rank, private_percentile
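# Minimal subclass sketch illustrating the BaseDataset contract described in the class
# docstring above (added for documentation only). "example-competition" and the "target"
# column name are hypothetical placeholders; a real subclass would also override
# expert_preprocessing, pred_to_submission, and get_cv_folds wherever the defaults do not apply.
class _ExampleDataset(BaseDataset):
    def __init__(self, toy_example=False):
        super().__init__(toy_example)
        self.dataset_name = "example-competition"  # Kaggle competition slug (hypothetical)
        self.task_type = "regression"  # "regression", "binary", or "classification"
        self.eval_metric_name = "r2"  # must be a metric implemented in utils.get_metric
        self.eval_metric, self.eval_metric_direction = get_metric(self.eval_metric_name)
        self.cat_indices = []  # indices of categorical feature columns
        self.y_col = "target"  # name of the target column (hypothetical)
        self.large_dataset = False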
##########################################################
##########################################################
##########################################################
class MercedesBenzDataset(BaseDataset):
def __init__(self, toy_example=False):
super().__init__(toy_example)
self.dataset_name = "mercedes-benz-greener-manufacturing"
############## 0. Define Data Parameters ##############
self.task_type = "regression" # "binary", "classification"
self.eval_metric_name = "r2"
self.eval_metric, self.eval_metric_direction = get_metric(self.eval_metric_name)
self.cat_indices = [1,2,3,4,5,6,7,8]
self.y_col = "y"
self.large_dataset = False
def expert_preprocessing(self, X_train, X_test, y_train, overwrite_existing=False, use_test=True, cat_method=None, **kwargs):
'''
Solution implemented based on the descriptions in https://www.kaggle.com/competitions/mercedes-benz-greener-manufacturing/discussion/37700
1. Preprocessing:
- instead of throwing out outliers, clipped all y's at 155. 155 was selected from a visual inspection of y's distribution
2. Feature Engineering
- 'ID' and 'X0' as a single factorized array (noticed that I would have scored .55571 (private LB) if I had not included 'X0' as a single factorized array)
- X0: 15 of the 47 unique categories
- X1: 6 of the 27 unique categories
- X2: 13 of the 44 unique categories
- X3: 2 of the 7 unique categories
- X4: no categories
- X5: 9 of the 29 unique categories
- X6: 4 of the 12 unique categories
- X8: 5 of the 25 unique categories
- X10 - X385: 78 of the 357 binary features
3. CV: simple 5-fold KFold with shuffle set to true and the same random seed
- assumed CV scores within .001 of each other to be effectively equal.
4. Ensemble: stacked ensemble including GradientBoostingRegressor, RandomForestRegressor, and SVR
'''
if use_test and not self.toy_example:
dataset_version = "expert_test"
elif not use_test and not self.toy_example:
dataset_version = "expert_notest"
elif use_test and self.toy_example:
dataset_version = "expert_test_toy"
elif not use_test and self.toy_example:
dataset_version = "expert_notest_toy"
if cat_method is not None:
dataset_version = dataset_version+"_"+cat_method
if not os.path.exists(f"./datasets/{self.dataset_name}/processed/X_train_{dataset_version}.pickle") or overwrite_existing:
print(f"Apply expert preprocessing")
### Create interaction features
X_train["X314-315"] = X_train["X314"]+X_train["X315"]
X_train["X118-314-315"] = X_train["X118"]+X_train["X314"]+X_train["X315"]
X_train["X118-314-315-levels110"] = np.logical_and(np.logical_and(X_train["X118"]==1,X_train["X314"]==1), X_train["X315"]==0)*1
X_train["X47-48"] = X_train["X47"]+X_train["X48"]
X_test["X314-315"] = X_test["X314"]+X_test["X315"]
X_test["X118-314-315"] = X_test["X118"]+X_test["X314"]+X_test["X315"]
X_test["X118-314-315-levels110"] = np.logical_and(np.logical_and(X_test["X118"]==1,X_test["X314"]==1), X_test["X315"]==0)*1
X_test["X47-48"] = X_test["X47"]+X_test["X48"]
### Create a feature based on considerations of subprocesses in data creation
X_train["sum_122_128"] = X_train[["X122","X123","X124","X125","X126","X127","X128"]].sum(axis=1)
X_test["sum_122_128"] = X_test[["X122","X123","X124","X125","X126","X127","X128"]].sum(axis=1)
### Use only the generated features, X0, and the six features found to be most important
### Using the ID column (the index) as a feature helps
use_features = ["ID", "X314-315","X118-314-315","X118-314-315-levels110", "sum_122_128", "X47-48", "X0", "X314", "X279", "X232", "X261", "X29", "X127"]
X_train = X_train[use_features]
X_test = X_test[use_features]
### One-hot-encode X0
if cat_method == "model":
self.cat_indices = list(np.where(X_train.columns=="X0")[0])
else:
ohe = OneHotEncoder(handle_unknown='ignore')
cat = "X0"
x = X_train[cat]
ohe.fit(x.values.reshape(-1,1))
X_train_ohe = pd.DataFrame(ohe.transform(x.values.reshape(-1,1)).toarray(),
index = X_train.index,
columns=cat+"_"+ohe.categories_[0])
X_train = pd.concat([X_train,X_train_ohe],axis=1)
X_test_ohe = pd.DataFrame(ohe.transform(X_test[[cat]]).toarray(),
index = X_test.index,
columns=cat+"_"+ohe.categories_[0])
X_test = pd.concat([X_test,X_test_ohe],axis=1)
X_train.drop(cat,inplace=True,axis=1)
X_test.drop(cat,inplace=True,axis=1)
self.cat_indices = []
# Transform continuous to floats
# cont_indices = np.array([i for i in range(X_train.shape[1]) if i not in self.cat_indices])
# cont_col_names = X_train.iloc[:,cont_indices].columns
# X_train.loc[:,cont_col_names] = X_train[cont_col_names].astype(float)
### One-hot-encode categorical features
# cat_features = [f"X{i}" for i in range(9) if i!=7]
# bin_features = list(set(X_train.columns)-set(cat_features))
# for cat in cat_features:
# ohe = OneHotEncoder(handle_unknown='ignore')
# ohe.fit(X_train[[cat]])
# X_train_ohe = pd.DataFrame(ohe.transform(X_train[[cat]]).toarray(),
# index = X_train.index,
# columns=cat+"_"+ohe.categories_[0])
# X_train = pd.concat([X_train,X_train_ohe],axis=1)
# X_train.drop(cat,inplace=True,axis=1)
# X_test_ohe = pd.DataFrame(ohe.transform(X_test[[cat]]).toarray(),
# index = X_test.index,
# columns=cat+"_"+ohe.categories_[0])
# X_test = pd.concat([X_test,X_test_ohe],axis=1)
# X_test.drop(cat,inplace=True,axis=1)
os.makedirs(f'./datasets/{self.dataset_name}/processed/', exist_ok=True)
pickle.dump(X_train, open(f'./datasets/{self.dataset_name}/processed/X_train_{dataset_version}.pickle', 'wb'))
pickle.dump(y_train, open(f'./datasets/{self.dataset_name}/processed/y_train_{dataset_version}.pickle', 'wb'))
pickle.dump(X_test, open(f'./datasets/{self.dataset_name}/processed/X_test_{dataset_version}.pickle', 'wb'))
pickle.dump(self.cat_indices, open(f'./datasets/{self.dataset_name}/processed/cat_indices_{dataset_version}.pickle', 'wb'))
### Clip target to 155 (from second place solution)
# y_train[y_train>155] = 155.
else:
print(f"Load existing expert-preprocessed data")
X_train = pickle.load(open(f'./datasets/{self.dataset_name}/processed/X_train_{dataset_version}.pickle', 'rb'))
y_train = pickle.load(open(f'./datasets/{self.dataset_name}/processed/y_train_{dataset_version}.pickle', 'rb'))
X_test = pickle.load(open(f'./datasets/{self.dataset_name}/processed/X_test_{dataset_version}.pickle', 'rb'))
try:
self.cat_indices = pickle.load(open(f'./datasets/{self.dataset_name}/processed/cat_indices_{dataset_version}.pickle', 'rb'))
except:
self.cat_indices = []
self.preprocess_states.append("expert")
self.X_train, self.X_test, self.y_train = X_train, X_test, y_train
def openfe_preprocessing(self, X_train, X_test, y_train, overwrite_existing=False):
if not os.path.exists(f"./datasets/{self.dataset_name}/processed/X_train_openfe.pickle") or overwrite_existing:
print("Apply OpenFE preprocessing")
import warnings
warnings.filterwarnings("ignore")
task = "regression" if self.task_type == "regression" else "classification"
numeric_features = ["X314","X315","X118", "X122", "X123", "X124", "X125", "X126", "X127","X128", "X47","X48", "X279", "X232", "X261", "X29"]
candidate_features_list = openfe.get_candidate_features(numerical_features=numeric_features, categorical_features=["X0"], order=1)
ofe = openfe.OpenFE()
features = ofe.fit(data=X_train, label=y_train, n_jobs=os.cpu_count(), task=task, n_data_blocks=8,
candidate_features_list=candidate_features_list,
stage2_params={"verbose": -1},
verbose=True, tmp_save_path=f'./openfe_tmp_data_{self.dataset_name}.feather')
X_train_new, X_test_new = openfe.transform(X_train, X_test, features, n_jobs=os.cpu_count())
is_combined = [f.name=='Combine' for f in features]
if sum(is_combined)>0:
self.cat_indices += list(np.where([f.name=='Combine' for f in features])[0]+X_train.shape[1])
self.X_train, self.X_test = X_train_new, X_test_new
os.makedirs(f'./datasets/{self.dataset_name}/processed/', exist_ok=True)
pickle.dump(self.X_train, open(f'./datasets/{self.dataset_name}/processed/X_train_openfe.pickle', 'wb'))
pickle.dump(self.X_test, open(f'./datasets/{self.dataset_name}/processed/X_test_openfe.pickle', 'wb'))
pickle.dump(self.cat_indices, open(f'./datasets/{self.dataset_name}/processed/cat_indices_openfe.pickle', 'wb'))
else:
print(f"Load existing openFE-preprocessed data")
X_train = pickle.load(open(f'./datasets/{self.dataset_name}/processed/X_train_openfe.pickle', 'rb'))
X_test = pickle.load(open(f'./datasets/{self.dataset_name}/processed/X_test_openfe.pickle', 'rb'))
self.cat_indices = pickle.load(open(f'./datasets/{self.dataset_name}/processed/cat_indices_openfe.pickle', 'rb'))
self.X_train, self.X_test = X_train, X_test
self.preprocess_states.append("openfe")
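# Hedged sketch (not implemented in the original class): the expert_preprocessing docstring
# above describes a shuffled 5-fold KFold and a stacked ensemble of GradientBoostingRegressor,
# RandomForestRegressor, and SVR. The sketch below assumes the one-hot-encoded expert features
# (cat_method=None); the hyperparameters and the Ridge meta-learner are illustrative
# assumptions, not the expert's settings.
def _example_mercedes_stacked_ensemble(X_train, y_train, seed=42):
    from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, StackingRegressor
    from sklearn.linear_model import Ridge
    from sklearn.model_selection import KFold, cross_val_score
    from sklearn.svm import SVR

    cv = KFold(n_splits=5, shuffle=True, random_state=seed)
    stack = StackingRegressor(
        estimators=[
            ("gbr", GradientBoostingRegressor(random_state=seed)),
            ("rf", RandomForestRegressor(n_estimators=300, random_state=seed)),
            ("svr", SVR(kernel="rbf", C=1.0)),
        ],
        final_estimator=Ridge(),  # meta-learner choice is an assumption
        cv=cv,
    )
    scores = cross_val_score(stack, X_train, y_train, scoring="r2", cv=cv)
    print(f"5-fold CV R2: {scores.mean():.4f} +/- {scores.std():.4f}")
    return stack.fit(X_train, y_train)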
##########################################################
##########################################################
##########################################################
# TODO: Correctly implement m5-forecasting-accuracy
class M5ForecastDataset(BaseDataset):
def __init__(self, toy_example=False):
super().__init__(toy_example)
self.dataset_name = "m5-forecasting-accuracy"
############## 0. Define Data Parameters ##############
self.task_type = "regression"
self.eval_metric_name = "rmsse"
self.eval_metric, self.eval_metric_direction = get_metric(self.eval_metric_name)
self.cat_indices = []
self.y_col = "sales"
self.large_dataset = False
def load_data(self):
'''
1. Load the three data tables 'sales_train_evaluation', 'sell_prices', and 'calendar'
2. Unpivot the main table to a vertical grid - each day becomes a separate row instead of a column
3. Add placeholder test rows to the grid to be able to make predictions
4. Add information from prices_df
- group prices_df by store_id and item_id to find the week in which an item was first sold in a specific store (wm_yr_wk)
- the result is a temporary dataframe release_df
- release_df is merged with the grid_df
- all rows that report sales of an item in a store before the item was released in that store are dropped, as they are all 0
5. Create 10 new features based on the sell price
6. Merge 9 of the 14 columns of the *calendar_df* into the *grid_df*
7. Convert columns to category or datetime where necessary
8. Create datetime features
'''
### Load data
train_df = pd.read_csv(f'./datasets/{self.dataset_name}/raw/sales_train_evaluation.csv', engine="pyarrow")
prices_df = pd.read_csv(f'./datasets/{self.dataset_name}/raw/sell_prices.csv', engine="pyarrow")
calendar_df = pd.read_csv(f'./datasets/{self.dataset_name}/raw/calendar.csv', engine="pyarrow")
TARGET = 'sales' # Our main target
END_TRAIN = 1941 # Last day in train set
MAIN_INDEX = ['id','d'] # We can identify item by these columns
### Unpivot data / transform to vertical view
index_columns = ['id','item_id','dept_id','cat_id','store_id','state_id']
if not os.path.exists(f'./datasets/{self.dataset_name}/processed/grid_part_1.pkl'):
grid_df = pd.melt(train_df,
id_vars = index_columns,
var_name = 'd',
value_name = TARGET)
### create a temporary dataframe for each test day and concatenate them into one dataframe
# To be able to make predictions
# we need to add "test set" to our grid
add_grid = pd.DataFrame()
for i in range(1,29):
temp_df = train_df[index_columns]
temp_df = temp_df.drop_duplicates()
temp_df['d'] = 'd_'+ str(END_TRAIN+i)
temp_df[TARGET] = np.nan
add_grid = pd.concat([add_grid,temp_df])
grid_df = pd.concat([grid_df,add_grid])
grid_df = grid_df.reset_index(drop=True)
# Remove some temporary DFs
del temp_df, add_grid
# We will not need original train_df
# anymore and can remove it
del train_df
# You don't have to use the df = df construction;
# you can use inplace=True instead,
# like this:
# grid_df.reset_index(drop=True, inplace=True)
# Let's check our memory usage
print("{:>20}: {:>8}".format('Original grid_df',sizeof_fmt(grid_df.memory_usage(index=True).sum())))
# We can free some memory
# by converting "strings" to categorical
# it will not affect merging and
# we will not lose any valuable data
for col in index_columns:
grid_df[col] = grid_df[col].astype('category')
# Let's check again memory usage
print("{:>20}: {:>8}".format('Reduced grid_df',sizeof_fmt(grid_df.memory_usage(index=True).sum())))
### group the *prices_df* by *store_id* and *item_id* to find the week when an item was first sold in a specific store (*wm_yr_wk*)
########################### Product Release date #################################################################################
print('Release week')
# It seems that leading zero values
# in each train_df item row
# are not real 0 sales but rather mean
# absence of the item in the store;
# we can save some memory by removing
# such zeros
# Prices are set by week,
# so the release week will not be very accurate
release_df = prices_df.groupby(['store_id','item_id'])['wm_yr_wk'].agg(['min']).reset_index()
release_df.columns = ['store_id','item_id','release']
# Now we can merge release_df
grid_df = merge_by_concat(grid_df, release_df, ['store_id','item_id'])
del release_df
# We want to remove some "zero" rows
# from grid_df;
# to do that we need the wm_yr_wk column,
# so let's partly merge calendar_df to get it
grid_df = merge_by_concat(grid_df, calendar_df[['wm_yr_wk','d']], ['d'])
# Now we can cut off some rows
# and save memory
grid_df = grid_df[grid_df['wm_yr_wk']>=grid_df['release']]
grid_df = grid_df.reset_index(drop=True)
# Let's check our memory usage
print("{:>20}: {:>8}".format('Original grid_df',sizeof_fmt(grid_df.memory_usage(index=True).sum())))
# Should we keep release week
# as one of the features?
# Only good CV can give the answer.
# Let's minify the release values.
# The min transformation will not save more memory here,
# as int16 covers -32768 to 32767
# and grid_df['release'].max() already fits in int16,
# but it illustrates how to transform
# other columns in case we need it
grid_df['release'] = grid_df['release'] - grid_df['release'].min()
grid_df['release'] = grid_df['release'].astype(np.int16)
# Let's check again memory usage
print("{:>20}: {:>8}".format('Reduced grid_df',sizeof_fmt(grid_df.memory_usage(index=True).sum())))
########################### Save part 1
#################################################################################
print('Save Part 1')
# We have our BASE grid ready
# and can save it as pickle file
# for future use (model training)
grid_df.to_pickle(f'./datasets/{self.dataset_name}/processed/grid_part_1.pkl')
else:
grid_df = pd.read_pickle(f'./datasets/{self.dataset_name}/processed/grid_part_1.pkl')
print('Size:', grid_df.shape)
if not os.path.exists(f'./datasets/{self.dataset_name}/processed/grid_part_2.pkl'):
### Create ten new features based on the sell price
print('Prices')
# We can do some basic aggregations
prices_df['price_max'] = prices_df.groupby(['store_id','item_id'])['sell_price'].transform('max')
prices_df['price_min'] = prices_df.groupby(['store_id','item_id'])['sell_price'].transform('min')
prices_df['price_std'] = prices_df.groupby(['store_id','item_id'])['sell_price'].transform('std')
prices_df['price_mean'] = prices_df.groupby(['store_id','item_id'])['sell_price'].transform('mean')
# and do price normalization (min/max scaling)
prices_df['price_norm'] = prices_df['sell_price']/prices_df['price_max']
# Some items can be inflation dependent
# and some items are very "stable"
prices_df['price_nunique'] = prices_df.groupby(['store_id','item_id'])['sell_price'].transform('nunique')
prices_df['item_nunique'] = prices_df.groupby(['store_id','sell_price'])['item_id'].transform('nunique')
# I would like some "rolling" aggregations,
# but with months and years as the "window"
calendar_prices = calendar_df[['wm_yr_wk','month','year']]
calendar_prices = calendar_prices.drop_duplicates(subset=['wm_yr_wk']) # distinct(.keep_all = True)
prices_df = prices_df.merge(calendar_prices[['wm_yr_wk','month','year']], on=['wm_yr_wk'], how='left')
del calendar_prices
# Now we can add price "momentum" (some sort of)
# Shifted by week
# by month mean
# by year mean
prices_df['price_momentum'] = prices_df['sell_price']/prices_df.groupby(['store_id','item_id'])['sell_price'].transform(lambda x: x.shift(1))
prices_df['price_momentum_m'] = prices_df['sell_price']/prices_df.groupby(['store_id','item_id','month'])['sell_price'].transform('mean')
prices_df['price_momentum_y'] = prices_df['sell_price']/prices_df.groupby(['store_id','item_id','year'])['sell_price'].transform('mean')
del prices_df['month'], prices_df['year']
grid_df = reduce_mem_usage(grid_df)
prices_df = reduce_mem_usage(prices_df)
########################### Merge prices and save part 2
#################################################################################
print('Merge prices and save part 2')
# Merge Prices
original_columns = list(grid_df)
grid_df = grid_df.merge(prices_df, on=['store_id','item_id','wm_yr_wk'], how='left')
keep_columns = [col for col in list(grid_df) if col not in original_columns]
grid_df = grid_df[MAIN_INDEX+keep_columns]
grid_df = reduce_mem_usage(grid_df)
# Save part 2
grid_df.to_pickle(f'./datasets/{self.dataset_name}/processed/grid_part_2.pkl')
print('Size:', grid_df.shape)
else:
grid_df = pd.read_pickle(f'./datasets/{self.dataset_name}/processed/grid_part_2.pkl')
# We don't need prices_df anymore
del prices_df
if not os.path.exists(f'./datasets/{self.dataset_name}/processed/grid_part_3.pkl'):
### Merge 9 of the 14 columns of the *calendar_df* into the *grid_df*
grid_df = grid_df[MAIN_INDEX]
# Merge calendar partly
icols = ['date',
'd',
'event_name_1',
'event_type_1',
'event_name_2',
'event_type_2',
'snap_CA',
'snap_TX',
'snap_WI']
grid_df = grid_df.merge(calendar_df[icols], on=['d'], how='left')
### The 7 event/snap columns are converted to the category dtype. The *date* column is converted to the datetime format.
# Minify data
# 'snap_' columns we can convert to bool or int8
icols = ['event_name_1',
'event_type_1',
'event_name_2',
'event_type_2',
'snap_CA',
'snap_TX',
'snap_WI']
for col in icols:
grid_df[col] = grid_df[col].astype('category')
# Convert to DateTime
grid_df['date'] = pd.to_datetime(grid_df['date'])
### Create datetime features
# Make some features from date
grid_df['tm_d'] = grid_df['date'].dt.day.astype(np.int8)
grid_df['tm_w'] = grid_df['date'].dt.isocalendar().week.astype(np.int8)
grid_df['tm_m'] = grid_df['date'].dt.month.astype(np.int8)
grid_df['tm_y'] = grid_df['date'].dt.year
grid_df['tm_y'] = (grid_df['tm_y'] - grid_df['tm_y'].min()).astype(np.int8)
grid_df['tm_wm'] = grid_df['tm_d'].apply(lambda x: ceil(x/7)).astype(np.int8) # which week of the month is it?
grid_df['tm_dw'] = grid_df['date'].dt.dayofweek.astype(np.int8)
grid_df['tm_w_end'] = (grid_df['tm_dw']>=5).astype(np.int8)
# Remove date
del grid_df['date']
########################### Save part 3 (Dates)
#################################################################################
print('Save part 3')
# Save part 3
grid_df.to_pickle(f'./datasets/{self.dataset_name}/processed/grid_part_3.pkl')
else:
grid_df = pd.read_pickle(f'./datasets/{self.dataset_name}/processed/grid_part_3.pkl')
print('Size:', grid_df.shape)
# We don't need calendar_df anymore
del calendar_df
del grid_df
SHIFT_DAY = 28
if not os.path.exists(f'./datasets/{self.dataset_name}/processed/lags_df_'+str(SHIFT_DAY)+'.pkl') or self.toy_example:
### Convert the day column from string to int, e.g. 'd_1' -> 1 - some additional cleaning
## Part 1
# Convert 'd' to int
grid_df = pd.read_pickle(f'./datasets/{self.dataset_name}/processed/grid_part_1.pkl')
grid_df['d'] = grid_df['d'].apply(lambda x: x[2:]).astype(np.int16)
### Create lag features
# Remove 'wm_yr_wk'
# as test values are not in train set
del grid_df['wm_yr_wk']
grid_df.to_pickle(f'./datasets/{self.dataset_name}/processed/grid_part_1.pkl')
del grid_df
grid_df = pd.read_pickle(f'./datasets/{self.dataset_name}/processed/grid_part_1.pkl')
# We need only 'id','d','sales'
# to make lags and rollings
grid_df = grid_df[['id','d','sales']]
# Lags
# with 28 day shift
print('Create lags')
LAG_DAYS = [col for col in range(SHIFT_DAY,SHIFT_DAY+15)]
grid_df = grid_df.assign(**{
'{}_lag_{}'.format(col, l): grid_df.groupby(['id'])[col].transform(lambda x: x.shift(l))
for l in LAG_DAYS
for col in [TARGET]
})
# Minify lag columns
for col in list(grid_df):
if 'lag' in col:
grid_df[col] = grid_df[col].astype(np.float16)