-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain.py
66 lines (58 loc) · 1.76 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import bentoml
# Load the transactions and down-sample the non-fraud rows so that both
# target classes end up the same size.
df = pd.read_csv("data/train_transaction.csv")
fraud_rows = df[df.isFraud == 1]
legit_rows = df[df.isFraud == 0]
# Seed the draw so repeated runs train on the same rows. The train/test split
# further down is already seeded; an unseeded sample here would still make
# every run see different data.
df = pd.concat(
    [legit_rows.sample(n=len(fraud_rows), random_state=42), fraud_rows],
    axis=0,
)
# Categorical input columns and the binary target
categorical_columns = [
    "ProductCD",
    "P_emaildomain",
    "R_emaildomain",
    "card4",
    "M1",
    "M2",
    "M3",
]
X = df[categorical_columns]
y = df.isFraud
# One-hot encode the categorical columns; categories not seen during fitting
# are ignored at transform time (handle_unknown="ignore").
enc = OneHotEncoder(handle_unknown="ignore")
encoded_matrix = enc.fit(X).transform(X).toarray()
encoded_names = enc.get_feature_names_out().reshape(-1)
X = pd.DataFrame(encoded_matrix, columns=encoded_names)
# Append the raw transaction amount as a plain array so it is attached by
# position rather than aligned on df's index.
transaction_amounts = df[["TransactionAmt"]].to_numpy()
X["TransactionAmt"] = transaction_amounts
# Hold out 20% of the rows, then fit both classifiers on the remaining 80%.
# NOTE(review): X_test / y_test are not used anywhere in the visible script,
# so the models are saved without being evaluated on the held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
xgb = XGBClassifier()
xgb_model = xgb.fit(X_train, y_train)
# Seed the forest so the saved model is reproducible from run to run,
# matching the seeded split above.
rf = RandomForestClassifier(random_state=42)
rf_model = rf.fit(X_train, y_train)
# Save the models
def _save_fraud_classifier(name, model):
    """Save *model* under *name* via ``bentoml.sklearn.save_model``.

    Both classifiers are stored with the same labels, metadata and
    ``predict`` signature (batchable along dimension 0), and each carries the
    fitted one-hot encoder as the ``ohe_encoder`` custom object.

    Returns whatever ``bentoml.sklearn.save_model`` returns.
    """
    return bentoml.sklearn.save_model(
        name,
        model,
        labels={"owner": "Cerebrium", "stage": "prod"},
        metadata={"version": "1.0.0"},
        custom_objects={"ohe_encoder": enc},
        signatures={"predict": {"batchable": True, "batch_dim": 0}},
    )


xgb_save = _save_fraud_classifier("fraud_classifier_xgb", xgb_model)
rf_save = _save_fraud_classifier("fraud_classifier_rf", rf_model)