-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain_flow.py
98 lines (80 loc) · 2.51 KB
/
train_flow.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
from prefect import flow, task
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import bentoml
@task(retries=5)
def load_data(data_path, random_state=None):
    """Load the transactions CSV and downsample it to balanced classes.

    Rows are split on the ``isFraud`` column (``0`` vs ``1``) and the larger
    class is randomly downsampled to the size of the smaller one. Rows whose
    ``isFraud`` value is neither ``0`` nor ``1`` are dropped.

    Args:
        data_path: Location of the CSV, passed straight to ``pandas.read_csv``.
        random_state: Optional seed forwarded to ``DataFrame.sample``. The
            default ``None`` leaves the draw unseeded.

    Returns:
        A DataFrame holding the non-fraud rows followed by the fraud rows,
        each row keeping its original index label.
    """
    df = pd.read_csv(data_path)

    # Evaluate the target column once instead of re-filtering per use.
    is_fraud = df["isFraud"]
    legit = df[is_fraud == 0]
    fraud = df[is_fraud == 1]

    # Cap at the smaller class: asking ``sample`` for more rows than exist
    # (without replacement) raises ValueError.
    n_per_class = min(len(legit), len(fraud))
    legit = legit.sample(n=n_per_class, random_state=random_state)
    if len(fraud) > n_per_class:
        fraud = fraud.sample(n=n_per_class, random_state=random_state)

    return pd.concat([legit, fraud], axis=0)
@task
def train_ohe(df):
    """Fit a one-hot encoder on the categorical feature columns of ``df``.

    Categories not seen during fitting are ignored at transform time
    (``handle_unknown="ignore"``).

    Args:
        df: DataFrame containing the categorical columns listed below.

    Returns:
        The fitted ``OneHotEncoder``.
    """
    categorical_columns = [
        "ProductCD",
        "P_emaildomain",
        "R_emaildomain",
        "card4",
        "M1",
        "M2",
        "M3",
    ]
    encoder = OneHotEncoder(handle_unknown="ignore")
    # ``fit`` returns the encoder itself, so the fitted instance is returned.
    return encoder.fit(df[categorical_columns])
@task
def split_data(df, enc):
    """Build the feature matrix and target, then split them into train/test.

    The categorical columns are one-hot encoded with ``enc`` and the numeric
    ``TransactionAmt`` column is appended unchanged. The target is
    ``isFraud``.

    Args:
        df: DataFrame holding the categorical columns, ``TransactionAmt`` and
            ``isFraud``.
        enc: Fitted encoder exposing ``transform`` and
            ``get_feature_names_out``.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` from an 80/20 split with a
        fixed ``random_state`` of 42.
    """
    categorical_columns = [
        "ProductCD",
        "P_emaildomain",
        "R_emaildomain",
        "card4",
        "M1",
        "M2",
        "M3",
    ]
    encoded = enc.transform(df[categorical_columns]).toarray()

    # Reuse df's index so X and y carry the same row labels; a fresh
    # RangeIndex on X would disagree with y's labels and silently misalign
    # any index-based operation on the returned pieces.
    X = pd.DataFrame(
        encoded,
        columns=enc.get_feature_names_out(),
        index=df.index,
    )
    # Positional assignment via a 1-D array, independent of index alignment.
    X["TransactionAmt"] = df["TransactionAmt"].to_numpy()
    y = df["isFraud"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    return X_train, X_test, y_train, y_test
@task
def train_xgb(X_train, y_train):
    """Fit an XGBoost classifier with a binary logistic objective.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        The fitted ``XGBClassifier``.
    """
    hyperparameters = {
        "n_estimators": 100,
        "learning_rate": 0.1,
        "max_depth": 3,
        "min_child_weight": 1,
        "gamma": 0,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "objective": "binary:logistic",
        "nthread": 4,
        "scale_pos_weight": 1,
        "seed": 27,
    }
    classifier = XGBClassifier(**hyperparameters)
    # ``fit`` hands back the estimator, which is what callers receive.
    return classifier.fit(X_train, y_train)
@task
def save_model(model, enc, version, stage):
    """Save the trained model with BentoML under the name ``fraud_classifier``.

    The encoder is stored alongside the model as the ``ohe_encoder`` custom
    object, and the resulting BentoML model reference is printed.

    Args:
        model: Fitted classifier to persist.
        enc: Fitted encoder to attach as a custom object.
        version: Value recorded (as a string) in the model metadata.
        stage: Value recorded (as a string) in the model labels.
    """
    model_labels = {"owner": "Cerebrium", "stage": f"{stage}"}
    model_metadata = {"version": f"{version}"}
    # ``predict`` is declared batchable along the first dimension.
    predict_signature = {"batchable": True, "batch_dim": 0}

    saved_model = bentoml.sklearn.save_model(
        "fraud_classifier",
        model,
        labels=model_labels,
        metadata=model_metadata,
        custom_objects={"ohe_encoder": enc},
        signatures={"predict": predict_signature},
    )
    print(saved_model)
@flow
def train_flow(data_path, version, stage):
    """Run the training pipeline end to end.

    Steps: load the data, fit the one-hot encoder, split into train/test,
    train the classifier on the training portion, and save the model together
    with the encoder. The test portion of the split is not used here.

    Args:
        data_path: Path of the training CSV, forwarded to ``load_data``.
        version: Version value forwarded to ``save_model``.
        stage: Stage value forwarded to ``save_model``.
    """
    transactions = load_data(data_path)
    encoder = train_ohe(transactions)
    X_train, _X_test, y_train, _y_test = split_data(transactions, encoder)
    classifier = train_xgb(X_train, y_train)
    save_model(classifier, encoder, version, stage)
if __name__ == "__main__":
    # Script entry point: train on the bundled transaction data and save the
    # result as version 1.0.1 with the "prod" stage label.
    train_flow(
        data_path="data/train_transaction.csv",
        version="1.0.1",
        stage="prod",
    )