# experiments.py
import mlflow
from mlflow.models import infer_signature
import pandas as pd
from datetime import datetime
from src.prepocessing import preprocessing_df, get_x_y, shifting
from sklearn.preprocessing import StandardScaler
from src.cv import TimeSeriesSlicingCV
from src.utils import get_dynamic_test_set_splitter
from sklearn.feature_selection import RFECV
import xgboost as xgb
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
import matplotlib.pyplot as plt
## Set the date range (weekly, anchored on Mondays)
PROVIDED_START_DATE = datetime.strptime("2020-01-04", '%Y-%m-%d')
PROVIDED_END_DATE = datetime.today().strftime('%Y-%m-%d')
generated_date_range = pd.date_range(start=PROVIDED_START_DATE, end=PROVIDED_END_DATE, freq='W-MON')
## Read the input and target series file
sample_source_file = "data/source.xlsx"  # placeholder: set this to the provided source file path
n_row_start = 3
df = pd.read_excel(sample_source_file, skiprows=n_row_start)
mapper = pd.read_excel(sample_source_file, sheet_name="mapping")
mapper = mapper.rename(columns={"series_id": "Input Series"})
mapper['Input Series'] = mapper['Input Series'].astype(int)
## Clean up the column names
df_testing = df.copy()
df_testing.columns = df_testing.columns.astype(str)
df_clean = df_testing.rename(columns=lambda x: x.split(' (')[0])
df_clean = df_clean.rename(columns={"Series ID":"date"})
## Get the latest input date
latest_date = df_clean["date"].max()
## Build a complete weekly index and reindex onto it
df_completed_date = pd.date_range(start=PROVIDED_START_DATE, end=latest_date, freq='W-MON')
df_testing = df_clean.set_index("date", drop=True)
df_testing = df_testing.reindex(df_completed_date)
## Count missing values per column to find the range where every column has data
isna_value_count_by_cols = df_testing.isna().sum()
##################### FEATURE ENGINEERING ############################
## Drop series whose missing-value count reaches the threshold (100 timepoints)
series_to_be_drop = isna_value_count_by_cols[isna_value_count_by_cols >= 100]
dropped_series_list = series_to_be_drop.index.to_list()  # column labels of the series to drop
dropped_series_df = series_to_be_drop.to_frame(name='missing_timepoint_count')
dropped_series_df = dropped_series_df.reset_index(drop=False, names="Input Series")
dropped_series_df['Input Series'] = dropped_series_df['Input Series'].astype(int)
print(f"Series exceeding the missing-value threshold: {dropped_series_list}")
cleaned_df = df_testing.drop(columns=dropped_series_list)
## Optional: count missing values per row to find the latest fully usable timepoint
isna_count_by_row = cleaned_df.isna().sum(axis=1)
# Split into target and input series
target_series, input_series = get_x_y(cleaned_df)
# Lag the input series by one week so features at week t come from week t-1
shifted_input_df = input_series.shift(periods=1, freq='W-MON')
# Out-of-sample (OOS) mask split
X_train, X_test, y_train = preprocessing_df(shifted_input_df, target_series)
# Fill gaps in the series by linear interpolation
interpolated_X_train = X_train.interpolate(method="linear", limit_direction="forward")
interpolated_y_train = y_train.interpolate(method="linear", limit_direction="forward")
interpolated_X_test = X_test.interpolate(method="linear", limit_direction="backward")
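# Note on direction (an assumption about intent): 'forward' leaves any leading
# gaps in the training data unfilled, while 'backward' fills leading gaps in
# the test inputs so the most recent out-of-sample rows are complete.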
# CV
cv_config = {
"initial_window": 40,
"horizon": 6,
"fixed_window": True,
"block_size": 1,
}
cv = TimeSeriesSlicingCV(**cv_config)
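# Assumed semantics of the project-local TimeSeriesSlicingCV, going by its
# parameter names: each fold trains on a fixed 40-week window, tests on the
# next 6 weeks, and slides forward one block (week) per fold.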
# Define model and hyperparameters
model_param = {}
model = xgb.XGBRegressor(**model_param)
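# RFECV is imported above but never used; a minimal sketch of how recursive
# feature elimination could be wired in, assuming the custom splitter is
# sklearn-compatible (left commented out so behaviour is unchanged):
# rfecv = RFECV(estimator=xgb.XGBRegressor(), cv=cv, scoring="neg_mean_squared_error")
# rfecv.fit(interpolated_X_train, interpolated_y_train)
# interpolated_X_train = interpolated_X_train.loc[:, rfecv.support_]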
split_structure = get_dynamic_test_set_splitter(cv, interpolated_X_train, interpolated_y_train)
y_prediction = []
mse_records = []
mape_records = []
for window_id, (train_index, test_index) in split_structure:
    # Fold-local slices (renamed so the outer OOS split is not overwritten)
    X_tr = interpolated_X_train.iloc[train_index]
    y_tr = interpolated_y_train.iloc[train_index]
    X_te = interpolated_X_train.iloc[test_index]
    y_te = interpolated_y_train.iloc[test_index]
    model_processor = make_pipeline(StandardScaler(), model)
    model_processor.fit(X_tr, y_tr)
    prediction = model_processor.predict(X_te)
    y_pred = pd.Series(prediction, index=X_te.index)
    y_prediction.append(y_pred)
    # Calculate metrics for this fold, keyed by the first test date
    mse = mean_squared_error(y_te, y_pred)
    mape = mean_absolute_percentage_error(y_te, y_pred)
    mse_records.append(pd.Series(mse, index=[y_te.index[0]]))
    mape_records.append(pd.Series(mape, index=[y_te.index[0]]))
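# Aggregate the per-fold metrics and visualize the out-of-fold predictions
# using the matplotlib import above; the variable names here are illustrative.
avg_mse = float(pd.concat(mse_records).mean())
avg_mape = float(pd.concat(mape_records).mean())
fig, ax = plt.subplots(figsize=(10, 4))
interpolated_y_train.plot(ax=ax, label="actual")
pd.concat(y_prediction).sort_index().plot(ax=ax, label="predicted", alpha=0.7)
ax.set_title("Out-of-fold predictions vs actuals")
ax.legend()
fig.savefig("oof_predictions.png")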
#######################################################################################
# Set our tracking server uri for logging
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")
# Create a new MLflow Experiment
mlflow.set_experiment("us_retail_sales")
# Start an MLflow run
with mlflow.start_run(run_name="LEO"):
    # Log the hyperparameters
    mlflow.log_params(model_param)
    # Log the loss metrics, averaged across the CV folds
    mlflow.log_metric("MSE", avg_mse)
    mlflow.log_metric("MAPE", avg_mape)
    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Training Info", "XGBRegressor for US Retail Sales")
    # Infer the model signature from the pipeline fitted on the final CV fold
    signature = infer_signature(interpolated_X_train, model_processor.predict(interpolated_X_train))
    # Log the fitted pipeline (scaler + regressor)
    model_info = mlflow.sklearn.log_model(
        sk_model=model_processor,
        artifact_path="us_retail_sales_items",
        signature=signature,
        input_example=interpolated_X_train,
        registered_model_name="US_RETAIL_SALES",
    )
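# Sanity check (illustrative): load the logged model back as a generic pyfunc
# and score the held-out OOS inputs.
loaded = mlflow.pyfunc.load_model(model_info.model_uri)
oos_prediction = loaded.predict(interpolated_X_test)
print(oos_prediction[:5])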