-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpredict.py
138 lines (116 loc) · 4.16 KB
/
predict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import json
import pickle
from typing import Tuple
from tensorflow import keras
import numpy as np
import pandas as pd
# Load in serialized model, config, and scaler
# These statements run once at import time and read "model", "config.json",
# and "scaler.pck" using paths relative to the current working directory.
model = keras.models.load_model("model")
with open("config.json", "r") as f:
    CONFIG = json.load(f)
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only use a scaler.pck that comes from a trusted source.
with open("scaler.pck", "rb") as f:
    scaler = pickle.load(f)
# Set global variables
# Number of trailing hourly rows passed to the model for each prediction
# (see the reshape in predict_dst); read from config.json.
TIMESTEPS = CONFIG["timesteps"]
# Solar wind columns selected from the raw data before hourly aggregation.
SOLAR_WIND_FEATURES = [
    "bt",
    "temperature",
    "bx_gse",
    "by_gse",
    "bz_gse",
    "speed",
    "density",
]

# Model input columns: every "<feature>_mean", then every "<feature>_std",
# followed by the sunspot column.
XCOLS = [
    f"{feature}_{stat}"
    for stat in ("mean", "std")
    for feature in SOLAR_WIND_FEATURES
]
XCOLS.append("smoothed_ssn")
# Define functions for preprocessing
def impute_features(feature_df):
    """Fill missing values in a feature frame without modifying the input.

    Imputes data using the following methods:
        - `smoothed_ssn`: forward fill
        - `solar_wind` (every remaining gap): interpolation

    Parameters
    ----------
    feature_df : pd.DataFrame
        Features to impute; must contain a `smoothed_ssn` column.

    Returns
    -------
    pd.DataFrame
        A new frame with the same index and columns as `feature_df`.
    """
    # Work on a copy so the caller's frame is not altered as a side effect.
    imputed = feature_df.copy()
    # forward fill sunspot data for the rest of the month
    # (`.ffill()` replaces the deprecated `fillna(method="ffill")` spelling)
    imputed["smoothed_ssn"] = imputed["smoothed_ssn"].ffill()
    # interpolate between missing solar wind values (pandas' default method)
    return imputed.interpolate()
def aggregate_hourly(feature_df, aggs=None):
    """Aggregate features to the floor of each hour.

    e.g. All values from "11:00:00" to "11:59:00" will be aggregated to "11:00:00".

    Parameters
    ----------
    feature_df : pd.DataFrame
        Features whose index supports `.floor()` (e.g. a timedelta index).
    aggs : list of str, optional
        Aggregations applied to every column. Defaults to
        ``["mean", "std"]``.

    Returns
    -------
    pd.DataFrame
        One row per hour, with flat column names of the form
        ``"<column>_<aggregation>"``.
    """
    # Build the default per call instead of sharing one mutable list object.
    if aggs is None:
        aggs = ["mean", "std"]
    # group by the floor of each hour; "h" is the hourly alias accepted by
    # current pandas (the upper-case "H" spelling is deprecated)
    hourly_key = feature_df.index.floor("h")
    agged = feature_df.groupby([hourly_key]).agg(aggs)
    # flatten hierarchical (column, aggregation) index into "column_aggregation"
    agged.columns = ["_".join(x) for x in agged.columns]
    return agged
def preprocess_features(solar_wind, sunspots, scaler=None, subset=None):
    """Turn raw solar wind and sunspot data into scaled, gap-free hourly features.

    Preprocessing steps:
        - Subset the data
        - Aggregate hourly
        - Join solar wind and sunspot data
        - Scale using standard scaler
        - Impute missing values

    Parameters
    ----------
    solar_wind : pd.DataFrame
        Raw solar wind observations.
    sunspots : pd.DataFrame
        Sunspot data joined onto the hourly solar wind aggregates by index.
    scaler : optional
        Object exposing `transform`; when omitted a new `StandardScaler` is
        created and fitted on the hourly features.
    subset : optional
        Solar wind columns to keep; a falsy value keeps every column.

    Returns
    -------
    tuple
        `(features, scaler)`; the scaler is returned so it can be reused
        later during prediction.
    """
    # select features we want to use
    wind = solar_wind[subset] if subset else solar_wind

    # aggregate solar wind data and join with sunspots
    hourly = aggregate_hourly(wind).join(sunspots)

    if scaler is None:
        # NOTE(review): `StandardScaler` is not imported anywhere in the
        # visible file, so this branch raises NameError unless the name is
        # provided elsewhere -- confirm and add the import if needed.
        scaler = StandardScaler()
        scaler.fit(hourly)

    # subtract mean and divide by standard deviation, keeping labels intact
    scaled_values = scaler.transform(hourly)
    normalized = pd.DataFrame(
        scaled_values, index=hourly.index, columns=hourly.columns
    )

    # impute missing values, and hand the scaler back alongside the features
    return impute_features(normalized), scaler
# THIS MUST BE DEFINED FOR YOUR SUBMISSION TO RUN
def predict_dst(
    solar_wind_7d: pd.DataFrame,
    satellite_positions_7d: pd.DataFrame,
    latest_sunspot_number: float,
) -> Tuple[float, float]:
    """
    Take all of the data up until time t-1, and then make predictions for
    times t and t+1.

    Parameters
    ----------
    solar_wind_7d: pd.DataFrame
        The last 7 days of satellite data up until (t - 1) minutes [exclusive of t]
    satellite_positions_7d: pd.DataFrame
        The last 7 days of satellite position data up until the present time [inclusive of t]
        (accepted for interface compatibility; not used by this model)
    latest_sunspot_number: float
        The latest monthly sunspot number (SSN) to be available

    Returns
    -------
    predictions : Tuple[float, float]
        A tuple of two predictions, for (t and t + 1 hour) respectively; these should
        be between -2,000 and 500.

    Raises
    ------
    ValueError
        If fewer than TIMESTEPS hourly rows are available after preprocessing.
    """
    # Value returned in place of a non-finite (NaN / inf) model output.
    fallback = -12.0

    # Re-format data to fit into our pipeline: repeat the single sunspot
    # number on every solar wind timestamp so it can be joined by index.
    sunspots = pd.DataFrame(
        {"smoothed_ssn": latest_sunspot_number}, index=solar_wind_7d.index
    )

    # Process our features; the returned scaler is the one passed in, so it
    # is discarded here.
    features, _ = preprocess_features(
        solar_wind_7d, sunspots, scaler=scaler, subset=SOLAR_WIND_FEATURES
    )

    # Keep the model's columns, in order, for the last TIMESTEPS hours.
    window = features[XCOLS].iloc[-TIMESTEPS:]
    if len(window) < TIMESTEPS:
        raise ValueError(
            f"need at least {TIMESTEPS} hourly rows, got {len(window)}"
        )
    # Size the feature axis from the selected columns rather than from the
    # unfiltered frame, so extra columns in `features` cannot break the reshape.
    model_input = window.values.reshape((1, TIMESTEPS, len(XCOLS)))

    # Make a prediction
    prediction_at_t0, prediction_at_t1 = model.predict(model_input)[0]

    # Optional check for unexpected values
    if not np.isfinite(prediction_at_t0):
        prediction_at_t0 = fallback
    if not np.isfinite(prediction_at_t1):
        prediction_at_t1 = fallback

    # Convert to built-in floats to match the annotated return type.
    return float(prediction_at_t0), float(prediction_at_t1)