-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Added predictors and their serializers #13
Changes from all commits
25e8f0e
ea77d71
408ae5a
a156a16
2cdca67
2abe978
2677243
ec589d2
6c679d1
6a871e0
1660c77
7d35fd0
418d676
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,10 @@ | ||
coverage==7.6.0 | ||
flake8==7.1.0 | ||
huggingface_hub==0.24.3 | ||
joblib==1.2.0 | ||
numpy==1.23.5 | ||
pandas==1.5.3 | ||
pylint==3.2.6 | ||
pylint==3.2.6 | ||
scikit-learn==1.2.2 | ||
tensorboard==2.13.0 | ||
torch==2.3.1 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
""" | ||
Immutable NamedTuple for storing the context, actions, and outcomes for a given project. | ||
Note: We choose NamedTuple over dataclasses because NamedTuple instances are immutable by default, whereas a dataclass must opt in with frozen=True. | ||
""" | ||
from typing import NamedTuple | ||
|
||
|
||
class CAOMapping(NamedTuple):
    """
    Class defining the context, actions, and outcomes for a given project.

    The tuple's fields cannot be reassigned after construction. Note that the
    lists stored in the fields are ordinary Python lists, so the immutability is
    shallow: treat the lists as read-only.
    """
    # Names that make up the project's context.
    context: list[str]
    # Names that make up the project's actions.
    actions: list[str]
    # Names that make up the project's outcomes.
    outcomes: list[str]
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
""" | ||
A simple custom PyTorch dataset is created here. This is used to keep our | ||
datasets standard between models. It is used in both Torch prescription | ||
and Neural Network training. | ||
""" | ||
import numpy as np | ||
import torch | ||
from torch.utils.data.dataset import Dataset | ||
|
||
|
||
class TorchDataset(Dataset):
    """
    Simple custom torch dataset that stores the data and labels as tensors.

    :param X: data; converted to a float32 tensor.
    :param y: labels; converted to a tensor whose dtype is inferred by torch.
    :param device: device the tensors are created on. Defaults to "cpu".
    :raises ValueError: if X and y do not have the same length.
    """
    def __init__(self, X: np.ndarray, y: np.ndarray, device="cpu"):
        super().__init__()
        self.X = torch.tensor(X, dtype=torch.float32, device=device)
        self.y = torch.tensor(y, device=device)
        # Raise explicitly instead of using assert: assert statements are removed
        # when Python runs with -O, which would silently skip this validation.
        if len(self.X) != len(self.y):
            raise ValueError("X and y must have the same length")

    def __len__(self) -> int:
        """Return the number of samples in the dataset."""
        return len(self.X)

    def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
        """Return the (data, label) pair stored at position idx."""
        return self.X[idx], self.y[idx]
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
""" | ||
Serializer for the Neural Network Predictor class. | ||
""" | ||
import json | ||
from pathlib import Path | ||
|
||
import joblib | ||
import torch | ||
|
||
from data.cao_mapping import CAOMapping | ||
from persistence.serializers.serializer import Serializer | ||
from predictors.neural_network.torch_neural_net import TorchNeuralNet | ||
from predictors.neural_network.neural_net_predictor import NeuralNetPredictor | ||
|
||
|
||
class NeuralNetSerializer(Serializer):
    """
    Serializer for the NeuralNetPredictor.
    Saves config necessary to recreate the model, the model itself, and the scaler for the data to a folder.
    """
    def save(self, model: NeuralNetPredictor, path: Path):
        """
        Saves model, config, and scaler into format for loading.
        Generates path to folder if it does not exist.
        Writes three files into the folder: config.json, model.pt, and scaler.joblib.
        :param model: the neural network predictor to save.
        :param path: path to folder to save model files.
        :raises ValueError: if the predictor's model is None (not fitted yet).
        """
        if model.model is None:
            raise ValueError("Model not fitted yet.")
        # Accept plain strings as well as Path objects, like SKLearnSerializer.load does.
        save_path = Path(path)
        save_path.mkdir(parents=True, exist_ok=True)

        # Note: we don't save the model's device, as it's not guaranteed to be available on load
        config = {
            "context": model.cao.context,
            "actions": model.cao.actions,
            "outcomes": model.cao.outcomes,
            "features": model.features,
            "label": model.label,
            "hidden_sizes": model.hidden_sizes,
            "linear_skip": model.linear_skip,
            "dropout": model.dropout,
            "epochs": model.epochs,
            "batch_size": model.batch_size,
            "optim_params": model.optim_params,
            "train_pct": model.train_pct,
            "step_lr_params": model.step_lr_params,
        }
        with open(save_path / "config.json", "w", encoding="utf-8") as file:
            json.dump(config, file)

        # Store the weights as CPU tensors so that we don't error if we go from M1 to NVIDIA
        # or change architectures like that. We copy the tensors in the state dict rather than
        # calling model.model.to("cpu"), so saving does not move the caller's model off its device.
        # state_dict() returns a new dict on each call, so rebinding its entries leaves the model as is.
        state_dict = model.model.state_dict()
        for name in list(state_dict):
            state_dict[name] = state_dict[name].cpu()
        torch.save(state_dict, save_path / "model.pt")
        joblib.dump(model.scaler, save_path / "scaler.joblib")

    def load(self, path: Path) -> NeuralNetPredictor:
        """
        Loads a model from a given folder. Creates empty model with config, then loads model state dict and scaler.
        NOTE: We don't put the model back on the device it was trained on. This has to be done manually.
        :param path: path to folder containing model files.
        :return: a NeuralNetPredictor whose model (set to eval mode) and scaler have been restored.
        :raises FileNotFoundError: if the folder or any of config.json, model.pt, scaler.joblib is missing.
        """
        load_path = Path(path)
        # is_dir() is False for a missing path too, so it covers both "missing" and "not a folder".
        if not load_path.is_dir():
            raise FileNotFoundError(f"Path {path} does not exist.")
        required_files = ("config.json", "model.pt", "scaler.joblib")
        if not all((load_path / name).exists() for name in required_files):
            raise FileNotFoundError("Model files not found in path.")

        # Initialize model with config
        with open(load_path / "config.json", "r", encoding="utf-8") as file:
            config = json.load(file)
        # We also reconstruct our CAO in loading. Popping removes the CAO keys so that
        # the remaining config only holds the predictor's own settings.
        cao = CAOMapping(config.pop("context"), config.pop("actions"), config.pop("outcomes"))
        nnp = NeuralNetPredictor(cao, config)

        nnp.model = TorchNeuralNet(len(config["features"]),
                                   config["hidden_sizes"],
                                   config["linear_skip"],
                                   config["dropout"])
        # We set the map location to CPU to avoid errors if we're loading from a state dict that was
        # saved while on a different device. This is technically not necessary because the weights are
        # stored on CPU on save, but helps with backward-compatibility.
        # weights_only=True restricts unpickling to tensors and plain containers, which is all a
        # state dict needs.
        state_dict = torch.load(load_path / "model.pt", map_location="cpu", weights_only=True)
        nnp.model.load_state_dict(state_dict)
        nnp.model.eval()
        # NOTE(review): joblib.load unpickles arbitrary objects; only load folders from trusted sources.
        nnp.scaler = joblib.load(load_path / "scaler.joblib")
        return nnp
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
""" | ||
Serializer for the SKLearnPredictor class. | ||
""" | ||
import json | ||
from pathlib import Path | ||
|
||
import joblib | ||
|
||
from data.cao_mapping import CAOMapping | ||
from persistence.serializers.serializer import Serializer | ||
from predictors.sklearn_predictors.sklearn_predictor import SKLearnPredictor | ||
|
||
|
||
class SKLearnSerializer(Serializer):
    """
    Serializer for the SKLearnPredictor.
    Uses joblib to save the model and json to save the config used to load it.
    """
    def save(self, model: SKLearnPredictor, path: Path):
        """
        Saves model and config into format for loading.
        Generates path to folder if it does not exist.
        Writes two files into the folder: config.json and model.joblib.
        :param model: the sklearn predictor to save.
        :param path: path to folder to save model files.
        """
        # Accept plain strings as well as Path objects, the same way load() does.
        save_path = Path(path)
        save_path.mkdir(parents=True, exist_ok=True)

        # Dump our CAO into the config so that load() can reconstruct it.
        # Work on a copy so the predictor's own config is not modified.
        config = dict(model.config)
        config.update({
            "context": model.cao.context,
            "actions": model.cao.actions,
            "outcomes": model.cao.outcomes,
        })

        with open(save_path / "config.json", "w", encoding="utf-8") as file:
            json.dump(config, file)
        joblib.dump(model.model, save_path / "model.joblib")

    def load(self, path: Path) -> SKLearnPredictor:
        """
        Loads saved model and config from a local folder.
        :param path: path to folder to load model files from.
        :return: an SKLearnPredictor built from the stored CAO, model, and config.
        :raises FileNotFoundError: if the folder or either of config.json, model.joblib is missing.
        """
        load_path = Path(path)
        # is_dir() is False for a missing path too, so it covers both "missing" and "not a folder".
        if not load_path.is_dir():
            raise FileNotFoundError(f"Path {path} does not exist.")
        required_files = ("config.json", "model.joblib")
        if not all((load_path / name).exists() for name in required_files):
            raise FileNotFoundError("Model files not found in path.")

        with open(load_path / "config.json", "r", encoding="utf-8") as file:
            config = json.load(file)
        # We reconstruct the CAO when loading. Popping removes the CAO keys so that
        # the remaining config matches what the predictor was originally given.
        cao = CAOMapping(config.pop("context"), config.pop("actions"), config.pop("outcomes"))

        # NOTE(review): joblib.load unpickles arbitrary objects; only load folders from trusted sources.
        model = joblib.load(load_path / "model.joblib")
        return SKLearnPredictor(cao, model, config)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We store the context, actions, and outcomes in our serialization now