Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added predictors and their serializers #13

Merged
merged 13 commits into from
Jul 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/sdk.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ jobs:
uses: actions/setup-python@v3
with:
python-version: "3.10"
- name: Set PYTHONPATH
run: echo "PYTHONPATH=$PWD/src/prsdk" >> $GITHUB_ENV
- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand Down
2 changes: 1 addition & 1 deletion .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,6 @@ suggestion-mode=yes
disable=

# Default set of "always good" names
good-names=_,X_train,X_test
good-names=_,X_train,X_test,X,X_val,X_train_scaled,X_test_scaled,X_val_scaled

recursive=y
7 changes: 6 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
coverage==7.6.0
flake8==7.1.0
huggingface_hub==0.24.3
joblib==1.2.0
numpy==1.23.5
pandas==1.5.3
pylint==3.2.6
pylint==3.2.6
scikit-learn==1.2.2
tensorboard==2.13.0
torch==2.3.1
14 changes: 14 additions & 0 deletions src/prsdk/data/cao_mapping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
"""
Immutable NamedTuple for storing the context, actions, and outcomes for a given project.
Note: We choose to use NamedTuple over dataclasses because NamedTuple is immutable.
"""
from typing import NamedTuple


class CAOMapping(NamedTuple):
    """
    Class defining the context, actions, and outcomes for a given project.

    Instances are tuples, so the three fields cannot be reassigned after
    construction. Note that this immutability is shallow: each field is a
    plain list, and the lists themselves can still be mutated in place.
    Treat them as read-only.
    """
    # Names of the context columns.
    context: list[str]
    # Names of the action columns.
    actions: list[str]
    # Names of the outcome columns.
    outcomes: list[str]
27 changes: 27 additions & 0 deletions src/prsdk/data/torch_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""
A simple custom PyTorch dataset is created here. This is used to keep our
datasets standard between models. It is used in both Torch prescription
and Neural Network training.
"""
import numpy as np
import torch
from torch.utils.data.dataset import Dataset


class TorchDataset(Dataset):
    """
    Simple custom torch dataset.
    :param X: data, converted to a float32 tensor.
    :param y: labels, converted to a tensor whose dtype is inferred by torch.
    :param device: device both tensors are created on. Defaults to "cpu".
    :raises ValueError: if X and y do not have the same length.
    """
    def __init__(self, X: np.ndarray, y: np.ndarray, device="cpu"):
        super().__init__()
        self.X = torch.tensor(X, dtype=torch.float32, device=device)
        self.y = torch.tensor(y, device=device)
        # Raise explicitly rather than assert: assert statements are removed
        # when Python runs with -O, which would let mismatched data through.
        if len(self.X) != len(self.y):
            raise ValueError("X and y must have the same length")

    def __len__(self):
        """
        Number of samples, i.e. the length of the first dimension of X.
        """
        return len(self.X)

    def __getitem__(self, idx: int) -> tuple:
        """
        Returns the (data, label) pair stored at position idx.
        """
        return self.X[idx], self.y[idx]
1 change: 0 additions & 1 deletion src/prsdk/persistence/persistors/hf_persistor.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,6 @@ def from_pretrained(self, path_or_url: str, **hf_args):

url_path = path_or_url.replace("/", "--")
local_dir = hf_args.get("local_dir", f"~/.cache/huggingface/project-resilience/{url_path}")

if not Path(local_dir).exists() or not Path(local_dir).is_dir():
hf_args["local_dir"] = local_dir
snapshot_download(repo_id=path_or_url, **hf_args)
Expand Down
83 changes: 83 additions & 0 deletions src/prsdk/persistence/serializers/neural_network_serializer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
"""
Serializer for the Neural Network Predictor class.
"""
import json
from pathlib import Path

import joblib
import torch

from data.cao_mapping import CAOMapping
from persistence.serializers.serializer import Serializer
from predictors.neural_network.torch_neural_net import TorchNeuralNet
from predictors.neural_network.neural_net_predictor import NeuralNetPredictor


class NeuralNetSerializer(Serializer):
    """
    Serializer for the NeuralNetPredictor.
    Saves config necessary to recreate the model, the model itself, and the scaler for the data to a folder.
    The folder contains three files: config.json, model.pt (the torch state dict) and scaler.joblib.
    """
    def save(self, model: NeuralNetPredictor, path: Path):
        """
        Saves model, config, and scaler into format for loading.
        Generates path to folder if it does not exist.
        NOTE: The underlying torch model is moved to the CPU before saving and is left there afterwards.
        :param model: the neural network predictor to save.
        :param path: path to folder to save model files.
        :raises ValueError: if the predictor has no underlying model yet.
        """
        if model.model is None:
            raise ValueError("Model not fitted yet.")
        # Accept plain strings as well as Path objects.
        save_path = Path(path)
        save_path.mkdir(parents=True, exist_ok=True)

        # Note: we don't save the model's device, as it's not guaranteed to be available on load
        config = {
            # The context, actions, and outcomes are stored alongside the hyperparameters
            # so that the CAO mapping can be rebuilt on load.
            "context": model.cao.context,
            "actions": model.cao.actions,
            "outcomes": model.cao.outcomes,
            "features": model.features,
            "label": model.label,
            "hidden_sizes": model.hidden_sizes,
            "linear_skip": model.linear_skip,
            "dropout": model.dropout,
            "epochs": model.epochs,
            "batch_size": model.batch_size,
            "optim_params": model.optim_params,
            "train_pct": model.train_pct,
            "step_lr_params": model.step_lr_params,
        }
        with open(save_path / "config.json", "w", encoding="utf-8") as file:
            json.dump(config, file)

        # Put model on CPU before saving so that loading does not error when the
        # hardware changes between save and load (e.g. going from M1 to NVIDIA).
        model.model.to("cpu")
        torch.save(model.model.state_dict(), save_path / "model.pt")
        joblib.dump(model.scaler, save_path / "scaler.joblib")

    def load(self, path: Path) -> NeuralNetPredictor:
        """
        Loads a model from a given folder. Creates empty model with config, then loads model state dict and scaler.
        NOTE: We don't put the model back on the device it was trained on. This has to be done manually.
        :param path: path to folder containing model files.
        :return: predictor with its model (in eval mode) and scaler restored.
        :raises FileNotFoundError: if the folder or any of the expected files is missing.
        """
        # Accept plain strings as well as Path objects.
        load_path = Path(path)
        if not load_path.exists() or not load_path.is_dir():
            raise FileNotFoundError(f"Path {path} does not exist.")
        required_files = ("config.json", "model.pt", "scaler.joblib")
        if not all((load_path / name).exists() for name in required_files):
            raise FileNotFoundError("Model files not found in path.")

        # Initialize model with config
        with open(load_path / "config.json", "r", encoding="utf-8") as file:
            config = json.load(file)
        # Grab CAO out of config; popping leaves only the predictor's own settings behind
        cao = CAOMapping(config.pop("context"), config.pop("actions"), config.pop("outcomes"))
        nnp = NeuralNetPredictor(cao, config)

        nnp.model = TorchNeuralNet(len(config["features"]),
                                   config["hidden_sizes"],
                                   config["linear_skip"],
                                   config["dropout"])
        # Set map_location to CPU to avoid issues with GPU availability. Not strictly needed
        # for files written by save(), which moves the model to CPU first, but it keeps state
        # dicts that were saved while on another device loadable.
        # weights_only=True restricts unpickling to tensors and plain containers, which is all
        # a state dict holds.
        state_dict = torch.load(load_path / "model.pt", map_location="cpu", weights_only=True)
        nnp.model.load_state_dict(state_dict)
        nnp.model.eval()
        # NOTE(review): joblib.load unpickles arbitrary objects; only load folders from trusted sources.
        nnp.scaler = joblib.load(load_path / "scaler.joblib")
        return nnp
54 changes: 54 additions & 0 deletions src/prsdk/persistence/serializers/sklearn_serializer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
"""
Serializer for the SKLearnPredictor class.
"""
import json
from pathlib import Path

import joblib

from data.cao_mapping import CAOMapping
from persistence.serializers.serializer import Serializer
from predictors.sklearn_predictors.sklearn_predictor import SKLearnPredictor


class SKLearnSerializer(Serializer):
    """
    Serializer for the SKLearnPredictor.
    Uses joblib to save the model and json to save the config used to load it.
    The folder contains two files: config.json and model.joblib.
    """
    def save(self, model: SKLearnPredictor, path: Path):
        """
        Saves model and config into format for loading.
        Generates path to folder if it does not exist.
        :param model: the sklearn predictor to save.
        :param path: path to folder to save model files.
        """
        # Accept plain strings as well as Path objects, matching load().
        save_path = Path(path)
        save_path.mkdir(parents=True, exist_ok=True)

        # Add CAO to the config. A copy is made so the predictor's own config is left untouched.
        config = dict(model.config.items())
        cao_dict = {"context": model.cao.context, "actions": model.cao.actions, "outcomes": model.cao.outcomes}
        config.update(cao_dict)

        with open(save_path / "config.json", "w", encoding="utf-8") as file:
            json.dump(config, file)
        joblib.dump(model.model, save_path / "model.joblib")

    def load(self, path: Path) -> "SKLearnPredictor":
        """
        Loads saved model and config from a local folder.
        :param path: path to folder to load model files from.
        :return: predictor rebuilt from the saved CAO mapping, model and config.
        :raises FileNotFoundError: if the folder or any of the expected files is missing.
        """
        load_path = Path(path)
        if not load_path.exists() or not load_path.is_dir():
            raise FileNotFoundError(f"Path {path} does not exist.")
        if not (load_path / "config.json").exists() or not (load_path / "model.joblib").exists():
            raise FileNotFoundError("Model files not found in path.")

        # Extract CAO from config; popping leaves only the predictor's own settings behind
        with open(load_path / "config.json", "r", encoding="utf-8") as file:
            config = json.load(file)
        cao = CAOMapping(config.pop("context"), config.pop("actions"), config.pop("outcomes"))

        # NOTE(review): joblib.load unpickles arbitrary objects; only load folders from trusted sources.
        model = joblib.load(load_path / "model.joblib")
        sklearn_predictor = SKLearnPredictor(cao, model, config)
        return sklearn_predictor
Loading
Loading