From a7ffdf7c827f616f396f2cb44f710a570be0167e Mon Sep 17 00:00:00 2001 From: Daniel Young Date: Tue, 30 Jul 2024 16:14:02 -0700 Subject: [PATCH] Removed references to CAO --- src/prsdk/data/cao_mapping.py | 14 -------------- .../serializers/neural_network_serializer.py | 8 +------- .../persistence/serializers/sklearn_serializer.py | 12 ++---------- .../neural_network/neural_net_predictor.py | 8 ++------ src/prsdk/predictors/predictor.py | 13 ++----------- .../linear_regression_predictor.py | 6 ++---- .../sklearn_predictors/random_forest_predictor.py | 6 ++---- .../sklearn_predictors/sklearn_predictor.py | 5 ++--- src/prsdk/prescriptors/prescriptor.py | 14 ++------------ tests/persistence/test_hf_persistence.py | 1 - tests/persistence/test_predictor_serialization.py | 6 ++---- tests/predictors/test_neural_net.py | 10 +++------- 12 files changed, 20 insertions(+), 83 deletions(-) delete mode 100644 src/prsdk/data/cao_mapping.py diff --git a/src/prsdk/data/cao_mapping.py b/src/prsdk/data/cao_mapping.py deleted file mode 100644 index 2709547..0000000 --- a/src/prsdk/data/cao_mapping.py +++ /dev/null @@ -1,14 +0,0 @@ -""" -Immutable NamedTuple for storing the context, actions, and outcomes for a given project. -Note: We choose to use NamedTuple over dataclasses because NamedTuple is immutable. -""" -from typing import NamedTuple - - -class CAOMapping(NamedTuple): - """ - Class defining the context, actions, and outcomes for a given project. - """ - context: list[str] - actions: list[str] - outcomes: list[str] diff --git a/src/prsdk/persistence/serializers/neural_network_serializer.py b/src/prsdk/persistence/serializers/neural_network_serializer.py index b39e03c..8ac0af9 100644 --- a/src/prsdk/persistence/serializers/neural_network_serializer.py +++ b/src/prsdk/persistence/serializers/neural_network_serializer.py @@ -7,7 +7,6 @@ import joblib import torch -from data.cao_mapping import CAOMapping from persistence.serializers.serializer import Serializer from predictors.neural_network.torch_neural_net import TorchNeuralNet from predictors.neural_network.neural_net_predictor import NeuralNetPredictor @@ -31,9 +30,6 @@ def save(self, model: NeuralNetPredictor, path: Path): # Note: we don't save the model's device, as it's not guaranteed to be available on load config = { - "context": model.cao.context, - "actions": model.cao.actions, - "outcomes": model.cao.outcomes, "features": model.features, "label": model.label, "hidden_sizes": model.hidden_sizes, @@ -68,9 +64,7 @@ def load(self, path: Path) -> NeuralNetPredictor: # Initialize model with config with open(path / "config.json", "r", encoding="utf-8") as file: config = json.load(file) - # Grab CAO out of config - cao = CAOMapping(config.pop("context"), config.pop("actions"), config.pop("outcomes")) - nnp = NeuralNetPredictor(cao, config) + nnp = NeuralNetPredictor(config) nnp.model = TorchNeuralNet(len(config["features"]), config["hidden_sizes"], diff --git a/src/prsdk/persistence/serializers/sklearn_serializer.py b/src/prsdk/persistence/serializers/sklearn_serializer.py index 8bccc6f..1ad01d8 100644 --- a/src/prsdk/persistence/serializers/sklearn_serializer.py +++ b/src/prsdk/persistence/serializers/sklearn_serializer.py @@ -6,7 +6,6 @@ import joblib -from data.cao_mapping import CAOMapping from persistence.serializers.serializer import Serializer from predictors.sklearn_predictors.sklearn_predictor import SKLearnPredictor @@ -24,13 +23,8 @@ def save(self, model: SKLearnPredictor, path: Path): """ path.mkdir(parents=True, exist_ok=True) - # Add CAO to the config - config = dict(model.config.items()) - cao_dict = {"context": model.cao.context, "actions": model.cao.actions, "outcomes": model.cao.outcomes} - config.update(cao_dict) - with open(path / "config.json", "w", encoding="utf-8") as file: - json.dump(config, file) + json.dump(model.config, file) joblib.dump(model.model, path / "model.joblib") def load(self, path: Path) -> "SKLearnPredictor": @@ -44,11 +38,9 @@ def load(self, path: Path) -> "SKLearnPredictor": if not (load_path / "config.json").exists() or not (load_path / "model.joblib").exists(): raise FileNotFoundError("Model files not found in path.") - # Extract CAO from config with open(load_path / "config.json", "r", encoding="utf-8") as file: config = json.load(file) - cao = CAOMapping(config.pop("context"), config.pop("actions"), config.pop("outcomes")) model = joblib.load(load_path / "model.joblib") - sklearn_predictor = SKLearnPredictor(cao, model, config) + sklearn_predictor = SKLearnPredictor(model, config) return sklearn_predictor diff --git a/src/prsdk/predictors/neural_network/neural_net_predictor.py b/src/prsdk/predictors/neural_network/neural_net_predictor.py index 8542ec6..159336f 100644 --- a/src/prsdk/predictors/neural_network/neural_net_predictor.py +++ b/src/prsdk/predictors/neural_network/neural_net_predictor.py @@ -14,7 +14,6 @@ from torch.utils.data import DataLoader from torch.utils.tensorboard import SummaryWriter -from data.cao_mapping import CAOMapping from data.torch_data import TorchDataset from predictors.predictor import Predictor from predictors.neural_network.torch_neural_net import TorchNeuralNet @@ -29,11 +28,8 @@ class NeuralNetPredictor(Predictor): Data is automatically standardized and the scaler is saved with the model. TODO: We want to be able to have custom scaling in the future. """ - def __init__(self, cao: CAOMapping, model_config: dict): + def __init__(self, model_config: dict): """ - :param context: list of context features. - :param actions: list of action features. - :param outcomes: list of outcomes to predict. :param model_config: dictionary of model configuration parameters. Model config should contain the following: features: list of features to use in the model (optional, defaults to all context + actions) @@ -48,7 +44,7 @@ def __init__(self, cao: CAOMapping, model_config: dict): train_pct: percentage of training data to use (defaults to 1) step_lr_params: dictionary of parameters to pass to the step learning rate scheduler (defaults to 1, 0.1) """ - super().__init__(cao) + super().__init__() self.features = model_config.get("features", None) self.label = model_config.get("label", None) diff --git a/src/prsdk/predictors/predictor.py b/src/prsdk/predictors/predictor.py index 486cb93..72e129d 100644 --- a/src/prsdk/predictors/predictor.py +++ b/src/prsdk/predictors/predictor.py @@ -1,26 +1,17 @@ """ -Abstract class for predictors to inherit from. +Interface for predictors to implement. """ from abc import ABC, abstractmethod import pandas as pd -from data.cao_mapping import CAOMapping - class Predictor(ABC): """ - Abstract class for predictors to inherit from. + Interface for predictors to implement. Predictors must be able to be fit and predict on a DataFrame. It is up to the Predictor to keep track of the proper label to label the output DataFrame. """ - def __init__(self, cao: CAOMapping): - """ - Initializes the Predictor with the context, actions, and outcomes. - :param cao: CAOMapping object with context, actions, and outcomes. - """ - self.cao = cao - @abstractmethod def fit(self, X_train: pd.DataFrame, y_train: pd.Series): """ diff --git a/src/prsdk/predictors/sklearn_predictors/linear_regression_predictor.py b/src/prsdk/predictors/sklearn_predictors/linear_regression_predictor.py index 490d878..7ae1e8b 100644 --- a/src/prsdk/predictors/sklearn_predictors/linear_regression_predictor.py +++ b/src/prsdk/predictors/sklearn_predictors/linear_regression_predictor.py @@ -3,7 +3,6 @@ """ from sklearn.linear_model import LinearRegression -from data.cao_mapping import CAOMapping from predictors.sklearn_predictors.sklearn_predictor import SKLearnPredictor @@ -12,9 +11,8 @@ class LinearRegressionPredictor(SKLearnPredictor): Simple linear regression predictor. See SKLearnPredictor for more details. """ - def __init__(self, cao: CAOMapping, model_config: dict): + def __init__(self, model_config: dict): """ - :param cao: CAOMapping object with context, actions, and outcomes for super constructor. :param model_config: Configuration to pass into the SKLearn constructor. Also contains the keys "features" and "label" to keep track of the features and label to predict. """ @@ -22,4 +20,4 @@ def __init__(self, cao: CAOMapping, model_config: dict): model_config = {} lr_config = {key: value for key, value in model_config.items() if key not in ["features", "label"]} model = LinearRegression(**lr_config) - super().__init__(cao, model, model_config) + super().__init__(model, model_config) diff --git a/src/prsdk/predictors/sklearn_predictors/random_forest_predictor.py b/src/prsdk/predictors/sklearn_predictors/random_forest_predictor.py index 37e46c0..6ffdb2a 100644 --- a/src/prsdk/predictors/sklearn_predictors/random_forest_predictor.py +++ b/src/prsdk/predictors/sklearn_predictors/random_forest_predictor.py @@ -3,7 +3,6 @@ """ from sklearn.ensemble import RandomForestRegressor -from data.cao_mapping import CAOMapping from predictors.sklearn_predictors.sklearn_predictor import SKLearnPredictor @@ -12,12 +11,11 @@ class RandomForestPredictor(SKLearnPredictor): Simple random forest predictor. See SKLearnPredictor for more details. """ - def __init__(self, cao: CAOMapping, model_config: dict): + def __init__(self, model_config: dict): """ - :param cao: CAOMapping object with context, actions, and outcomes for super constructor. :param model_config: Configuration to pass into the SKLearn constructor. Also contains the keys "features" and "label" to keep track of the features and label to predict. """ rf_config = {key: value for key, value in model_config.items() if key not in ["features", "label"]} model = RandomForestRegressor(**rf_config) - super().__init__(cao, model, model_config) + super().__init__(model, model_config) diff --git a/src/prsdk/predictors/sklearn_predictors/sklearn_predictor.py b/src/prsdk/predictors/sklearn_predictors/sklearn_predictor.py index 6e64d27..142252e 100644 --- a/src/prsdk/predictors/sklearn_predictors/sklearn_predictor.py +++ b/src/prsdk/predictors/sklearn_predictors/sklearn_predictor.py @@ -6,7 +6,6 @@ import pandas as pd -from data.cao_mapping import CAOMapping from predictors.predictor import Predictor @@ -15,14 +14,14 @@ class SKLearnPredictor(Predictor, ABC): Simple abstract class for sklearn predictors. Keeps track of features fit on and label to predict. """ - def __init__(self, cao: CAOMapping, model, model_config: dict): + def __init__(self, model, model_config: dict): """ Model config contains the following: features: list of features to use for prediction (optional, defaults to all features) label: name of the label to predict (optional, defaults to passed label during fit) Any other parameters are passed to the model. """ - super().__init__(cao) + super().__init__() self.config = model_config self.model = model diff --git a/src/prsdk/prescriptors/prescriptor.py b/src/prsdk/prescriptors/prescriptor.py index 6d06c6e..02dbfb3 100644 --- a/src/prsdk/prescriptors/prescriptor.py +++ b/src/prsdk/prescriptors/prescriptor.py @@ -1,26 +1,16 @@ """ -Abstract prescriptor class to be implemented. +Interface prescriptor to be implemented. """ from abc import ABC, abstractmethod import pandas as pd -from data.cao_mapping import CAOMapping - # pylint: disable=too-few-public-methods class Prescriptor(ABC): """ - Abstract class for prescriptors to allow us to experiment with different implementations. + Interface for prescriptors to implement. """ - def __init__(self, cao: CAOMapping): - """ - We keep track of the context, actions, and outcomes in the CAO mapping to ensure the prescriptor is compatible - with the project it's in. - :param cao: CAOMapping object with context, actions, and outcomes. - """ - self.cao = cao - @abstractmethod def prescribe(self, context_df: pd.DataFrame) -> pd.DataFrame: """ diff --git a/tests/persistence/test_hf_persistence.py b/tests/persistence/test_hf_persistence.py index 9ef829e..9d700b5 100644 --- a/tests/persistence/test_hf_persistence.py +++ b/tests/persistence/test_hf_persistence.py @@ -17,7 +17,6 @@ class TestHuggingFacePersistence(unittest.TestCase): """ Tests the HuggingFace Persistor. We can't test the actual upload but we can test the download with an arbitrary model from HuggingFace. - TODO: We have to update our models to match the new configs that save CAO """ def setUp(self): self.temp_dir = Path("tests/temp") diff --git a/tests/persistence/test_predictor_serialization.py b/tests/persistence/test_predictor_serialization.py index 01dcf43..0cd4513 100644 --- a/tests/persistence/test_predictor_serialization.py +++ b/tests/persistence/test_predictor_serialization.py @@ -7,7 +7,6 @@ import pandas as pd -from data.cao_mapping import CAOMapping from persistence.serializers.neural_network_serializer import NeuralNetSerializer from persistence.serializers.sklearn_serializer import SKLearnSerializer from predictors.neural_network.neural_net_predictor import NeuralNetPredictor @@ -25,7 +24,6 @@ def setUp(self): 2 models with the same parameters, load one from the other's save, and check if their predictions are the same. """ - self.cao = CAOMapping(["a", "b"], ["c"], ["label"]) self.models = [ NeuralNetPredictor, LinearRegressionPredictor, @@ -56,7 +54,7 @@ def test_save_file_names(self): ] for model, serializer, config, test_names in zip(self.models, self.serializers, self.configs, save_file_names): with self.subTest(model=model): - predictor = model(self.cao, config) + predictor = model(config) predictor.fit(self.dummy_data, self.dummy_target) serializer.save(predictor, self.temp_path) files = [f.name for f in self.temp_path.glob("**/*") if f.is_file()] @@ -71,7 +69,7 @@ def test_loaded_same(self): """ for model, serializer, config in zip(self.models, self.serializers, self.configs): with self.subTest(model=model): - predictor = model(self.cao, config) + predictor = model(config) predictor.fit(self.dummy_data.iloc[:2], self.dummy_target.iloc[:2]) output = predictor.predict(self.dummy_data.iloc[2:]) serializer.save(predictor, self.temp_path) diff --git a/tests/predictors/test_neural_net.py b/tests/predictors/test_neural_net.py index c1bf23e..ead24d3 100644 --- a/tests/predictors/test_neural_net.py +++ b/tests/predictors/test_neural_net.py @@ -5,7 +5,6 @@ import pandas as pd -from data.cao_mapping import CAOMapping from predictors.neural_network.neural_net_predictor import NeuralNetPredictor @@ -13,14 +12,11 @@ class TestNeuralNet(unittest.TestCase): """ Specifically tests the neural net predictor """ - def setUp(self): - self.cao = CAOMapping(["a", "b"], ["c"], ["label"]) - def test_single_input(self): """ Tests the neural net with a single input. """ - predictor = NeuralNetPredictor(self.cao, {"hidden_sizes": [4], "epochs": 1, "batch_size": 1, "device": "cpu"}) + predictor = NeuralNetPredictor({"hidden_sizes": [4], "epochs": 1, "batch_size": 1, "device": "cpu"}) train_data = pd.DataFrame({"a": [1], "b": [2], "c": [3], "label": [4]}) test_data = pd.DataFrame({"a": [4], "b": [5], "c": [6]}) @@ -33,7 +29,7 @@ def test_multi_input(self): """ Tests the neural net with multiple inputs. """ - predictor = NeuralNetPredictor(self.cao, {"hidden_sizes": [4], "epochs": 1, "batch_size": 1, "device": "cpu"}) + predictor = NeuralNetPredictor({"hidden_sizes": [4], "epochs": 1, "batch_size": 1, "device": "cpu"}) train_data = pd.DataFrame({"a": [1, 2], "b": [2, 3], "c": [3, 4], "label": [4, 5]}) test_data = pd.DataFrame({"a": [4, 5], "b": [5, 6], "c": [6, 7]}) @@ -46,7 +42,7 @@ def test_batched_input(self): """ Tests the neural network with batched inputs. """ - predictor = NeuralNetPredictor(self.cao, {"hidden_sizes": [4], "epochs": 1, "batch_size": 2, "device": "cpu"}) + predictor = NeuralNetPredictor({"hidden_sizes": [4], "epochs": 1, "batch_size": 2, "device": "cpu"}) train_data = pd.DataFrame({"a": [1, 2, 3], "b": [2, 3, 4], "c": [3, 4, 5], "label": [4, 5, 6]}) test_data = pd.DataFrame({"a": [4, 5], "b": [5, 6], "c": [6, 7]})