e-mission · aGuttman · Nov 17, 2022 · Nov 23, 2022 · Nov 23, 2022 · Nov 23, 2022
diff --git a/emission/analysis/classification/inference/labels/inferrers.py b/emission/analysis/classification/inference/labels/inferrers.py
@@ -156,3 +156,19 @@ def predict_cluster_confidence_discounting(trip, max_confidence=None, first_conf
     labels = copy.deepcopy(labels)
     for l in labels: l["p"] *= confidence_coeff
     return labels
+
+def predict_gradient_boosted_decision_tree(trip, max_confidence=None, first_confidence=None, confidence_multiplier=None):
+    """Predict labels for `trip` with the stored gradient boosted decision tree model.
+
+    The model type and storage come from the application config (eamtc). The confidence
+    parameters mirror predict_cluster_confidence_discounting's signature but are unused:
+    the discounting step is disabled here, so probabilities are returned undiscounted.
+    """
+    model_type = eamtc.get_model_type()
+    model_storage = eamtc.get_model_storage()
+    labels, n = eamur.predict_labels_with_gbdt(trip, model_type, model_storage)
+    if n <= 0:  # No model data (n == -1) or the model reported nothing usable for this trip
+        logging.debug(f"In predict_gradient_boosted_decision_tree: n={n}; returning as-is")
+        return labels
+    # This used to scale by `confidence_coeff`, which was never assigned here (NameError).
+    return copy.deepcopy(labels)
diff --git a/emission/analysis/classification/inference/labels/pipeline_replace_mode.py b/emission/analysis/classification/inference/labels/pipeline_replace_mode.py
@@ -0,0 +1,102 @@
+# Standard imports
+import logging
+import random
+import copy
+
+# Our imports
+import emission.storage.pipeline_queries as epq
+import emission.storage.timeseries.abstract_timeseries as esta
+import emission.storage.decorations.analysis_timeseries_queries as esda
+import emission.core.wrapper.labelprediction as ecwl
+import emission.core.wrapper.entry as ecwe
+import emission.analysis.classification.inference.labels.inferrers as eacili
+import emission.analysis.classification.inference.labels.ensembles as eacile
+
+
+# For each algorithm in ecwl.AlgorithmTypes that runs on a trip (e.g., not the ensemble, which
+# runs on the results of other algorithms), primary_algorithms specifies a corresponding
+# function in eacili to run. This makes it easy to plug in additional algorithms later.
+primary_algorithms = {
+    ecwl.AlgorithmTypes.GRADIENT_BOOSTED_DECISION_TREE: eacili.predict_gradient_boosted_decision_tree
+}
+
+# ensemble specifies which algorithm in eacile to run; it is called as ensemble(trip, predictions)
+# and returns (algorithm_id, prediction). This makes it easy to test ways of combining algorithms.
+ensemble = eacile.ensemble_first_prediction
+
+
+def infer_labels(user_id):
+    """Run label inference for one user and record the stage outcome through epq."""
+    pending_range = epq.get_time_range_for_label_inference(user_id)
+    try:
+        pipeline = LabelInferencePipeline()
+        pipeline.user_id = user_id
+        pipeline.run_prediction_pipeline(user_id, pending_range)
+        if pipeline.last_trip_done is None:
+            logging.debug("After run, last_trip_done == None, must be early return")
+        epq.mark_label_inference_done(user_id, pipeline.last_trip_done)
+    except:
+        logging.exception("Error while inferring labels, timestamp is unchanged")
+        epq.mark_label_inference_failed(user_id)
+
+class LabelInferencePipeline:
+    """Runs the primary label algorithms and the ensemble over a user's cleaned trips.
+
+    Code structure based on emission.analysis.classification.inference.mode.pipeline
+    and emission.analysis.classification.inference.mode.rule_engine
+    """
+    def __init__(self):
+        self._last_trip_done = None
+
+    @property
+    def last_trip_done(self):
+        return self._last_trip_done
+
+    # For a given user and time range, runs all the primary algorithms and ensemble, saves results
+    # to the database, and records progress (the processed cleaned trip with the greatest end_ts)
+    def run_prediction_pipeline(self, user_id, time_range):
+        # compute_and_save_* read self.user_id; set it here instead of relying on the caller.
+        self.user_id = user_id
+        self.ts = esta.TimeSeries.get_time_series(user_id)
+        self.toPredictTrips = esda.get_entries(
+            esda.CLEANED_TRIP_KEY, user_id, time_query=time_range)
+        for cleaned_trip in self.toPredictTrips:
+            # Create an inferred trip
+            cleaned_trip_dict = copy.copy(cleaned_trip)["data"]
+            inferred_trip = ecwe.Entry.create_entry(user_id, "analysis/inferred_trip", cleaned_trip_dict)
+            # Run the algorithms and the ensemble, store results
+            results = self.compute_and_save_algorithms(inferred_trip)
+            ensemble_result = self.compute_and_save_ensemble(inferred_trip, results)
+            # Put final results into the inferred trip and store it
+            inferred_trip["data"]["cleaned_trip"] = cleaned_trip.get_id()
+            inferred_trip["data"]["inferred_labels"] = ensemble_result["prediction"]
+            self.ts.insert(inferred_trip)
+            if self._last_trip_done is None or self._last_trip_done["data"]["end_ts"] < cleaned_trip["data"]["end_ts"]:
+                self._last_trip_done = cleaned_trip
+
+    # This is where the labels for a given trip are actually predicted. Only the trip object is
+    # passed in; it can provide the user_id and other potentially useful information.
+    def compute_and_save_algorithms(self, trip):
+        predictions = []
+        for algorithm_id, algorithm_fn in primary_algorithms.items():
+            prediction = algorithm_fn(trip)
+            lp = ecwl.Labelprediction()
+            lp.trip_id = trip.get_id()
+            lp.algorithm_id = algorithm_id
+            lp.prediction = prediction
+            lp.start_ts = trip["data"]["start_ts"]
+            lp.end_ts = trip["data"]["end_ts"]
+            self.ts.insert_data(self.user_id, "inference/labels", lp)
+            predictions.append(lp)
+        return predictions
+
+    # Combine all our predictions into a single ensemble prediction via the module-level `ensemble`.
+    # As a placeholder, we just take the first prediction. TODO: implement a real combination algorithm.
+    def compute_and_save_ensemble(self, trip, predictions):
+        il = ecwl.Labelprediction()
+        il.trip_id = trip.get_id()
+        il.start_ts = trip["data"]["start_ts"]
+        il.end_ts = trip["data"]["end_ts"]
+        (il.algorithm_id, il.prediction) = ensemble(trip, predictions)
+        self.ts.insert_data(self.user_id, "analysis/inferred_labels", il)
+        return il
diff --git a/emission/analysis/modelling/trip_model/model_storage.py b/emission/analysis/modelling/trip_model/model_storage.py
@@ -35,6 +35,57 @@ def from_str(cls, str):
             msg = f"{str} is not a known ModelStorage, must be one of {names}"
             raise KeyError(msg)
 
+def load_model_all_users(model_type: eamum.ModelType, model_storage: ModelStorage) -> Optional[Dict]:
+    """load the label model shared across all users from a model storage location
+
+    Unlike load_model, this takes no user_id: the model is read from the
+    esda.REPLACE_MODEL_STORE_KEY entry of the storage obtained with id 0.
+
+    :param model_type: expected type of model stored
+    :param model_storage: storage format
+    :return: the model representation as a Python Dict, or None if no model is stored
+    :raises: TypeError if the stored model has no model type, if its type differs
+             from the expected type, or if model_storage is not a supported storage.
+             NOTE(review): ModelType.from_str presumably raises KeyError for an
+             unknown type name, as ModelStorage.from_str does - confirm
+    """
+    # reject unsupported storage types before touching the database
+    if model_storage != ModelStorage.DOCUMENT_DATABASE:
+        storage_types_str = ",".join(ModelStorage.names())
+        msg = (
+            f"unknown model storage type {model_storage}, must be one of "
+            f"{{{storage_types_str}}}"
+        )
+        raise TypeError(msg)
+
+    # NOTE(review): 0 appears to be a placeholder id for the shared model - confirm
+    ms = esma.ModelStorage.get_model_storage(0)
+    latest_model_entry = ms.get_current_model(key=esda.REPLACE_MODEL_STORE_KEY)
+
+    if latest_model_entry is None:
+        logging.debug(f'no {model_type.name} model found')
+        return None
+
+    write_ts = latest_model_entry['metadata']['write_ts']
+    logging.debug(f'retrieved latest trip model recorded at timestamp {write_ts}')
+    logging.debug(latest_model_entry)
+
+    # parse str to enum for ModelType
+    latest_model_type_str = latest_model_entry.get('data', {}).get('model_type')
+    if latest_model_type_str is None:
+        raise TypeError('stored model does not have a model type')
+    latest_model_type = eamum.ModelType.from_str(latest_model_type_str)
+
+    # validate and return; the entry is already known to be non-None here
+    if latest_model_type != model_type:
+        msg = (
+            f"loading model has model type '{latest_model_type.name}' "
+            f"but was expected to have model type {model_type.name}"
+        )
+        raise TypeError(msg)
+
+    return latest_model_entry['data']['model']
+
 def load_model(user_id, model_type: eamum.ModelType, model_storage: ModelStorage) -> Optional[Dict]:
     """load a user label model from a model storage location
 

diff --git a/emission/analysis/modelling/trip_model/run_model.py b/emission/analysis/modelling/trip_model/run_model.py
@@ -118,6 +118,27 @@ def predict_labels_with_n(
         predictions, n = model.predict(trip)
         return predictions, n
 
+def predict_labels_with_gbdt(
+    trip: ecwc.Confirmedtrip,
+    model_type = eamumt.ModelType.GRADIENT_BOOSTED_DECISION_TREE,
+    model_storage = eamums.ModelStorage.DOCUMENT_DATABASE,
+    model_config = None):
+    """
+    invoke the label prediction model shared across all users to predict labels for a trip.
+
+    :param trip: the trip to predict labels for
+    :param model_type: type of prediction model to run
+    :param model_storage: location to read/write models
+    :param model_config: optional configuration for model, for debugging purposes
+    :return: a (predictions, n) tuple as produced by the model's predict method,
+             or ([], -1) when no stored model is available
+    """
+    # the stored model is not per-user, so the trip's user_id is not needed to load it
+    model = _load_stored_trip_model_all_users(model_type, model_storage, model_config)
+    if model is None:
+        return [], -1
+    predictions, n = model.predict(trip)
+    return predictions, n
 
 def _get_training_data(user_id: UUID, time_query: Optional[estt.TimeQuery]):
     """
@@ -159,6 +180,25 @@ def _load_stored_trip_model(
         model.from_dict(model_dict)
         return model
 
+def _load_stored_trip_model_all_users(
+    model_type: eamumt.ModelType,
+    model_storage: eamums.ModelStorage,
+    model_config = None) -> Optional[eamuu.TripModel]:
+    """build a label prediction model instance from the stored model that
+    is shared across all users.
+
+    :param model_type: TripModel type to build; also passed to the loader as the expected type
+    :param model_storage: storage type to load the model from
+    :param model_config: optional configuration passed to model_type.build
+    :return: the populated model, or None if no shared model is stored
+    """
+    stored = eamums.load_model_all_users(model_type, model_storage)
+    if stored is None:
+        return None
+    # build an empty model of the requested type, then load the stored contents into it
+    trip_model = model_type.build(model_config)
+    trip_model.from_dict(stored)
+    return trip_model
 
 def _latest_timestamp(trips: List[ecwc.Confirmedtrip]) -> float:
     """extract the latest timestamp observed from a list of trips

diff --git a/emission/core/wrapper/labelprediction.py b/emission/core/wrapper/labelprediction.py
@@ -19,6 +19,7 @@ class AlgorithmTypes(enum.Enum):
     TWO_STAGE_BIN_CLUSTER = 5
     PLACEHOLDER_PREDICTOR_DEMO = 6
     CONFIDENCE_DISCOUNTED_CLUSTER = 7
+    GRADIENT_BOOSTED_DECISION_TREE = 8
 
 
 class Labelprediction(ecwb.WrapperBase):

diff --git a/emission/storage/decorations/analysis_timeseries_queries.py b/emission/storage/decorations/analysis_timeseries_queries.py
@@ -38,6 +38,7 @@
 METRICS_DAILY_MEAN_MEDIAN_SPEED = "metrics/daily_mean_median_speed"
 INFERRED_LABELS_KEY = "inference/labels"
 TRIP_MODEL_STORE_KEY = "inference/trip_model"
+REPLACE_MODEL_STORE_KEY = "inference/replace_model"
 
 # General methods