Update Distributed components and integrate them with FT pipelines and Model Eval fixes #3202

Merged 13 commits on Aug 2, 2024
@@ -488,9 +488,9 @@ outputs:
description: output folder containing _best_ finetuned model in mlflow format.
mode: rw_mount

# evaluation_result:
# type: uri_folder
# description: Test Data Evaluation Results
evaluation_result:
type: uri_folder
description: Test Data Evaluation Results

jobs:
ft_nlp_common_validation:
@@ -627,34 +627,30 @@ jobs:
# converted_model: '${{parent.jobs.chat_completion_finetune.outputs.mlflow_model_folder}}'
outputs:
mlflow_model_folder: '${{parent.outputs.mlflow_model_folder}}'
# model_prediction:
# type: command
# component: azureml:model_prediction:0.0.21
# compute: '${{parent.inputs.compute_model_evaluation}}'
# resources:
# instance_type: '${{parent.inputs.instance_type_model_evaluation}}'
# inputs:
# task: chat-completion
# test_data: '${{parent.jobs.chat_completion_datapreprocess.outputs.output_dir}}'
# label_column_name: ''
# input_column_names: "''"
# batch_size: '${{parent.inputs.per_device_train_batch_size}}'
# device: auto
# mlflow_model: '${{parent.jobs.chat_completion_model_converter.outputs.mlflow_model_folder}}'
# compute_metrics:
# type: command
# component: azureml:compute_metrics:0.0.21
# compute: '${{parent.inputs.compute_model_evaluation}}'
# resources:
# instance_type: '${{parent.inputs.instance_type_model_evaluation}}'
# inputs:
# task: chat-completion
# ground_truth: '${{parent.jobs.model_prediction.outputs.ground_truth}}'
# ground_truth_column_name: '${{parent.inputs.answers_key}}'
# prediction: '${{parent.jobs.model_prediction.outputs.predictions}}'
# prediction_column_name: predictions
# prediction_probabilities: '${{parent.jobs.model_prediction.outputs.prediction_probabilities}}'
# evaluation_config: '${{parent.inputs.evaluation_config}}'
# evaluation_config_params: '${{parent.inputs.evaluation_config_params}}'
# outputs:
# evaluation_result: '${{parent.outputs.evaluation_result}}'
model_prediction:
type: command
component: azureml:model_prediction_with_container:0.0.2
compute: '${{parent.inputs.compute_model_evaluation}}'
resources:
instance_type: '${{parent.inputs.instance_type_model_evaluation}}'
inputs:
task: chat-completion
test_data: '${{parent.jobs.chat_completion_datapreprocess.outputs.output_dir}}'
label_column_name: messages
mlflow_model: '${{parent.jobs.chat_completion_model_converter.outputs.mlflow_model_folder}}'
evaluation_config_params: '${{parent.inputs.evaluation_config_params}}'
compute_metrics:
type: command
component: azureml:compute_metrics:0.0.31
compute: '${{parent.inputs.compute_model_evaluation}}'
resources:
instance_type: '${{parent.inputs.instance_type_model_evaluation}}'
inputs:
task: chat-completion
ground_truth: '${{parent.jobs.model_prediction.outputs.ground_truth}}'
prediction: '${{parent.jobs.model_prediction.outputs.predictions}}'
prediction_probabilities: '${{parent.jobs.model_prediction.outputs.prediction_probabilities}}'
evaluation_config: '${{parent.inputs.evaluation_config}}'
evaluation_config_params: '${{parent.inputs.evaluation_config_params}}'
outputs:
evaluation_result: '${{parent.outputs.evaluation_result}}'
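For context, a minimal sketch (not part of this diff) of how a consumer could resolve the bumped components from the azureml registry with the azure-ai-ml SDK. The credential choice and variable names are assumptions; the version pins mirror the specs in this PR.

from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Client scoped to the shared "azureml" registry rather than a single workspace.
registry_client = MLClient(credential=DefaultAzureCredential(), registry_name="azureml")

# Version pins match the pipeline specs above.
prediction_component = registry_client.components.get(
    name="model_prediction_with_container", version="0.0.2")
metrics_component = registry_client.components.get(
    name="compute_metrics", version="0.0.31")
print(prediction_component.display_name, metrics_component.display_name)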
@@ -667,7 +667,7 @@ jobs:
mlflow_model_folder: '${{parent.outputs.mlflow_model_folder}}'
model_prediction:
type: command
component: azureml:model_prediction:0.0.30
component: azureml:model_prediction_with_container:0.0.2
compute: '${{parent.inputs.compute_model_evaluation}}'
resources:
instance_type: '${{parent.inputs.instance_type_model_evaluation}}'
@@ -676,10 +676,7 @@
test_data: '${{parent.jobs.text_generation_datapreprocess.outputs.output_dir}}'
label_column_name: '${{parent.inputs.ground_truth_key}}'
input_column_names: '${{parent.inputs.text_key}}'
batch_size: '${{parent.inputs.per_device_train_batch_size}}'
device: auto
mlflow_model: '${{parent.jobs.text_generation_model_converter.outputs.mlflow_model_folder}}'
evaluation_config: '${{parent.inputs.evaluation_config}}'
mlflow_model: '${{parent.jobs.ft_nlp_model_converter.outputs.mlflow_model_folder}}'
evaluation_config_params: '${{parent.inputs.evaluation_config_params}}'
compute_metrics:
type: command
@@ -3,7 +3,7 @@ name: compute_metrics
display_name: Compute Metrics
description: Calculate model performance metrics, given ground truth and prediction data.

version: 0.0.30
version: 0.0.31
type: command
tags:
type: evaluation
@@ -1,6 +1,6 @@
$schema: https://azuremlschemas.azureedge.net/latest/pipelineComponent.schema.json
name: model_prediction_with_container
version: 0.0.1
version: 0.0.2
type: command
display_name: Distributed Model Prediction
description: "Optimized Distributed inference component for LLMs."
@@ -69,7 +69,7 @@ outputs:


code: ../../src_distributed
environment: azureml://registries/azureml/environments/foundation-model-inference/versions/42
environment: azureml://registries/azureml/environments/foundation-model-inference/versions/46
command: >-
python download_extra_dependency.py
--mlflow-model '${{inputs.mlflow_model}}' ;
@@ -3,7 +3,7 @@ name: model_prediction
display_name: Model Prediction
description: Generate predictions on a given mlflow model for supported tasks.

version: 0.0.30
version: 0.0.31
type: command
tags:
type: evaluation
@@ -1,6 +1,6 @@
$schema: https://azuremlschemas.azureedge.net/latest/pipelineComponent.schema.json
name: model_evaluation_pipeline
version: 0.0.30
version: 0.0.31
type: pipeline
display_name: Model Evaluation Pipeline
description: Pipeline component for model evaluation for supported tasks. \
@@ -87,7 +87,7 @@ outputs:
jobs:
validation_trigger_model_evaluation:
type: command
component: azureml:validation_trigger_model_evaluation:0.0.30
component: azureml:validation_trigger_model_evaluation:0.0.31
compute: '${{parent.inputs.compute_name}}'
resources:
instance_type: '${{parent.inputs.instance_type}}'
@@ -111,7 +111,7 @@ jobs:

model_prediction:
type: command
component: azureml:model_prediction:0.0.30
component: azureml:model_prediction:0.0.31
compute: '${{parent.inputs.compute_name}}'
resources:
instance_type: '${{parent.inputs.instance_type}}'
@@ -128,7 +128,7 @@

compute_metrics:
type: command
component: azureml:compute_metrics:0.0.30
component: azureml:compute_metrics:0.0.31
compute: '${{parent.inputs.compute_name}}'
resources:
instance_type: '${{parent.inputs.instance_type}}'
@@ -3,7 +3,7 @@ name: validation_trigger_model_evaluation
display_name: Validation Trigger Model Evaluation
description: Component for enabling validation of model evaluation pipeline.

version: 0.0.30
version: 0.0.31
type: command
tags:
type: evaluation
43 changes: 36 additions & 7 deletions assets/training/model_evaluation/src/evaluators/evaluators.py
@@ -617,16 +617,45 @@ def evaluate(self, y_test, y_pred, **kwargs):
"""
# dataframe with 2 columns predictions and predictions appended to the conversation
if len(y_pred.columns) > 1:
y_pred_formatted = [
list(item[ChatCompletionConstants.OUTPUT_FULL_CONVERSATION][0].values())[0]
for idx, item in y_pred.iterrows()
]
logger.info("Found more than 1 col. Trying to fetch conversation.")

def check_item(row_item: pd.Series):
"""Convert input data to correct format for metrics package.

Args:
row_item (pd.Series): Single row input from Dataframe
"""
item = row_item.get(ChatCompletionConstants.OUTPUT_FULL_CONVERSATION, None)
if item is None:
return row_item
if isinstance(item, list) and isinstance(item[0], dict):
if item[0].get("role", False) and item[0].get("content", False):
return item
else:
if item[0].get("0", False):
return item["0"]
return item

y_pred_formatted = y_pred.apply(check_item, axis=1).tolist()
# dataframe with just predictions appended to conversations
else:
y_pred_formatted = y_pred.values.tolist()[0]
# if ground truth is passed
y_pred_formatted = y_pred.values.tolist()
# if ground truth is passed
if y_test is not None and len(y_test) > 0:
y_test = y_test.iloc[:, 0].apply(lambda x: [x]).tolist()

def check_y_test(row_item: pd.Series):
"""Convert ground truth into correct format for metrics package.

Args:
row_item (pd.Series): Single row input from Dataframe
"""
item = row_item.get(y_test.columns[0])
if isinstance(item, (str, dict)):
return [item]
if isinstance(item, list):
return item

y_test = y_test.apply(check_y_test, axis=1).tolist()
metrics = compute_metrics(task_type=constants.Tasks.CHAT_COMPLETION, y_pred=y_pred_formatted,
y_test=y_test, **self.metrics_config)
else:
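To make the accepted prediction shapes concrete, a small illustration with made-up rows; the column names come from ChatCompletionConstants, everything else is hypothetical.

import pandas as pd

# Two hypothetical rows in the shapes check_item handles: a plain chat
# transcript, and a {"0": conversation} wrapper around one.
chat_row = [{"role": "user", "content": "hi"},
            {"role": "assistant", "content": "hello"}]
wrapped_row = [{"0": [{"role": "assistant", "content": "hello"}]}]

y_pred = pd.DataFrame({
    "prediction_appended": [chat_row, wrapped_row],
    "predictions": ["hello", "hello"],
})
# y_pred.apply(check_item, axis=1).tolist() then yields plain conversations
# in the format the azureml-metrics chat-completion evaluator expects.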
@@ -24,7 +24,12 @@ def ner_predictor_for_transformers(X_test, params=None):
Returns:
_type_: _description_
"""
transformers_class._override_model_config(params)
try:
    transformers_class._override_model_config(params)
except AttributeError:
    # Newer mlflow releases replace _override_model_config with
    # _merge_model_config_with_params on _TransformersWrapper.
    logger.info("Using newer version of mlflow.transformers._TransformersWrapper "
                "model config override API")
    transformers_class._merge_model_config_with_params(transformers_class.model_config, params)
from azureml.evaluate.mlflow.hftransformers._task_based_predictors import NERPredictor
predictor = NERPredictor(task_type="token-classification", model=transformers_class.pipeline.model,
tokenizer=transformers_class.pipeline.tokenizer,
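The same version guard could be written as feature detection instead of exception handling; a sketch under the assumption that only these two private mlflow APIs are in play.

# Hypothetical alternative: probe for the older private API before calling it.
if hasattr(transformers_class, "_override_model_config"):
    transformers_class._override_model_config(params)
else:
    transformers_class._merge_model_config_with_params(
        transformers_class.model_config, params)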
28 changes: 25 additions & 3 deletions assets/training/model_evaluation/src_distributed/data_utils.py
@@ -17,9 +17,9 @@
import glob

from mltable import load


from logging_utilities import get_logger
from exceptions import DataLoaderException
from error_definitions import BadLabelColumnData
from logging_utilities import get_logger, get_azureml_exception, log_traceback

logger = get_logger(name=__name__)

@@ -180,6 +180,28 @@ def read_multiple_files(path):
return iter([data])


def prepare_chat_data_from_ft_pipeline(data: pd.DataFrame):
"""Prepare Chat completion data from FT pipeline.

Args:
data: pd.DataFrame
"""
try:
messages_col = data[local_constants.LLM_FT_CHAT_COMPLETION_KEY]
except Exception as e:
logger.error(f"'{local_constants.LLM_FT_CHAT_COMPLETION_KEY}' not found in FT test dataset.")
exception = get_azureml_exception(DataLoaderException, BadLabelColumnData, e, error=repr(e))
log_traceback(exception, logger)
raise exception
X_test, y_test = {local_constants.LLM_FT_CHAT_COMPLETION_KEY: []}, []
for message in messages_col.to_list():
X_test[local_constants.LLM_FT_CHAT_COMPLETION_KEY].append(message[:-1])
y_test.append(message[-1]["content"])
X_test = pd.DataFrame(X_test)
y_test = pd.Series(y_test)
return X_test, y_test.values
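As a worked example (data values made up), a two-turn conversation splits so that X_test keeps every turn but the last, and y_test keeps only the final message's content.

import pandas as pd

data = pd.DataFrame({"messages": [[
    {"role": "user", "content": "What is 2+2?"},
    {"role": "assistant", "content": "4"},
]]})
X_test, y_test = prepare_chat_data_from_ft_pipeline(data)
# X_test["messages"][0] -> [{"role": "user", "content": "What is 2+2?"}]
# y_test[0] -> "4"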


def prepare_data(data, task, label_column_name=None, _has_multiple_output=False, extra_y_test_cols=None):
"""Prepare data.

@@ -4,7 +4,7 @@
"""File to create AzureML Based Exceptions for Model Evaluation."""

from azureml.exceptions import AzureMLException
from constants import ExceptionLiterals
from local_constants import ExceptionLiterals


class ModelEvaluationException(AzureMLException):
@@ -12,6 +12,7 @@
MLTABLE_FILE_NAME = "MLTable"
LLM_FT_PREPROCESS_FILENAME = "preprocess_args.json"
LLM_FT_TEST_DATA_KEY = "raw_test_data_fname"
LLM_FT_CHAT_COMPLETION_KEY = "messages"

# default values
class ModelPath:
@@ -194,4 +195,10 @@ class TASK:
FILTER_MODEL_PREDICTION_PARAMS = [
"tokenizer_config",
"generator_config"
]
]

class ChatCompletionConstants:
"""Chat completion constants."""

OUTPUT = "predictions"
OUTPUT_FULL_CONVERSATION = "prediction_appended"