Enable input column, label column for vision tasks #3016

Merged · 14 commits · Jun 24, 2024
@@ -3,7 +3,7 @@ name: model_prediction
display_name: Model Prediction
description: Generate predictions with a given MLflow model for supported tasks.

version: 0.0.28
version: 0.0.29
type: command
tags:
type: evaluation
@@ -1,6 +1,6 @@
$schema: https://azuremlschemas.azureedge.net/latest/pipelineComponent.schema.json
name: model_evaluation_pipeline
version: 0.0.28
version: 0.0.29
type: pipeline
display_name: Model Evaluation Pipeline
description: Pipeline component for model evaluation for supported tasks. \
4 changes: 3 additions & 1 deletion assets/training/model_evaluation/src/evaluate_model.py
@@ -124,7 +124,9 @@ def load_data(self, test_data, label_column_name, input_column_names=None):
all_cols = list(input_column_names)
if label_column_name is not None:
all_cols += [label_column_name]
data, _ = read_model_prediction_data(test_data, self.task, self.batch_size)
data, _ = read_model_prediction_data(
test_data, input_column_names, label_column_name, self.task, self.batch_size
)
data = map(prepare_data, data, repeat(self.task), repeat(all_cols), repeat(label_column_name),
repeat(False), repeat(list()), repeat(self.batch_size))
return data # X_test, y_test
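
The map(...)/repeat(...) pattern above fans the same constant arguments across every data chunk; a minimal self-contained sketch of the idiom with toy names (not from this PR):

from itertools import repeat

def prepare(chunk, task, cols):
    return (chunk, task, cols)

chunks = [1, 2, 3]
out = list(map(prepare, chunks, repeat("my-task"), repeat(["a", "b"])))
# map() stops at the shortest iterable, so each infinite repeat() stream is
# consumed exactly once per chunk:
# [(1, 'my-task', ['a', 'b']), (2, 'my-task', ['a', 'b']), (3, 'my-task', ['a', 'b'])]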
3 changes: 2 additions & 1 deletion assets/training/model_evaluation/src/image_constants.py
@@ -6,7 +6,8 @@
class SettingLiterals:
"""Setting literals for classification dataset."""

LABEL_COLUMN_NAME = "label_column_name"
IMAGE_URL = "image_url"
LABEL = "label"
MASKS_REQUIRED = "masks_required"
USE_BG_LABEL = "use_bg_label"
IGNORE_DATA_ERRORS = "ignore_data_errors"
69 changes: 37 additions & 32 deletions assets/training/model_evaluation/src/image_dataset.py
@@ -13,11 +13,11 @@

from PIL import Image
from torch import Tensor
from typing import cast, Dict, Tuple
from typing import cast, Dict, List, Tuple

import constants

from image_constants import SettingLiterals, ImageDataFrameParams, ODISLiterals
from image_constants import SettingLiterals, ODISLiterals
from logging_utilities import get_logger

from azureml.automl.core.shared.constants import MLTableLiterals, MLTableDataLabel
@@ -150,13 +150,17 @@ def read_image(image_path):

def get_classification_dataset(
testing_mltable: str,
input_column_names: List[str],
label_column_name: str,
settings: Dict = {},
multi_label: bool = False,
) -> AmlDatasetWrapper:
) -> pd.DataFrame:
"""
Return test dataset for classification task from mltable.

:param testing_mltable: The training mltable path
:param test_mltable: The path to the prediction input mltable
:param input_column_names: The column names of the model inputs
:param label_column_name: The column name of the label
:param settings: Settings dictionary
:param multi_label: True if multi label classification, False otherwise
:return: Data Frame with test image paths and labels
@@ -177,7 +181,6 @@ def get_classification_dataset(
workspace=ws,
)

label_column_name = settings.get(SettingLiterals.LABEL_COLUMN_NAME, None)
test_dataset_wrapper = AmlDatasetWrapper(
test_tabular_ds,
multilabel=multi_label,
@@ -189,38 +192,34 @@
# labels: {test_dataset_wrapper.num_classes}"
)

df = pd.DataFrame(columns=[ImageDataFrameParams.IMAGE_COLUMN_NAME, ImageDataFrameParams.LABEL_COLUMN_NAME])
df = pd.DataFrame(columns=input_column_names + [label_column_name])
for index in range(len(test_dataset_wrapper)):
image_path = test_dataset_wrapper.get_image_full_path(index)
if is_valid_image(image_path):
# sending image paths instead of base64-encoded strings, as the OSS flavor doesn't take bytes as input.
df = df.append({
ImageDataFrameParams.IMAGE_COLUMN_NAME: image_path,
ImageDataFrameParams.LABEL_COLUMN_NAME: test_dataset_wrapper.label_at_index(index)
input_column_names[0]: image_path,
label_column_name: test_dataset_wrapper.label_at_index(index)
}, ignore_index=True)

return df
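
With the default vision columns, the frame built above reduces to a two-column layout; a minimal sketch, assuming the pandas<2.0 DataFrame.append API this file already uses and a placeholder image path:

import pandas as pd

df = pd.DataFrame(columns=["image_url", "label"])
# Placeholder row; real rows come from AmlDatasetWrapper as in the loop above.
df = df.append({"image_url": "/data/images/0001.jpg", "label": "cat"}, ignore_index=True)
print(list(df.columns))  # ['image_url', 'label']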


def get_object_detection_dataset(
test_mltable: str,
input_column_names: List[str],
label_column_name: str,
settings: Dict = {},
masks_required: bool = False,
) -> Tuple[RuntimeDetectionDatasetAdapter, RuntimeDetectionDatasetAdapter]:
) -> pd.DataFrame:
"""Return training and validation dataset for object detection and instance segmentation task from mltable.

:param training_mltable: The training mltable path
:type training_mltable: str
:param object_detection_dataset: The dataset adapter class name to be used for creating dataset objects.
:type object_detection_dataset: RuntimeDetectionDatasetAdapter
:param test_mltable: The path to the prediction input mltable
:param input_column_names: The column names of the model inputs
:param label_column_name: The column name of the label
:param settings: Settings dictionary
:type settings: Dict
:param validation_mltable: The validation mltable path
:type validation_mltable: str
:param masks_required: mask required or not for segmentation. Optional, default False
:type masks_required: bool
:return: Training dataset, validation dataset
:rtype: Tuple[RuntimeDetectionDatasetAdapter, RuntimeDetectionDatasetAdapter]
:return: Data Frame with test image paths and labels
"""
mltable = _combine_mltables(test_mltable, test_mltable)

@@ -253,9 +252,7 @@ def get_object_detection_dataset(
f"# test images: {len(test_dataset)}, # labels: {test_dataset.num_classes}"
)
test_dataset_wrapper = RuntimeDetectionDatasetAdapter(test_dataset)
df = pd.DataFrame(columns=[ImageDataFrameParams.IMAGE_COLUMN_NAME,
ImageDataFrameParams.LABEL_COLUMN_NAME,
ImageDataFrameParams.IMAGE_META_INFO])
df = pd.DataFrame(columns=input_column_names + [label_column_name])

counter = 0
for index in range(len(test_dataset_wrapper)):
@@ -265,36 +262,44 @@ def get_object_detection_dataset(
if is_valid_image(image_path):
counter += 1
df = df.append({
ImageDataFrameParams.IMAGE_COLUMN_NAME: base64.encodebytes(read_image(image_path)).decode("utf-8"),
ImageDataFrameParams.LABEL_COLUMN_NAME: label,
ImageDataFrameParams.IMAGE_META_INFO: image_meta_info,
ImageDataFrameParams.TEXT_PROMPT: ". ".join(test_dataset.classes)
input_column_names[0]: base64.encodebytes(read_image(image_path)).decode("utf-8"),
input_column_names[1]: image_meta_info,
input_column_names[2]: ". ".join(test_dataset.classes),
label_column_name: label,
}, ignore_index=True)

logger.info(f"Total number of valid images: {counter}")
return df


def get_image_dataset(task_type, test_mltable, settings={}):
"""
Return test dataset for image tasks from mltable.
def get_image_dataset(task_type, test_mltable, input_column_names, label_column_name, settings={}):
"""Return test dataset for image tasks from mltable.

:param testing_mltable: The training mltable path
Important details: for vision datasets, the MLTable must have columns "image_url" and "label". For some tasks, the
output Pandas dataframe may have other column names to respect the model input expectations.

:param task_type: The type of the prediction task
:param test_mltable: The path to the prediction input mltable
:param input_column_names: The column names of the model inputs
:param label_column_name: The column name of the label
:param settings: Settings dictionary
:param multi_label: True if multi label classification, False otherwise
:return: Data Frame with test image paths and labels
:return: Data Frame with image paths and labels
"""
if task_type in [constants.TASK.IMAGE_CLASSIFICATION, constants.TASK.IMAGE_CLASSIFICATION_MULTILABEL]:
multi_label = True if task_type == constants.TASK.IMAGE_CLASSIFICATION_MULTILABEL else False
return get_classification_dataset(
testing_mltable=test_mltable,
input_column_names=input_column_names,
label_column_name=label_column_name,
settings=settings,
multi_label=multi_label,
)
elif task_type in [constants.TASK.IMAGE_OBJECT_DETECTION, constants.TASK.IMAGE_INSTANCE_SEGMENTATION]:
masks_required = True if task_type == constants.TASK.IMAGE_INSTANCE_SEGMENTATION else False
return get_object_detection_dataset(
test_mltable=test_mltable,
input_column_names=input_column_names,
label_column_name=label_column_name,
settings=settings,
masks_required=masks_required,
)
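
A hedged caller-side sketch of the reworked entry point (it assumes the AzureML evaluation environment; the MLTable path is a placeholder, and the column names are the defaults used elsewhere in this PR):

import constants
from image_dataset import get_image_dataset

df = get_image_dataset(
    task_type=constants.TASK.IMAGE_OBJECT_DETECTION,
    test_mltable="azureml/test_od_mltable",  # placeholder path
    input_column_names=["image_url", "image_meta_info", "text_prompt"],
    label_column_name="label",
)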
4 changes: 3 additions & 1 deletion assets/training/model_evaluation/src/model_prediction.py
@@ -127,7 +127,9 @@ def load_data(self, test_data):
if self.extra_y_test_cols is not None:
all_cols += self.extra_y_test_cols

data, file_ext = read_model_prediction_data(test_data, self.task, self.batch_size)
data, file_ext = read_model_prediction_data(
test_data, self.input_column_names, self.label_column_name, self.task, self.batch_size
)
data = map(prepare_data, data, repeat(self.task), repeat(all_cols), repeat(self.label_column_name),
repeat(False), repeat(self.extra_y_test_cols), repeat(self.batch_size), repeat(file_ext))
return data # X_test, y_test
51 changes: 41 additions & 10 deletions assets/training/model_evaluation/src/utils.py
@@ -634,11 +634,15 @@ def _get_file_extension(file_path):
return os.path.splitext(file_path)[1].lower()


def read_model_prediction_data(file_path, task=None, batch_size=None, nrows=None):
def read_model_prediction_data(
file_path, input_column_names, label_column_name, task=None, batch_size=None, nrows=None
):
"""Util function for reading test data for model prediction.

Args:
file_path (_type_): _description_
input_column_names (List[str]): Names of input columns.
label_column_name (str): Name of label column.
task (_type_): _description_
batch_size (_type_): _description_
nrows (_type_): _description_
@@ -674,7 +678,10 @@ def read_model_prediction_data(file_path, task=None, batch_size=None, nrows=None

# Read the dataset from the MLTable.
from image_dataset import get_image_dataset
df = get_image_dataset(task_type=task, test_mltable=file_path)
df = get_image_dataset(
task_type=task, test_mltable=file_path,
input_column_names=input_column_names, label_column_name=label_column_name
)
data = iter([df])
file_ext = SupportedFileExtensions.IMAGE

@@ -1097,16 +1104,38 @@ def parse_input_ground_truth_col(col_name):
return col_name, extra_cols


def get_column_names(args, data):
"""Get Column names from test data."""
def get_sample_data_and_column_names(args):
"""Get sample data and column names based on the specified arguments."""
data_path = args[ArgumentLiterals.DATA]
task = args[ArgumentLiterals.TASK]
if task in constants.IMAGE_TASKS:
input_column_names = [ImageDataFrameParams.IMAGE_COLUMN_NAME]
label_column_name = ImageDataFrameParams.LABEL_COLUMN_NAME
if args[ArgumentLiterals.INPUT_COLUMN_NAMES]:
input_column_names = args[ArgumentLiterals.INPUT_COLUMN_NAMES]
else:
input_column_names = [ImageDataFrameParams.IMAGE_COLUMN_NAME]
if task in [constants.TASK.IMAGE_OBJECT_DETECTION, constants.TASK.IMAGE_INSTANCE_SEGMENTATION]:
input_column_names.extend([ImageDataFrameParams.IMAGE_META_INFO, ImageDataFrameParams.TEXT_PROMPT])

if args[ArgumentLiterals.LABEL_COLUMN_NAME]:
if len(args[ArgumentLiterals.LABEL_COLUMN_NAME]) != 1:
message = "Must specify only one label column for vision tasks."
exception = get_azureml_exception(
ArgumentValidationException, ArgumentParsingError, None, error=message
)
log_traceback(exception, logger)
raise exception

label_column_name = args[ArgumentLiterals.LABEL_COLUMN_NAME][0]
else:
label_column_name = ImageDataFrameParams.LABEL_COLUMN_NAME

extra_y_test_cols = None
if task in [constants.TASK.IMAGE_OBJECT_DETECTION, constants.TASK.IMAGE_INSTANCE_SEGMENTATION]:
input_column_names.extend([ImageDataFrameParams.IMAGE_META_INFO, ImageDataFrameParams.TEXT_PROMPT])

sample_data, _ = read_model_prediction_data(data_path, input_column_names, label_column_name, task)

else:
sample_data, _ = read_model_prediction_data(data_path, [], "", task, nrows=1)

# If input_column_names is not passed as an argument, we retain all columns
label_column_name = args[ArgumentLiterals.LABEL_COLUMN_NAME]
if label_column_name is None:
@@ -1120,14 +1149,16 @@ def get_column_names(args, data):

input_column_names = args[ArgumentLiterals.INPUT_COLUMN_NAMES]
if input_column_names is None or len(input_column_names) == 0:
input_column_names = list(data.columns)
input_column_names = list(sample_data.columns)
if label_column_name is not None and label_column_name in input_column_names:
input_column_names.remove(label_column_name)
if extra_y_test_cols is not None:
for col in extra_y_test_cols:
if col in input_column_names:
input_column_names.remove(col)
return input_column_names, label_column_name, extra_y_test_cols

sample_data = list(sample_data)[0]
return sample_data, input_column_names, label_column_name, extra_y_test_cols
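
A hedged sketch of how a caller might exercise the new helper; the args keys mirror the ArgumentLiterals usage above, and the concrete values are placeholders:

import constants
from constants import ArgumentLiterals
from utils import get_sample_data_and_column_names

args = {
    ArgumentLiterals.DATA: "azureml/test_od_mltable",  # placeholder path
    ArgumentLiterals.TASK: constants.TASK.IMAGE_OBJECT_DETECTION,
    ArgumentLiterals.INPUT_COLUMN_NAMES: None,  # fall back to the vision defaults
    ArgumentLiterals.LABEL_COLUMN_NAME: ["label"],
}
sample_data, input_cols, label_col, extra_cols = get_sample_data_and_column_names(args)
# input_cols -> ['image_url', 'image_meta_info', 'text_prompt']; label_col -> 'label'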


def openai_init(llm_config, **openai_params):
19 changes: 12 additions & 7 deletions assets/training/model_evaluation/src/validation.py
@@ -8,8 +8,9 @@
ArgumentValidationException,
)
from constants import ALL_TASKS, TASK, ArgumentLiterals
from image_constants import SettingLiterals
from logging_utilities import get_logger, log_traceback, get_azureml_exception
from utils import assert_and_raise, read_config, read_config_str, read_model_prediction_data, get_column_names
from utils import assert_and_raise, read_config, read_config_str, get_sample_data_and_column_names
from error_definitions import (
InvalidTaskType,
InvalidModel,
@@ -228,15 +229,19 @@ def validate_and_get_columns(args):
Args:
args (_type_): _description_
"""
logger.info("Reading top row in data for validation.")
data, _ = read_model_prediction_data(args[ArgumentLiterals.DATA], args[ArgumentLiterals.TASK], nrows=1)
data = list(data)[0]
input_column_names, label_column_name, extra_y_test_cols = get_column_names(args, data)

validate_input_column_names(input_column_names, data)
logger.info("Reading top row in data for column name extraction and validation.")
data, input_column_names, label_column_name, extra_y_test_cols = get_sample_data_and_column_names(args)

task = args[ArgumentLiterals.TASK]
config = args[ArgumentLiterals.CONFIG]

if task in constants.IMAGE_TASKS:
# Vision datasets must have an image_url and a label column. The input columns for model prediction will be
# constructed from these two (passed through in most cases).
validate_input_column_names([SettingLiterals.IMAGE_URL, SettingLiterals.LABEL], data)
else:
validate_input_column_names(input_column_names, data)

if task == TASK.TEXT_GENERATION:
if config.get(constants.TextGenerationColumns.SUBTASKKEY, "") == constants.SubTask.CODEGENERATION:
# Ensure that user always has "," in label_col_name
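
To make the vision requirement concrete: after loading, each test row must expose at least the two required columns; a hypothetical record:

# Hypothetical row of a vision test dataset after loading; the keys match
# SettingLiterals.IMAGE_URL and SettingLiterals.LABEL from image_constants.py.
row = {"image_url": "AmlDatastore://images/0001.png", "label": "cat"}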