From 076221cc4ed39223da0059ca85e9725199eba73b Mon Sep 17 00:00:00 2001 From: faradawn Date: Thu, 30 Nov 2023 10:41:16 -0600 Subject: [PATCH 1/7] add model and pytest file, passed all tests Signed-off-by: faradawn --- opensearch_py_ml/ml_models/__init__.py | 3 +- .../ml_models/question_answering_model.py | 542 ++++++++++++++++++ .../test_question_answering_pytest.py | 261 +++++++++ 3 files changed, 805 insertions(+), 1 deletion(-) create mode 100644 opensearch_py_ml/ml_models/question_answering_model.py create mode 100644 tests/ml_models/test_question_answering_pytest.py diff --git a/opensearch_py_ml/ml_models/__init__.py b/opensearch_py_ml/ml_models/__init__.py index 3ec96ebd..2dd7b430 100644 --- a/opensearch_py_ml/ml_models/__init__.py +++ b/opensearch_py_ml/ml_models/__init__.py @@ -7,5 +7,6 @@ from .metrics_correlation.mcorr import MCorr from .sentencetransformermodel import SentenceTransformerModel +from .question_answering_model import QuestionAnsweringModel -__all__ = ["SentenceTransformerModel", "MCorr"] +__all__ = ["SentenceTransformerModel", "SentenceTransformerModel", "MCorr"] diff --git a/opensearch_py_ml/ml_models/question_answering_model.py b/opensearch_py_ml/ml_models/question_answering_model.py new file mode 100644 index 00000000..fcafde68 --- /dev/null +++ b/opensearch_py_ml/ml_models/question_answering_model.py @@ -0,0 +1,542 @@ +# SPDX-License-Identifier: Apache-2.0 +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. +# Any modifications Copyright OpenSearch Contributors. See +# GitHub history for details. + +import json +import os +import pickle +import platform +import random +import re +import shutil +import subprocess +import time +from pathlib import Path +from typing import List +from zipfile import ZipFile + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import requests +import torch +import yaml +from accelerate import Accelerator, notebook_launcher +from mdutils.fileutils import MarkDownFile +# from sentence_transformers import SentenceTransformer +# from sentence_transformers.models import Normalize, Pooling, Transformer +from torch.utils.data import DataLoader +from tqdm import tqdm +from transformers import AutoTokenizer, AutoModelForQuestionAnswering +import transformers + + +from opensearch_py_ml.ml_commons.ml_common_utils import ( + _generate_model_content_hash_value, +) + +LICENSE_URL = "https://github.com/opensearch-project/opensearch-py-ml/raw/main/LICENSE" + + +class QuestionAnsweringModel: + """ + Class for tracing the QuestionAnswering model. + """ + # distilbert-base-cased-distilled-squad + DEFAULT_MODEL_ID = "distilbert-base-cased-distilled-squad" + SYNTHETIC_QUERY_FOLDER = "synthetic_queries" + + def __init__( + self, + model_id: str = DEFAULT_MODEL_ID, + folder_path: str = None, + overwrite: bool = False, + ) -> None: + """ + Initiate a question answering model class object. 
The model id will be used to download + pretrained model from the hugging-face and served as the default name for model files, and the folder_path + will be the default location to store files generated in the following functions + + :param model_id: Optional, the huggingface mode id to download the model, + default model id: 'distilbert-base-cased-distilled-squad' + :type model_id: string + :param folder_path: Optional, the path of the folder to save output files, such as queries, pre-trained model, + after-trained custom model and configuration files. if None, default as "/model_files/" under the current + work directory + :type folder_path: string + :param overwrite: Optional, choose to overwrite the folder at folder path. Default as false. When training + different question answering models, it's recommended to give designated folder path every time. + Users can choose to overwrite = True to overwrite previous runs + :type overwrite: bool + :return: no return value expected + :rtype: None + """ + default_folder_path = os.path.join( + os.getcwd(), "question_answering_model_files" + ) + + if folder_path is None: + self.folder_path = default_folder_path + else: + self.folder_path = folder_path + + # Check if self.folder_path exists + if os.path.exists(self.folder_path) and not overwrite: + print( + "To prevent overwriting, please enter a different folder path or delete the folder or enable " + "overwrite = True " + ) + raise Exception( + str("The default folder path already exists at : " + self.folder_path) + ) + + self.model_id = model_id + self.torch_script_zip_file_path = None + self.onnx_zip_file_path = None + + + def save_as_pt( + self, + sentences: [str] = ["today is sunny"], + model_id="distilbert-base-cased-distilled-squad", + model_name: str = None, + save_json_folder_path: str = None, + model_output_path: str = None, + zip_file_name: str = None, + add_apache_license: bool = False, + ) -> str: + """ + Download the model directly from huggingface, convert model to torch script format, + zip the model file and its tokenizer.json file to prepare to upload to the Open Search cluster + + :param sentences: + Required, for example sentences = ['today is sunny'] + :type sentences: List of string [str] + :param model_id: + question answering model id to download model from question answerings. + default model_id = "distilbert-base-cased-distilled-squad" + :type model_id: string + :param model_name: + Optional, model name to name the model file, e.g, "sample_model.pt". If None, default takes the + model_id and add the extension with ".pt" + :type model_name: string + :param save_json_folder_path: + Optional, path to save model json file, e.g, "home/save_pre_trained_model_json/"). If None, default as + default_folder_path from the constructor + :type save_json_folder_path: string + :param model_output_path: + Optional, path to save traced model zip file. If None, default as + default_folder_path from the constructor + :type model_output_path: string + :param zip_file_name: + Optional, file name for zip file. e.g, "sample_model.zip". If None, default takes the model_id + and add the extension with ".zip" + :type zip_file_name: string + :param add_apache_license: + Optional, whether to add a Apache-2.0 license file to model zip file + :type add_apache_license: string + :return: model zip file path. 
The file path where the zip file is being saved + :rtype: string + """ + + tokenizer = AutoTokenizer.from_pretrained(model_id) + model = AutoModelForQuestionAnswering.from_pretrained(model_id) + + if model_name is None: + model_name = str(model_id.split("/")[-1] + ".pt") + + model_path = os.path.join(self.folder_path, model_name) + + if save_json_folder_path is None: + save_json_folder_path = self.folder_path + + if model_output_path is None: + model_output_path = self.folder_path + + if zip_file_name is None: + zip_file_name = str(model_id.split("/")[-1] + ".zip") + zip_file_path = os.path.join(model_output_path, zip_file_name) + + # handle when model_max_length is unproperly defined in model's tokenizer (e.g. "intfloat/e5-small-v2") + # MODEL_MAX_SEQ_LENGTH = 512 + # if tokenizer.model_max_length > model.get_max_seq_length(): + # tokenizer.model_max_length = model.get_max_seq_length() + # print( + # f"The model_max_length is not properly defined in tokenizer_config.json. Setting it to be {tokenizer.model_max_length}" + # ) + + # save tokenizer.json in save_json_folder_name + # max_position_embeddings + + # AutoTokenizer will save tokenizer.json in save_json_folder_name + # DistilBertTokenizer will save it in cache: /Users/faradawn/.cache/huggingface/hub/models/... + tokenizer.save_pretrained(save_json_folder_path) + tokenizer_file_path = os.path.join(save_json_folder_path, "tokenizer.json") + # Open the tokenizer.json and replace the truncation field + with open(tokenizer_file_path) as user_file: + parsed_json = json.load(user_file) + + if "truncation" not in parsed_json or parsed_json["truncation"] is None: + parsed_json["truncation"] = { + "direction": "Right", + "max_length": tokenizer.model_max_length, + "strategy": "LongestFirst", + "stride": 0, + } + + tokenizer_file_path = os.path.join(save_json_folder_path, "tokenizer.json") + with open(tokenizer_file_path, "w") as file: + json.dump(parsed_json, file, indent=2) + + + # convert to pt format will need to be in cpu, + # set the device to cpu, convert its input_ids and attention_mask in cpu and save as .pt format + device = torch.device("cpu") + cpu_model = model.to(device) + features = tokenizer( + sentences, return_tensors="pt", padding=True, truncation=True + ).to(device) + + compiled_model = torch.jit.trace( + cpu_model, + (features["input_ids"], features["attention_mask"]), + strict=False + ) + torch.jit.save(compiled_model, model_path) + print("Traced torchscript model is saved to ", model_path) + + # zip model file along with tokenizer.json (and license file) as output + with ZipFile(str(zip_file_path), "w") as zipObj: + zipObj.write( + model_path, + arcname=str(model_name), + ) + zipObj.write( + os.path.join(save_json_folder_path, "tokenizer.json"), + arcname="tokenizer.json", + ) + if add_apache_license: + self._add_apache_license_to_model_zip_file(zip_file_path) + + self.torch_script_zip_file_path = zip_file_path + print("zip file is saved to ", zip_file_path, "\n") + return zip_file_path + + def save_as_onnx( + self, + model_id="distilbert-base-cased-distilled-squad", + model_name: str = None, + save_json_folder_path: str = None, + model_output_path: str = None, + zip_file_name: str = None, + add_apache_license: bool = False, + ) -> str: + """ + Download question answering model directly from huggingface, convert model to onnx format, + zip the model file and its tokenizer.json file to prepare to upload to the Open Search cluster + + :param model_id: + question answering model id to download model from question answerings. 
+ default model_id = "distilbert-base-cased-distilled-squad" + :type model_id: string + :param model_name: + Optional, model name to name the model file, e.g, "sample_model.pt". If None, default takes the + model_id and add the extension with ".pt" + :type model_name: string + :param save_json_folder_path: + Optional, path to save model json file, e.g, "home/save_pre_trained_model_json/"). If None, default as + default_folder_path from the constructor + :type save_json_folder_path: string + :param model_output_path: + Optional, path to save traced model zip file. If None, default as + default_folder_path from the constructor + :type model_output_path: string + :param zip_file_name: + Optional, file name for zip file. e.g, "sample_model.zip". If None, default takes the model_id + and add the extension with ".zip" + :type zip_file_name: string + :param add_apache_license: + Optional, whether to add a Apache-2.0 license file to model zip file + :type add_apache_license: string + :return: model zip file path. The file path where the zip file is being saved + :rtype: string + """ + + tokenizer = AutoTokenizer.from_pretrained(model_id) + model = AutoModelForQuestionAnswering.from_pretrained(model_id) + + if model_name is None: + model_name = str(model_id.split("/")[-1] + ".onnx") + + model_path = os.path.join(self.folder_path, model_name) + + if save_json_folder_path is None: + save_json_folder_path = self.folder_path + + if model_output_path is None: + model_output_path = self.folder_path + + if zip_file_name is None: + zip_file_name = str(model_id.split("/")[-1] + ".zip") + zip_file_path = os.path.join(model_output_path, zip_file_name) + + # save tokenizer.json in save_json_folder_name + tokenizer.save_pretrained(save_json_folder_path) + + # Find the tokenizer.json file path in cache: /Users/faradawn/.cache/huggingface/hub/models/... 
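# Some tokenizers ship tokenizer.json with "truncation": null; the block below
# fills in an explicit right-truncation policy capped at the tokenizer's
# model_max_length so that the zipped tokenizer truncates long question/context
# pairs consistently once it is loaded for inference.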
+ tokenizer_file_path = os.path.join(save_json_folder_path, "tokenizer.json") + + # Open the tokenizer.json and replace the truncation field + with open(tokenizer_file_path) as user_file: + parsed_json = json.load(user_file) + + if "truncation" not in parsed_json or parsed_json["truncation"] is None: + parsed_json["truncation"] = { + "direction": "Right", + "max_length": tokenizer.model_max_length, + "strategy": "LongestFirst", + "stride": 0, + } + + # Save tokenizer + tokenizer_file_path = os.path.join(save_json_folder_path, "tokenizer.json") + with open(tokenizer_file_path, "w") as file: + json.dump(parsed_json, file, indent=2) + + # load config + model_kind, model_onnx_config = transformers.onnx.FeaturesManager.check_supported_model_or_raise(model, feature="question-answering") + onnx_config = model_onnx_config(model.config) + + # export + onnx_inputs, onnx_outputs = transformers.onnx.export( + preprocessor=tokenizer, + model=model, + config=onnx_config, + opset=13, + output=Path(model_path) + ) + + print("Traced onnx model is saved to ", model_path) + + # zip model file along with tokenizer.json (and license file) as output + with ZipFile(str(zip_file_path), "w") as zipObj: + zipObj.write( + model_path, + arcname=str(model_name), + ) + zipObj.write( + os.path.join(save_json_folder_path, "tokenizer.json"), + arcname="tokenizer.json", + ) + if add_apache_license: + self._add_apache_license_to_model_zip_file(zip_file_path) + + self.onnx_zip_file_path = zip_file_path + print("zip file is saved to ", zip_file_path, "\n") + return zip_file_path + + + + def make_model_config_json( + self, + model_name: str = None, + version_number: str = 1, + model_format: str = "TORCH_SCRIPT", + model_zip_file_path: str = None, + embedding_dimension: int = None, + pooling_mode: str = None, + normalize_result: bool = None, + description: str = None, + all_config: str = None, + model_type: str = None, + verbose: bool = False, + ) -> str: + """ + Parse from config.json file of pre-trained hugging-face model to generate a ml-commons_model_config.json file. + If all required fields are given by users, use the given parameters and will skip reading the config.json + + :param model_name: + Optional, The name of the model. If None, default is model id, for example, + 'distilbert-base-cased-distilled-squad' + :type model_name: string + :param model_format: + Optional, the format of the model. Default is "TORCH_SCRIPT". + :type model_format: string + :param model_zip_file_path: + Optional, path to the model zip file. Default is the zip file path used in save_as_pt or save_as_onnx + depending on model_format. This zip file is used to compute model_content_size_in_bytes and + model_content_hash_value. + :type model_zip_file_path: string + :param version_number: + Optional, The version number of the model. Default is 1 + :type version_number: string + :param embedding_dimension: Optional, the embedding dimension of the model. If None, get embedding_dimension + from the pre-trained hugging-face model object. + :type embedding_dimension: int + :param pooling_mode: Optional, the pooling mode of the model. If None, get pooling_mode + from the pre-trained hugging-face model object. + :type pooling_mode: string + :param normalize_result: Optional, whether to normalize the result of the model. If None, check from the pre-trained + hugging-face model object. + :type normalize_result: bool + :param description: Optional, the description of the model. If None, get description from the README.md + file in the model folder. 
+ :type description: str + :param all_config: + Optional, the all_config of the model. If None, parse all contents from the config file of pre-trained + hugging-face model + :type all_config: dict + :param model_type: + Optional, the model_type of the model. If None, parse model_type from the config file of pre-trained + hugging-face model + :type model_type: string + :param verbose: + optional, use printing more logs. Default as false + :type verbose: bool + :return: model config file path. The file path where the model config file is being saved + :rtype: string + """ + folder_path = self.folder_path + config_json_file_path = os.path.join(folder_path, "config.json") + if model_name is None: + model_name = self.model_id + + # if user input model_type/embedding_dimension/pooling_mode, it will skip this step. + model = AutoModelForQuestionAnswering.from_pretrained(self.model_id) + model.save_pretrained(self.folder_path) + + + # fill the empty fields + if ( + model_type is None + or embedding_dimension is None + or pooling_mode is None + or normalize_result is None + ): + try: + if embedding_dimension is None: + embedding_dimension = model.config.dim + if model_type is None: + model_type = "distilbert" + if pooling_mode is None: + pooling_mode = "CLS" + if normalize_result is None: + normalize_result = False + + # for str_idx, module in model._modules.items(): + # print(f"=== idx {str_idx}, module name {module.__class__.__name__}, module {module}") + # if model_type is None and isinstance(module, Transformer): + # model_type = module.auto_model.__class__.__name__ + # model_type = model_type.lower().rstrip("model") + # elif pooling_mode is None and isinstance(module, Pooling): + # pooling_mode = module.get_pooling_mode_str().upper() + # elif normalize_result is None and isinstance(module, Normalize): + # normalize_result = True + # TODO: Support 'Dense' module + + except Exception as e: + raise Exception( + f"Raised exception while getting model data from pre-trained hugging-face model object: {e}" + ) + + # fill the description + if description is None: + readme_file_path = os.path.join(self.folder_path, "README.md") + if os.path.exists(readme_file_path): + try: + if verbose: + print("reading README.md file") + description = self._get_model_description_from_readme_file( + readme_file_path + ) + except Exception as e: + print(f"Cannot scrape model description from README.md file: {e}") + description = self._generate_default_model_description( + embedding_dimension + ) + else: + print("Using default model description") + description = "This is a question-answering model: it provides answers to a question and context." + + # dump the config.json file + if all_config is None: + if not os.path.exists(config_json_file_path): + raise Exception( + str( + "Cannot find config.json in" + + config_json_file_path + + ". Please check the config.son file in the path." + ) + ) + try: + with open(config_json_file_path) as f: + if verbose: + print("reading config file from: " + config_json_file_path) + config_content = json.load(f) + if all_config is None: + all_config = config_content + except IOError: + print( + "Cannot open in config.json file at ", + config_json_file_path, + ". 
Please check the config.json ", + "file in the path.", + ) + + model_config_content = { + "name": model_name, + "version": version_number, + "description": description, + "model_format": model_format, + "model_task_type": "QUESTION_ANSWERING", + "model_config": { + "model_type": model_type, + "embedding_dimension": embedding_dimension, + "framework_type": "sentence_transformers", + "pooling_mode": pooling_mode, + "normalize_result": normalize_result, + "all_config": json.dumps(all_config), + }, + } + + # get model size and hash value + if model_zip_file_path is None: + model_zip_file_path = ( + self.torch_script_zip_file_path + if model_format == "TORCH_SCRIPT" + else self.onnx_zip_file_path + ) + + # model_zip_file_path = '/Users/faradawn/CS/opensearch-py-ml/opensearch_py_ml/ml_models/question-model-folder/distilbert-base-cased-distilled-squad.zip' + + if model_zip_file_path is None: + print( + "The model configuration JSON file currently lacks the 'model_content_size_in_bytes' and 'model_content_hash_value' fields. You can include these fields by specifying the 'model_zip_file_path' parameter. Failure to do so may result in the model registration process encountering issues." + ) + else: + model_config_content["model_content_size_in_bytes"] = os.stat( + model_zip_file_path + ).st_size + model_config_content[ + "model_content_hash_value" + ] = _generate_model_content_hash_value(model_zip_file_path) + + if verbose: + print("generating ml-commons_model_config.json file...\n") + print(json.dumps(model_config_content, indent=4)) + + model_config_file_path = os.path.join( + folder_path, "ml-commons_model_config.json" + ) + os.makedirs(os.path.dirname(model_config_file_path), exist_ok=True) + with open(model_config_file_path, "w") as file: + json.dump(model_config_content, file, indent=4) + print( + "ml-commons_model_config.json file is saved at : ", model_config_file_path + ) + + return model_config_file_path + diff --git a/tests/ml_models/test_question_answering_pytest.py b/tests/ml_models/test_question_answering_pytest.py new file mode 100644 index 00000000..bdfe81ac --- /dev/null +++ b/tests/ml_models/test_question_answering_pytest.py @@ -0,0 +1,261 @@ +# SPDX-License-Identifier: Apache-2.0 +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. +# Any modifications Copyright OpenSearch Contributors. See +# GitHub history for details. 
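# A minimal usage sketch of the QuestionAnsweringModel class introduced above:
# trace the default model to TorchScript, then generate the ml-commons config.
# The working directory and helper name below are illustrative only and are not
# part of the test suite; registration against a cluster is omitted.
def _example_trace_and_config(workdir="./qa_model_files"):
    from opensearch_py_ml.ml_models import QuestionAnsweringModel

    qa_model = QuestionAnsweringModel(folder_path=workdir, overwrite=True)
    # Writes <workdir>/distilbert-base-cased-distilled-squad.zip containing the
    # traced .pt file and tokenizer.json.
    zip_path = qa_model.save_as_pt(
        sentences=["today is sunny"],
        model_id="distilbert-base-cased-distilled-squad",
    )
    # Downloads the Hugging Face config and writes ml-commons_model_config.json
    # (including the zip's size and hash) into the same folder.
    config_path = qa_model.make_model_config_json(model_format="TORCH_SCRIPT")
    return zip_path, config_path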
+# How to run: pytest tests/ml_models/test_question_answering_pytest.py + +import json +import os +import shutil +from zipfile import ZipFile + +import pytest + +from opensearch_py_ml.ml_models import QuestionAnsweringModel + +TEST_FOLDER = os.path.join( + os.path.dirname(os.path.abspath("__file__")), "tests", "test_model_files" +) +TESTDATA_FILENAME = os.path.join( + os.path.dirname(os.path.abspath("__file__")), "tests", "sample_zip.zip" +) +TESTDATA_UNZIP_FOLDER = os.path.join( + os.path.dirname(os.path.abspath("__file__")), "tests", "sample_zip" +) + +default_model_id = "distilbert-base-cased-distilled-squad" + +def clean_test_folder(TEST_FOLDER): + if os.path.exists(TEST_FOLDER): + for files in os.listdir(TEST_FOLDER): + sub_path = os.path.join(TEST_FOLDER, files) + if os.path.isfile(sub_path): + os.remove(sub_path) + else: + try: + shutil.rmtree(sub_path) + except OSError as err: + print( + "Fail to delete files, please delete all files in " + + str(TEST_FOLDER) + + " " + + str(err) + ) + + shutil.rmtree(TEST_FOLDER) + + +def compare_model_config( + model_config_path, + model_id, + model_format, + expected_model_description=None, + expected_model_config_data=None, +): + try: + with open(model_config_path) as json_file: + model_config_data = json.load(json_file) + except Exception as exec: + assert ( + False + ), f"Creating model config file for tracing in {model_format} raised an exception {exec}" + + assert ( + "name" in model_config_data and model_config_data["name"] == model_id + ), f"Missing or Wrong model name in {model_format} model config file" + + assert ( + "model_format" in model_config_data + and model_config_data["model_format"] == model_format + ), f"Missing or Wrong model_format in {model_format} model config file" + + if expected_model_description is not None: + assert ( + "description" in model_config_data + and model_config_data["description"] == expected_model_description + ), f"Missing or Wrong model description in {model_format} model config file'" + + if expected_model_config_data is not None: + assert ( + "model_config" in model_config_data + ), f"Missing 'model_config' in {model_format} model config file" + + if expected_model_config_data is not None: + for k, v in expected_model_config_data.items(): + assert ( + k in model_config_data["model_config"] + and model_config_data["model_config"][k] == v + ) or ( + k not in model_config_data["model_config"] + and k == "normalize_result" + and not v + ) + + assert ( + "model_content_size_in_bytes" in model_config_data + ), f"Missing 'model_content_size_in_bytes' in {model_format} model config file" + + assert ( + "model_content_hash_value" in model_config_data + ), f"Missing 'model_content_hash_value' in {model_format} model config file" + + +def compare_model_zip_file(zip_file_path, expected_filenames, model_format): + with ZipFile(zip_file_path, "r") as f: + filenames = set(f.namelist()) + assert ( + filenames == expected_filenames + ), f"The content in the {model_format} model zip file does not match the expected content: {filenames} != {expected_filenames}" + + + + + +def test_check_attribute(): + test_model = QuestionAnsweringModel(folder_path=TEST_FOLDER) + try: + check_attribute = getattr(test_model, "model_id", "folder_path") + except AttributeError: + check_attribute = False + assert check_attribute + + assert test_model.folder_path == TEST_FOLDER + assert test_model.model_id == default_model_id + + default_folder = os.path.join(os.getcwd(), "question_answering_model_files") + + 
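# Constructing the model with no folder_path should fall back to
# <cwd>/question_answering_model_files; clean that folder before and after so
# the constructor's overwrite guard is not triggered by leftover files.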
clean_test_folder(default_folder) + test_model0 = QuestionAnsweringModel() + assert test_model0.folder_path == default_folder + clean_test_folder(default_folder) + + clean_test_folder(TEST_FOLDER) + our_model_id = "distilbert-base-cased-distilled-squad" + test_model1 = QuestionAnsweringModel( + folder_path=TEST_FOLDER, model_id=our_model_id + ) + assert test_model1.model_id == our_model_id + + +def test_folder_path(): + with pytest.raises(Exception) as exc_info: + test_non_empty_path = os.path.join( + os.path.dirname(os.path.abspath("__file__")), "tests" + ) + QuestionAnsweringModel(folder_path=test_non_empty_path, overwrite=False) + assert exc_info.type is Exception + assert "The default folder path already exists" in exc_info.value.args[0] + + +# New tests for save_as_pt and save_as_onnx + +test_cases = [ + { + "question": "Who was Jim Henson?", + "context": "Jim Henson was a nice puppet" + }, + { + "question": "Where do I live?", + "context": "My name is Sarah and I live in London" + }, + { + "question": "What's my name?", + "context": "My name is Clara and I live in Berkeley." + }, + { + "question": "Which name is also used to describe the Amazon rainforest in English?", + "context": "The Amazon rainforest (Portuguese: Floresta Amazônica or Amazônia; Spanish: Selva Amazónica, Amazonía or usually Amazonia; French: Forêt amazonienne; Dutch: Amazoneregenwoud), also known in English as Amazonia or the Amazon Jungle, is a moist broadleaf forest that covers most of the Amazon basin of South America. This basin encompasses 7,000,000 square kilometres (2,700,000 sq mi), of which 5,500,000 square kilometres (2,100,000 sq mi) are covered by the rainforest. This region includes territory belonging to nine nations. The majority of the forest is contained within Brazil, with 60% of the rainforest, followed by Peru with 13%, Colombia with 10%, and with minor amounts in Venezuela, Ecuador, Bolivia, Guyana, Suriname and French Guiana. States or departments in four nations contain 'Amazonas' in their names. The Amazon represents over half of the planet's remaining rainforests, and comprises the largest and most biodiverse tract of tropical rainforest in the world, with an estimated 390 billion individual trees divided into 16,000 species." 
+ } +] + +def get_official_answer(test_cases): + # Obtain pytorch's official model + from transformers import AutoTokenizer, AutoModelForQuestionAnswering + import torch + tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased-distilled-squad') + official_model = AutoModelForQuestionAnswering.from_pretrained('distilbert-base-cased-distilled-squad') + + results = [] + + for case in test_cases: + question, context = case["question"], case["context"] + inputs = tokenizer(question, context, return_tensors="pt") + with torch.no_grad(): + outputs = official_model(**inputs) + answer_start_index = torch.argmax(outputs.start_logits, dim=-1).item() + answer_end_index = torch.argmax(outputs.end_logits, dim=-1).item() + results.append([answer_start_index, answer_end_index]) + + return results + +def get_pt_answer(test_cases, folder_path, model_id): + from transformers import AutoTokenizer, AutoModelForQuestionAnswering + import torch + tokenizer = AutoTokenizer.from_pretrained(model_id) + traced_model = torch.jit.load(f"{folder_path}/{model_id}.pt") + + results = [] + + for case in test_cases: + question, context = case["question"], case["context"] + inputs = tokenizer(question, context, return_tensors="pt") + + with torch.no_grad(): + outputs = traced_model(**inputs) + answer_start_index = torch.argmax(outputs["start_logits"], dim=-1).item() + answer_end_index = torch.argmax(outputs["end_logits"], dim=-1).item() + results.append([answer_start_index, answer_end_index]) + + return results + + +def get_onnx_answer(test_cases, folder_path, model_id): + from transformers import AutoTokenizer + from onnxruntime import InferenceSession + import numpy as np + session = InferenceSession(f"{folder_path}/{model_id}.onnx") + tokenizer = AutoTokenizer.from_pretrained(model_id) + + results = [] + + for case in test_cases: + question, context = case["question"], case["context"] + inputs = tokenizer(question, context, return_tensors="pt") + + inputs = tokenizer(question, context, return_tensors="np") + outputs = session.run(output_names=["start_logits", "end_logits"], input_feed=dict(inputs)) + + answer_start_index = np.argmax(outputs[0], axis=-1).item() + answer_end_index = np.argmax(outputs[1], axis=-1).item() + results.append([answer_start_index, answer_end_index]) + + return results + + +def test_pt_answer(): + test_model = QuestionAnsweringModel(folder_path=TEST_FOLDER, overwrite=True) + test_model.save_as_pt(default_model_id) + pt_results = get_pt_answer(test_cases, TEST_FOLDER, default_model_id) + official_results = get_official_answer(test_cases) + for i in range(len(pt_results)): + assert pt_results[i] == official_results[i], f"Failed at index {i}: pt_results[{i}] ({pt_results[i]}) != official_results[{i}] ({official_results[i]})" + + clean_test_folder(TEST_FOLDER) + clean_test_folder(TESTDATA_UNZIP_FOLDER) + +def test_onnx_answer(): + test_model = QuestionAnsweringModel(folder_path=TEST_FOLDER, overwrite=True) + test_model.save_as_onnx(default_model_id) + onnx_results = get_onnx_answer(test_cases, TEST_FOLDER, default_model_id) + official_results = get_official_answer(test_cases) + for i in range(len(onnx_results)): + assert onnx_results[i] == official_results[i], f"Failed at index {i}: onnx_results[{i}] ({onnx_results[i]}) != official_results[{i}] ({official_results[i]})" + + clean_test_folder(TEST_FOLDER) + clean_test_folder(TESTDATA_UNZIP_FOLDER) + + +clean_test_folder(TEST_FOLDER) +clean_test_folder(TESTDATA_UNZIP_FOLDER) \ No newline at end of file From 
485e36160c53e6ad7548d9c9a61188e20ccd3414 Mon Sep 17 00:00:00 2001 From: faradawn Date: Thu, 30 Nov 2023 13:08:51 -0600 Subject: [PATCH 2/7] remove comments and fix init.py Signed-off-by: faradawn --- opensearch_py_ml/ml_models/__init__.py | 2 +- .../ml_models/question_answering_model.py | 46 +------------------ 2 files changed, 2 insertions(+), 46 deletions(-) diff --git a/opensearch_py_ml/ml_models/__init__.py b/opensearch_py_ml/ml_models/__init__.py index 2dd7b430..74936be6 100644 --- a/opensearch_py_ml/ml_models/__init__.py +++ b/opensearch_py_ml/ml_models/__init__.py @@ -9,4 +9,4 @@ from .sentencetransformermodel import SentenceTransformerModel from .question_answering_model import QuestionAnsweringModel -__all__ = ["SentenceTransformerModel", "SentenceTransformerModel", "MCorr"] +__all__ = ["SentenceTransformerModel", "QuestionAnsweringModel", "MCorr"] diff --git a/opensearch_py_ml/ml_models/question_answering_model.py b/opensearch_py_ml/ml_models/question_answering_model.py index fcafde68..f078126e 100644 --- a/opensearch_py_ml/ml_models/question_answering_model.py +++ b/opensearch_py_ml/ml_models/question_answering_model.py @@ -7,33 +7,12 @@ import json import os -import pickle -import platform -import random -import re -import shutil -import subprocess -import time from pathlib import Path -from typing import List from zipfile import ZipFile - -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -import requests import torch -import yaml -from accelerate import Accelerator, notebook_launcher -from mdutils.fileutils import MarkDownFile -# from sentence_transformers import SentenceTransformer -# from sentence_transformers.models import Normalize, Pooling, Transformer -from torch.utils.data import DataLoader -from tqdm import tqdm from transformers import AutoTokenizer, AutoModelForQuestionAnswering import transformers - from opensearch_py_ml.ml_commons.ml_common_utils import ( _generate_model_content_hash_value, ) @@ -160,19 +139,7 @@ def save_as_pt( zip_file_name = str(model_id.split("/")[-1] + ".zip") zip_file_path = os.path.join(model_output_path, zip_file_name) - # handle when model_max_length is unproperly defined in model's tokenizer (e.g. "intfloat/e5-small-v2") - # MODEL_MAX_SEQ_LENGTH = 512 - # if tokenizer.model_max_length > model.get_max_seq_length(): - # tokenizer.model_max_length = model.get_max_seq_length() - # print( - # f"The model_max_length is not properly defined in tokenizer_config.json. Setting it to be {tokenizer.model_max_length}" - # ) - - # save tokenizer.json in save_json_folder_name - # max_position_embeddings - - # AutoTokenizer will save tokenizer.json in save_json_folder_name - # DistilBertTokenizer will save it in cache: /Users/faradawn/.cache/huggingface/hub/models/... 
+ tokenizer.save_pretrained(save_json_folder_path) tokenizer_file_path = os.path.join(save_json_folder_path, "tokenizer.json") # Open the tokenizer.json and replace the truncation field @@ -426,17 +393,6 @@ def make_model_config_json( if normalize_result is None: normalize_result = False - # for str_idx, module in model._modules.items(): - # print(f"=== idx {str_idx}, module name {module.__class__.__name__}, module {module}") - # if model_type is None and isinstance(module, Transformer): - # model_type = module.auto_model.__class__.__name__ - # model_type = model_type.lower().rstrip("model") - # elif pooling_mode is None and isinstance(module, Pooling): - # pooling_mode = module.get_pooling_mode_str().upper() - # elif normalize_result is None and isinstance(module, Normalize): - # normalize_result = True - # TODO: Support 'Dense' module - except Exception as e: raise Exception( f"Raised exception while getting model data from pre-trained hugging-face model object: {e}" From fda3f00f617d207b0e98c18a8091818f58272438 Mon Sep 17 00:00:00 2001 From: faradawn Date: Thu, 30 Nov 2023 13:09:29 -0600 Subject: [PATCH 3/7] remove model zip path comment Signed-off-by: faradawn --- opensearch_py_ml/ml_models/question_answering_model.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/opensearch_py_ml/ml_models/question_answering_model.py b/opensearch_py_ml/ml_models/question_answering_model.py index f078126e..ab4d06ef 100644 --- a/opensearch_py_ml/ml_models/question_answering_model.py +++ b/opensearch_py_ml/ml_models/question_answering_model.py @@ -466,8 +466,6 @@ def make_model_config_json( else self.onnx_zip_file_path ) - # model_zip_file_path = '/Users/faradawn/CS/opensearch-py-ml/opensearch_py_ml/ml_models/question-model-folder/distilbert-base-cased-distilled-squad.zip' - if model_zip_file_path is None: print( "The model configuration JSON file currently lacks the 'model_content_size_in_bytes' and 'model_content_hash_value' fields. You can include these fields by specifying the 'model_zip_file_path' parameter. Failure to do so may result in the model registration process encountering issues." From 907fcdb3c5be83f57f90b360e3588a3b66061ec6 Mon Sep 17 00:00:00 2001 From: faradawn Date: Fri, 1 Dec 2023 09:18:00 -0600 Subject: [PATCH 4/7] add onnxruntime in requirements-dev.txt Signed-off-by: faradawn --- requirements-dev.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-dev.txt b/requirements-dev.txt index 16640f77..2b268553 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -7,6 +7,7 @@ numpy>=1.24.0,<2 opensearch-py>=2.2.0 torch==2.0.1 onnx +onnxruntime accelerate sentence_transformers tqdm From eef1ac45ff97156e60ba1573ddea1c4b6bb14e54 Mon Sep 17 00:00:00 2001 From: faradawn Date: Thu, 21 Dec 2023 18:20:56 -0600 Subject: [PATCH 5/7] add unit tests and fixed optional parameters Signed-off-by: faradawn --- .../ml_models/question_answering_model.py | 60 ++-- .../test_question_answering_pytest.py | 259 +++++++++++++++++- 2 files changed, 290 insertions(+), 29 deletions(-) diff --git a/opensearch_py_ml/ml_models/question_answering_model.py b/opensearch_py_ml/ml_models/question_answering_model.py index ab4d06ef..2ef2c450 100644 --- a/opensearch_py_ml/ml_models/question_answering_model.py +++ b/opensearch_py_ml/ml_models/question_answering_model.py @@ -5,10 +5,14 @@ # Any modifications Copyright OpenSearch Contributors. See # GitHub history for details. 
+# for generating config import json import os from pathlib import Path from zipfile import ZipFile +import requests + +# for torch import torch from transformers import AutoTokenizer, AutoModelForQuestionAnswering import transformers @@ -76,7 +80,22 @@ def __init__( self.torch_script_zip_file_path = None self.onnx_zip_file_path = None - + def _add_apache_license_to_model_zip_file(self, model_zip_file_path: str): + """ + Add Apache-2.0 license file to the model zip file at model_zip_file_path + + :param model_zip_file_path: + Path to the model zip file + :type model_zip_file_path: string + :return: no return value expected + :rtype: None + """ + r = requests.get(LICENSE_URL) + assert r.status_code == 200, "Failed to add license file to the model zip file" + + with ZipFile(str(model_zip_file_path), "a") as zipObj: + zipObj.writestr("LICENSE", r.content) + def save_as_pt( self, sentences: [str] = ["today is sunny"], @@ -89,13 +108,13 @@ def save_as_pt( ) -> str: """ Download the model directly from huggingface, convert model to torch script format, - zip the model file and its tokenizer.json file to prepare to upload to the Open Search cluster + zip the model file and its tokenizer.json file to prepare to upload to the OpenSearch cluster :param sentences: - Required, for example sentences = ['today is sunny'] + Optional, for example sentences = ['today is sunny'] :type sentences: List of string [str] :param model_id: - question answering model id to download model from question answerings. + Optional, question answering model id to download model from Huggingface. default model_id = "distilbert-base-cased-distilled-squad" :type model_id: string :param model_name: @@ -203,10 +222,10 @@ def save_as_onnx( ) -> str: """ Download question answering model directly from huggingface, convert model to onnx format, - zip the model file and its tokenizer.json file to prepare to upload to the Open Search cluster + zip the model file and its tokenizer.json file to prepare to upload to the OpenSearch cluster :param model_id: - question answering model id to download model from question answerings. + Optional, question answering model id to download model from Huggingface. default model_id = "distilbert-base-cased-distilled-squad" :type model_id: string :param model_name: @@ -305,8 +324,6 @@ def save_as_onnx( print("zip file is saved to ", zip_file_path, "\n") return zip_file_path - - def make_model_config_json( self, model_name: str = None, @@ -349,7 +366,7 @@ def make_model_config_json( :param normalize_result: Optional, whether to normalize the result of the model. If None, check from the pre-trained hugging-face model object. :type normalize_result: bool - :param description: Optional, the description of the model. If None, get description from the README.md + :param description: Optional, the description of the model. If None, use the default description. file in the model folder. 
:type description: str :param all_config: @@ -385,7 +402,11 @@ def make_model_config_json( ): try: if embedding_dimension is None: - embedding_dimension = model.config.dim + try: + embedding_dimension = model.config.dim + except Exception as e: + embedding_dimension = 768 + if model_type is None: model_type = "distilbert" if pooling_mode is None: @@ -400,23 +421,8 @@ def make_model_config_json( # fill the description if description is None: - readme_file_path = os.path.join(self.folder_path, "README.md") - if os.path.exists(readme_file_path): - try: - if verbose: - print("reading README.md file") - description = self._get_model_description_from_readme_file( - readme_file_path - ) - except Exception as e: - print(f"Cannot scrape model description from README.md file: {e}") - description = self._generate_default_model_description( - embedding_dimension - ) - else: - print("Using default model description") - description = "This is a question-answering model: it provides answers to a question and context." - + description = "This is a question-answering model: it provides answers to a question and context." + # dump the config.json file if all_config is None: if not os.path.exists(config_json_file_path): diff --git a/tests/ml_models/test_question_answering_pytest.py b/tests/ml_models/test_question_answering_pytest.py index bdfe81ac..efebf052 100644 --- a/tests/ml_models/test_question_answering_pytest.py +++ b/tests/ml_models/test_question_answering_pytest.py @@ -15,6 +15,11 @@ from opensearch_py_ml.ml_models import QuestionAnsweringModel + +# default parameters +default_model_id = "distilbert-base-cased-distilled-squad" +default_model_description = "This is a question-answering model: it provides answers to a question and context." + TEST_FOLDER = os.path.join( os.path.dirname(os.path.abspath("__file__")), "tests", "test_model_files" ) @@ -25,7 +30,7 @@ os.path.dirname(os.path.abspath("__file__")), "tests", "sample_zip" ) -default_model_id = "distilbert-base-cased-distilled-squad" + def clean_test_folder(TEST_FOLDER): if os.path.exists(TEST_FOLDER): @@ -110,6 +115,9 @@ def compare_model_zip_file(zip_file_path, expected_filenames, model_format): ), f"The content in the {model_format} model zip file does not match the expected content: {filenames} != {expected_filenames}" +# New +clean_test_folder(TEST_FOLDER) +test_model = QuestionAnsweringModel(folder_path=TEST_FOLDER) @@ -257,5 +265,252 @@ def test_onnx_answer(): clean_test_folder(TESTDATA_UNZIP_FOLDER) + +def test_make_model_config_json_for_torch_script(): + model_id = default_model_id + model_format = "TORCH_SCRIPT" + expected_model_description = default_model_description + expected_model_config_data = { + "embedding_dimension": 768, + "pooling_mode": "CLS", + "normalize_result": False, + } + + clean_test_folder(TEST_FOLDER) + test_model5 = QuestionAnsweringModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + + test_model5.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + model_config_path_torch = test_model5.make_model_config_json( + model_format="TORCH_SCRIPT", verbose=True + ) + + compare_model_config( + model_config_path_torch, + model_id, + model_format, + expected_model_description=expected_model_description, + expected_model_config_data=expected_model_config_data, + ) + + clean_test_folder(TEST_FOLDER) + + +def test_make_model_config_json_for_onnx(): + model_id = default_model_id + model_format = "ONNX" + expected_model_description = default_model_description + expected_model_config_data = { + 
"embedding_dimension": 768, + "pooling_mode": "CLS", + "normalize_result": False, + } + + clean_test_folder(TEST_FOLDER) + test_model6 = QuestionAnsweringModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + + test_model6.save_as_onnx(model_id=model_id) + model_config_path_onnx = test_model6.make_model_config_json(model_format="ONNX") + + compare_model_config( + model_config_path_onnx, + model_id, + model_format, + expected_model_description=expected_model_description, + expected_model_config_data=expected_model_config_data, + ) + + clean_test_folder(TEST_FOLDER) + + +def test_overwrite_fields_in_model_config(): + model_id = default_model_id + model_format = "TORCH_SCRIPT" + + overwritten_model_config_data = { + "embedding_dimension": 128, + "pooling_mode": "MAX", + "normalize_result": False, + } + + clean_test_folder(TEST_FOLDER) + test_model8 = QuestionAnsweringModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + + test_model8.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + model_config_path_torch = test_model8.make_model_config_json( + model_format="TORCH_SCRIPT", + embedding_dimension=overwritten_model_config_data["embedding_dimension"], + pooling_mode=overwritten_model_config_data["pooling_mode"], + normalize_result=overwritten_model_config_data["normalize_result"], + ) + + compare_model_config( + model_config_path_torch, + model_id, + model_format, + expected_model_description=None, + expected_model_config_data=overwritten_model_config_data, + ) + + clean_test_folder(TEST_FOLDER) + +def test_missing_expected_description_in_readme_file(): + model_id = default_model_id + model_format = "TORCH_SCRIPT" + expected_model_description = default_model_description + + clean_test_folder(TEST_FOLDER) + test_model10 = QuestionAnsweringModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + + test_model10.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + temp_path = os.path.join( + TEST_FOLDER, + "README.md", + ) + with open(temp_path, "w") as f: + f.write("No model description here") + model_config_path_torch = test_model10.make_model_config_json( + model_format=model_format + ) + try: + with open(model_config_path_torch) as json_file: + model_config_data_torch = json.load(json_file) + except Exception as exec: + assert ( + False + ), f"Creating model config file for tracing in {model_format} raised an exception {exec}" + + assert ( + "description" in model_config_data_torch + and model_config_data_torch["description"] == expected_model_description + ), "Should use default model description when description is missing from README.md" + + clean_test_folder(TEST_FOLDER) + + +def test_overwrite_description(): + model_id = default_model_id + model_format = "TORCH_SCRIPT" + expected_model_description = "Expected Description" + + clean_test_folder(TEST_FOLDER) + test_model11 = QuestionAnsweringModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + + test_model11.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + model_config_path_torch = test_model11.make_model_config_json( + model_format=model_format, description=expected_model_description + ) + try: + with open(model_config_path_torch) as json_file: + model_config_data_torch = json.load(json_file) + except Exception as exec: + assert ( + False + ), f"Creating model config file for tracing in {model_format} raised an exception {exec}" + + assert ( + "description" in model_config_data_torch + and model_config_data_torch["description"] == expected_model_description + ), "Cannot overwrite 
description in model config file" + + clean_test_folder(TEST_FOLDER) + + + +def test_truncation_parameter(): + model_id = default_model_id + MAX_LENGTH_TASB = 512 + + clean_test_folder(TEST_FOLDER) + test_model13 = QuestionAnsweringModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + + test_model13.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + + tokenizer_json_file_path = os.path.join(TEST_FOLDER, "tokenizer.json") + try: + with open(tokenizer_json_file_path, "r") as json_file: + tokenizer_json = json.load(json_file) + except Exception as exec: + assert ( + False + ), f"Creating tokenizer.json file for tracing raised an exception {exec}" + + assert tokenizer_json[ + "truncation" + ], "truncation parameter in tokenizer.json is null" + + assert ( + tokenizer_json["truncation"]["max_length"] == MAX_LENGTH_TASB + ), "max_length is not properly set" + + clean_test_folder(TEST_FOLDER) + + +def test_save_as_pt_with_license(): + model_id = "distilbert-base-cased-distilled-squad" + model_format = "TORCH_SCRIPT" + torch_script_zip_file_path = os.path.join(TEST_FOLDER, "distilbert-base-cased-distilled-squad.zip") + torch_script_expected_filenames = { + "distilbert-base-cased-distilled-squad.pt", + "tokenizer.json", + "LICENSE", + } + + clean_test_folder(TEST_FOLDER) + test_model15 = QuestionAnsweringModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + + test_model15.save_as_pt( + model_id=model_id, + sentences=["today is sunny"], + add_apache_license=True, + ) + + compare_model_zip_file( + torch_script_zip_file_path, torch_script_expected_filenames, model_format + ) + + clean_test_folder(TEST_FOLDER) + + +def test_save_as_onnx_with_license(): + model_id = "distilbert-base-cased-distilled-squad" + model_format = "ONNX" + onnx_zip_file_path = os.path.join(TEST_FOLDER, "distilbert-base-cased-distilled-squad.zip") + onnx_expected_filenames = {"distilbert-base-cased-distilled-squad.onnx", "tokenizer.json", "LICENSE"} + + clean_test_folder(TEST_FOLDER) + test_model16 = QuestionAnsweringModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + + test_model16.save_as_onnx(model_id=model_id, add_apache_license=True) + + compare_model_zip_file(onnx_zip_file_path, onnx_expected_filenames, model_format) + + clean_test_folder(TEST_FOLDER) + + + clean_test_folder(TEST_FOLDER) -clean_test_folder(TESTDATA_UNZIP_FOLDER) \ No newline at end of file +clean_test_folder(TESTDATA_UNZIP_FOLDER) From fc625bc0655ac0bd3e62c4025062d001955be224 Mon Sep 17 00:00:00 2001 From: faradawn Date: Thu, 21 Mar 2024 10:19:36 -0500 Subject: [PATCH 6/7] add to requirements-dev, add to CHANGELOG, fix format with nox Signed-off-by: faradawn --- CHANGELOG.md | 1 + opensearch_py_ml/ml_models/__init__.py | 2 +- .../ml_models/question_answering_model.py | 44 +++++----- requirements-dev.txt | 1 + .../test_question_answering_pytest.py | 84 +++++++++++-------- 5 files changed, 74 insertions(+), 58 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a3689090..3fa33d73 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) - Add support for model profiles by @rawwar in ([#358](https://github.com/opensearch-project/opensearch-py-ml/pull/358)) - Support for security default admin credential changes in 2.12.0 in ([#365](https://github.com/opensearch-project/opensearch-py-ml/pull/365)) - adding cross encoder models in the pre-trained traced list ([#378](https://github.com/opensearch-project/opensearch-py-ml/pull/378)) +- Add scripts 
to trace Question Answering Models (2024-03-21 09:51:56) by @faradawn ([#349](https://github.com/opensearch-project/opensearch-py-ml/pull/349)) ### Changed diff --git a/opensearch_py_ml/ml_models/__init__.py b/opensearch_py_ml/ml_models/__init__.py index 74936be6..3a05b0a7 100644 --- a/opensearch_py_ml/ml_models/__init__.py +++ b/opensearch_py_ml/ml_models/__init__.py @@ -6,7 +6,7 @@ # GitHub history for details. from .metrics_correlation.mcorr import MCorr -from .sentencetransformermodel import SentenceTransformerModel from .question_answering_model import QuestionAnsweringModel +from .sentencetransformermodel import SentenceTransformerModel __all__ = ["SentenceTransformerModel", "QuestionAnsweringModel", "MCorr"] diff --git a/opensearch_py_ml/ml_models/question_answering_model.py b/opensearch_py_ml/ml_models/question_answering_model.py index 2ef2c450..26386350 100644 --- a/opensearch_py_ml/ml_models/question_answering_model.py +++ b/opensearch_py_ml/ml_models/question_answering_model.py @@ -5,17 +5,18 @@ # Any modifications Copyright OpenSearch Contributors. See # GitHub history for details. -# for generating config +# for generating config import json import os from pathlib import Path from zipfile import ZipFile + import requests # for torch import torch -from transformers import AutoTokenizer, AutoModelForQuestionAnswering import transformers +from transformers import AutoModelForQuestionAnswering, AutoTokenizer from opensearch_py_ml.ml_commons.ml_common_utils import ( _generate_model_content_hash_value, @@ -28,6 +29,7 @@ class QuestionAnsweringModel: """ Class for tracing the QuestionAnswering model. """ + # distilbert-base-cased-distilled-squad DEFAULT_MODEL_ID = "distilbert-base-cased-distilled-squad" SYNTHETIC_QUERY_FOLDER = "synthetic_queries" @@ -79,7 +81,7 @@ def __init__( self.model_id = model_id self.torch_script_zip_file_path = None self.onnx_zip_file_path = None - + def _add_apache_license_to_model_zip_file(self, model_zip_file_path: str): """ Add Apache-2.0 license file to the model zip file at model_zip_file_path @@ -158,7 +160,6 @@ def save_as_pt( zip_file_name = str(model_id.split("/")[-1] + ".zip") zip_file_path = os.path.join(model_output_path, zip_file_name) - tokenizer.save_pretrained(save_json_folder_path) tokenizer_file_path = os.path.join(save_json_folder_path, "tokenizer.json") # Open the tokenizer.json and replace the truncation field @@ -177,7 +178,6 @@ def save_as_pt( with open(tokenizer_file_path, "w") as file: json.dump(parsed_json, file, indent=2) - # convert to pt format will need to be in cpu, # set the device to cpu, convert its input_ids and attention_mask in cpu and save as .pt format device = torch.device("cpu") @@ -187,9 +187,7 @@ def save_as_pt( ).to(device) compiled_model = torch.jit.trace( - cpu_model, - (features["input_ids"], features["attention_mask"]), - strict=False + cpu_model, (features["input_ids"], features["attention_mask"]), strict=False ) torch.jit.save(compiled_model, model_path) print("Traced torchscript model is saved to ", model_path) @@ -210,7 +208,7 @@ def save_as_pt( self.torch_script_zip_file_path = zip_file_path print("zip file is saved to ", zip_file_path, "\n") return zip_file_path - + def save_as_onnx( self, model_id="distilbert-base-cased-distilled-squad", @@ -271,7 +269,7 @@ def save_as_onnx( # save tokenizer.json in save_json_folder_name tokenizer.save_pretrained(save_json_folder_path) - + # Find the tokenizer.json file path in cache: /Users/faradawn/.cache/huggingface/hub/models/... 
tokenizer_file_path = os.path.join(save_json_folder_path, "tokenizer.json") @@ -291,9 +289,13 @@ def save_as_onnx( tokenizer_file_path = os.path.join(save_json_folder_path, "tokenizer.json") with open(tokenizer_file_path, "w") as file: json.dump(parsed_json, file, indent=2) - + # load config - model_kind, model_onnx_config = transformers.onnx.FeaturesManager.check_supported_model_or_raise(model, feature="question-answering") + model_kind, model_onnx_config = ( + transformers.onnx.FeaturesManager.check_supported_model_or_raise( + model, feature="question-answering" + ) + ) onnx_config = model_onnx_config(model.config) # export @@ -302,7 +304,7 @@ def save_as_onnx( model=model, config=onnx_config, opset=13, - output=Path(model_path) + output=Path(model_path), ) print("Traced onnx model is saved to ", model_path) @@ -392,7 +394,6 @@ def make_model_config_json( model = AutoModelForQuestionAnswering.from_pretrained(self.model_id) model.save_pretrained(self.folder_path) - # fill the empty fields if ( model_type is None @@ -404,7 +405,7 @@ def make_model_config_json( if embedding_dimension is None: try: embedding_dimension = model.config.dim - except Exception as e: + except Exception: embedding_dimension = 768 if model_type is None: @@ -413,7 +414,7 @@ def make_model_config_json( pooling_mode = "CLS" if normalize_result is None: normalize_result = False - + except Exception as e: raise Exception( f"Raised exception while getting model data from pre-trained hugging-face model object: {e}" @@ -422,7 +423,7 @@ def make_model_config_json( # fill the description if description is None: description = "This is a question-answering model: it provides answers to a question and context." - + # dump the config.json file if all_config is None: if not os.path.exists(config_json_file_path): @@ -463,7 +464,7 @@ def make_model_config_json( "all_config": json.dumps(all_config), }, } - + # get model size and hash value if model_zip_file_path is None: model_zip_file_path = ( @@ -480,9 +481,9 @@ def make_model_config_json( model_config_content["model_content_size_in_bytes"] = os.stat( model_zip_file_path ).st_size - model_config_content[ - "model_content_hash_value" - ] = _generate_model_content_hash_value(model_zip_file_path) + model_config_content["model_content_hash_value"] = ( + _generate_model_content_hash_value(model_zip_file_path) + ) if verbose: print("generating ml-commons_model_config.json file...\n") @@ -499,4 +500,3 @@ def make_model_config_json( ) return model_config_file_path - diff --git a/requirements-dev.txt b/requirements-dev.txt index e7b62bcf..10d6cc96 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -14,6 +14,7 @@ transformers>=4.36.0,<5 deprecated>=1.2.14,<2 mdutils>=1.6.0,<2 pillow>10.0.0,<11 +onnxruntime # # Testing diff --git a/tests/ml_models/test_question_answering_pytest.py b/tests/ml_models/test_question_answering_pytest.py index efebf052..08b8c1d6 100644 --- a/tests/ml_models/test_question_answering_pytest.py +++ b/tests/ml_models/test_question_answering_pytest.py @@ -15,10 +15,11 @@ from opensearch_py_ml.ml_models import QuestionAnsweringModel - # default parameters default_model_id = "distilbert-base-cased-distilled-squad" -default_model_description = "This is a question-answering model: it provides answers to a question and context." +default_model_description = ( + "This is a question-answering model: it provides answers to a question and context." 
+)
 
 TEST_FOLDER = os.path.join(
     os.path.dirname(os.path.abspath("__file__")), "tests", "test_model_files"
 )
@@ -31,7 +32,6 @@
 )
 
 
-
 def clean_test_folder(TEST_FOLDER):
     if os.path.exists(TEST_FOLDER):
         for files in os.listdir(TEST_FOLDER):
@@ -120,7 +120,6 @@ def compare_model_zip_file(zip_file_path, expected_filenames, model_format):
 test_model = QuestionAnsweringModel(folder_path=TEST_FOLDER)
 
 
-
 def test_check_attribute():
     test_model = QuestionAnsweringModel(folder_path=TEST_FOLDER)
     try:
@@ -141,9 +140,7 @@ def test_check_attribute():
 
     clean_test_folder(TEST_FOLDER)
     our_model_id = "distilbert-base-cased-distilled-squad"
-    test_model1 = QuestionAnsweringModel(
-        folder_path=TEST_FOLDER, model_id=our_model_id
-    )
+    test_model1 = QuestionAnsweringModel(folder_path=TEST_FOLDER, model_id=our_model_id)
 
     assert test_model1.model_id == our_model_id
@@ -160,30 +157,31 @@ def test_folder_path():
 
 # New tests for save_as_pt and save_as_onnx
 test_cases = [
-    {
-        "question": "Who was Jim Henson?",
-        "context": "Jim Henson was a nice puppet"
-    },
+    {"question": "Who was Jim Henson?", "context": "Jim Henson was a nice puppet"},
     {
         "question": "Where do I live?",
-        "context": "My name is Sarah and I live in London"
+        "context": "My name is Sarah and I live in London",
     },
     {
         "question": "What's my name?",
-        "context": "My name is Clara and I live in Berkeley."
+        "context": "My name is Clara and I live in Berkeley.",
     },
     {
         "question": "Which name is also used to describe the Amazon rainforest in English?",
-        "context": "The Amazon rainforest (Portuguese: Floresta Amazônica or Amazônia; Spanish: Selva Amazónica, Amazonía or usually Amazonia; French: Forêt amazonienne; Dutch: Amazoneregenwoud), also known in English as Amazonia or the Amazon Jungle, is a moist broadleaf forest that covers most of the Amazon basin of South America. This basin encompasses 7,000,000 square kilometres (2,700,000 sq mi), of which 5,500,000 square kilometres (2,100,000 sq mi) are covered by the rainforest. This region includes territory belonging to nine nations. The majority of the forest is contained within Brazil, with 60% of the rainforest, followed by Peru with 13%, Colombia with 10%, and with minor amounts in Venezuela, Ecuador, Bolivia, Guyana, Suriname and French Guiana. States or departments in four nations contain 'Amazonas' in their names. The Amazon represents over half of the planet's remaining rainforests, and comprises the largest and most biodiverse tract of tropical rainforest in the world, with an estimated 390 billion individual trees divided into 16,000 species."
-    }
+        "context": "The Amazon rainforest (Portuguese: Floresta Amazônica or Amazônia; Spanish: Selva Amazónica, Amazonía or usually Amazonia; French: Forêt amazonienne; Dutch: Amazoneregenwoud), also known in English as Amazonia or the Amazon Jungle, is a moist broadleaf forest that covers most of the Amazon basin of South America. This basin encompasses 7,000,000 square kilometres (2,700,000 sq mi), of which 5,500,000 square kilometres (2,100,000 sq mi) are covered by the rainforest. This region includes territory belonging to nine nations. The majority of the forest is contained within Brazil, with 60% of the rainforest, followed by Peru with 13%, Colombia with 10%, and with minor amounts in Venezuela, Ecuador, Bolivia, Guyana, Suriname and French Guiana. States or departments in four nations contain 'Amazonas' in their names. The Amazon represents over half of the planet's remaining rainforests, and comprises the largest and most biodiverse tract of tropical rainforest in the world, with an estimated 390 billion individual trees divided into 16,000 species.",
+    },
 ]
 
+
 def get_official_answer(test_cases):
     # Obtain pytorch's official model
-    from transformers import AutoTokenizer, AutoModelForQuestionAnswering
     import torch
-    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased-distilled-squad')
-    official_model = AutoModelForQuestionAnswering.from_pretrained('distilbert-base-cased-distilled-squad')
+    from transformers import AutoModelForQuestionAnswering, AutoTokenizer
+
+    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
+    official_model = AutoModelForQuestionAnswering.from_pretrained(
+        "distilbert-base-cased-distilled-squad"
+    )
 
     results = []
@@ -195,12 +193,14 @@ def get_official_answer(test_cases):
         answer_start_index = torch.argmax(outputs.start_logits, dim=-1).item()
         answer_end_index = torch.argmax(outputs.end_logits, dim=-1).item()
         results.append([answer_start_index, answer_end_index])
-
+
     return results
 
+
 def get_pt_answer(test_cases, folder_path, model_id):
-    from transformers import AutoTokenizer, AutoModelForQuestionAnswering
     import torch
+    from transformers import AutoTokenizer
+
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     traced_model = torch.jit.load(f"{folder_path}/{model_id}.pt")
@@ -215,14 +215,15 @@ def get_pt_answer(test_cases, folder_path, model_id):
         answer_start_index = torch.argmax(outputs["start_logits"], dim=-1).item()
         answer_end_index = torch.argmax(outputs["end_logits"], dim=-1).item()
         results.append([answer_start_index, answer_end_index])
-
+
     return results
 
 
 def get_onnx_answer(test_cases, folder_path, model_id):
-    from transformers import AutoTokenizer
-    from onnxruntime import InferenceSession
     import numpy as np
+
+    from onnxruntime import InferenceSession
+    from transformers import AutoTokenizer
+
     session = InferenceSession(f"{folder_path}/{model_id}.onnx")
     tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -233,12 +234,14 @@ def get_onnx_answer(test_cases, folder_path, model_id):
         inputs = tokenizer(question, context, return_tensors="pt")
         inputs = tokenizer(question, context, return_tensors="np")
 
-        outputs = session.run(output_names=["start_logits", "end_logits"], input_feed=dict(inputs))
+        outputs = session.run(
+            output_names=["start_logits", "end_logits"], input_feed=dict(inputs)
+        )
 
         answer_start_index = np.argmax(outputs[0], axis=-1).item()
         answer_end_index = np.argmax(outputs[1], axis=-1).item()
         results.append([answer_start_index, answer_end_index])
-
+
     return results
@@ -248,24 +251,28 @@ def test_pt_answer():
     pt_results = get_pt_answer(test_cases, TEST_FOLDER, default_model_id)
     official_results = get_official_answer(test_cases)
     for i in range(len(pt_results)):
-        assert pt_results[i] == official_results[i], f"Failed at index {i}: pt_results[{i}] ({pt_results[i]}) != official_results[{i}] ({official_results[i]})"
-
+        assert (
+            pt_results[i] == official_results[i]
+        ), f"Failed at index {i}: pt_results[{i}] ({pt_results[i]}) != official_results[{i}] ({official_results[i]})"
+
     clean_test_folder(TEST_FOLDER)
     clean_test_folder(TESTDATA_UNZIP_FOLDER)
 
+
 def test_onnx_answer():
     test_model = QuestionAnsweringModel(folder_path=TEST_FOLDER, overwrite=True)
     test_model.save_as_onnx(default_model_id)
     onnx_results = get_onnx_answer(test_cases, TEST_FOLDER, default_model_id)
     official_results = get_official_answer(test_cases)
     for i in range(len(onnx_results)):
-        assert onnx_results[i] == official_results[i], f"Failed at index {i}: onnx_results[{i}] ({onnx_results[i]}) != official_results[{i}] ({official_results[i]})"
-
+        assert (
+            onnx_results[i] == official_results[i]
+        ), f"Failed at index {i}: onnx_results[{i}] ({onnx_results[i]}) != official_results[{i}] ({official_results[i]})"
+
     clean_test_folder(TEST_FOLDER)
     clean_test_folder(TESTDATA_UNZIP_FOLDER)
 
 
-
 def test_make_model_config_json_for_torch_script():
     model_id = default_model_id
     model_format = "TORCH_SCRIPT"
@@ -362,6 +369,7 @@ def test_overwrite_fields_in_model_config():
     clean_test_folder(TEST_FOLDER)
 
 
+
 def test_missing_expected_description_in_readme_file():
     model_id = default_model_id
     model_format = "TORCH_SCRIPT"
@@ -430,7 +438,6 @@ def test_overwrite_description():
     clean_test_folder(TEST_FOLDER)
 
 
-
 def test_truncation_parameter():
     model_id = default_model_id
     MAX_LENGTH_TASB = 512
@@ -466,7 +473,9 def test_truncation_parameter():
 def test_save_as_pt_with_license():
     model_id = "distilbert-base-cased-distilled-squad"
     model_format = "TORCH_SCRIPT"
-    torch_script_zip_file_path = os.path.join(TEST_FOLDER, "distilbert-base-cased-distilled-squad.zip")
+    torch_script_zip_file_path = os.path.join(
+        TEST_FOLDER, "distilbert-base-cased-distilled-squad.zip"
+    )
     torch_script_expected_filenames = {
         "distilbert-base-cased-distilled-squad.pt",
         "tokenizer.json",
@@ -495,8 +504,14 @@ def test_save_as_pt_with_license():
 def test_save_as_onnx_with_license():
     model_id = "distilbert-base-cased-distilled-squad"
     model_format = "ONNX"
-    onnx_zip_file_path = os.path.join(TEST_FOLDER, "distilbert-base-cased-distilled-squad.zip")
-    onnx_expected_filenames = {"distilbert-base-cased-distilled-squad.onnx", "tokenizer.json", "LICENSE"}
+    onnx_zip_file_path = os.path.join(
+        TEST_FOLDER, "distilbert-base-cased-distilled-squad.zip"
+    )
+    onnx_expected_filenames = {
+        "distilbert-base-cased-distilled-squad.onnx",
+        "tokenizer.json",
+        "LICENSE",
+    }
 
     clean_test_folder(TEST_FOLDER)
     test_model16 = QuestionAnsweringModel(
@@ -511,6 +526,5 @@ def test_save_as_onnx_with_license():
     clean_test_folder(TEST_FOLDER)
 
 
-
 clean_test_folder(TEST_FOLDER)
 clean_test_folder(TESTDATA_UNZIP_FOLDER)

From 2e0b7c5b6755df61a4c769251384f707ae88e3c7 Mon Sep 17 00:00:00 2001
From: faradawn
Date: Thu, 21 Mar 2024 19:34:57 -0500
Subject: [PATCH 7/7] fix sentencetransformer test of long description

Signed-off-by: faradawn
---
 tests/ml_models/test_sentencetransformermodel_pytest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/ml_models/test_sentencetransformermodel_pytest.py b/tests/ml_models/test_sentencetransformermodel_pytest.py
index c9c9046b..17f86c75 100644
--- a/tests/ml_models/test_sentencetransformermodel_pytest.py
+++ b/tests/ml_models/test_sentencetransformermodel_pytest.py
@@ -463,7 +463,7 @@ def test_overwrite_description():
 def test_long_description():
     model_id = "sentence-transformers/gtr-t5-base"
     model_format = "TORCH_SCRIPT"
-    expected_model_description = "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space. The model was specifically trained for the task of sematic search."
+    expected_model_description = "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space. The model was specifically trained for the task of semantic search."
 
     clean_test_folder(TEST_FOLDER)
     test_model12 = SentenceTransformerModel(