From fb2c3321056677b3d4528aed4436ea0fc65d57e4 Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Sat, 8 Feb 2025 00:11:52 +0400 Subject: [PATCH 01/14] Generation config separation --- tests/python_tests/__init__.py | 2 + tests/python_tests/common.py | 121 +---------------- .../python_tests/test_continuous_batching.py | 8 +- .../python_tests/test_llm_pipeline_static.py | 2 +- tests/python_tests/test_sampling.py | 2 +- tests/python_tests/utils/__init__.py | 2 + tests/python_tests/utils/generation_config.py | 125 ++++++++++++++++++ 7 files changed, 138 insertions(+), 124 deletions(-) create mode 100644 tests/python_tests/__init__.py create mode 100644 tests/python_tests/utils/__init__.py create mode 100644 tests/python_tests/utils/generation_config.py diff --git a/tests/python_tests/__init__.py b/tests/python_tests/__init__.py new file mode 100644 index 0000000000..6e922cea12 --- /dev/null +++ b/tests/python_tests/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2018-2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 \ No newline at end of file diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py index 64482e6fc0..ca26923e48 100644 --- a/tests/python_tests/common.py +++ b/tests/python_tests/common.py @@ -13,126 +13,9 @@ from transformers import GenerationConfig as HFGenerationConfig from typing import List, Tuple, Callable -TESTS_ROOT = Path(__file__).parent +from utils.generation_config import get_greedy, get_beam_search -def get_greedy() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.num_return_sequences = 1 - generation_config.max_new_tokens = 30 - return generation_config - -def get_greedy_with_penalties() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.num_return_sequences = 1 - generation_config.presence_penalty = 2.0 - generation_config.frequency_penalty = 0.2 - generation_config.max_new_tokens = 30 - return generation_config - -def get_beam_search() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.num_beam_groups = 3 - generation_config.num_beams = 6 - generation_config.diversity_penalty = 1 - generation_config.max_new_tokens = 30 - generation_config.num_return_sequences = 3 - generation_config.num_return_sequences = generation_config.num_beams - return generation_config - -def get_multinomial_temperature() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.do_sample = True - generation_config.temperature = 0.8 - generation_config.num_return_sequences = 1 - generation_config.max_new_tokens = 30 - return generation_config - -def get_multinomial_temperature_and_num_return_sequence() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.do_sample = True - generation_config.temperature = 0.7 - generation_config.num_return_sequences = 3 - generation_config.max_new_tokens = 30 - return generation_config - -def get_multinomial_temperature_and_top_p() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.num_return_sequences = 1 - generation_config.do_sample = True - generation_config.temperature = 0.8 - generation_config.top_p = 0.9 - generation_config.max_new_tokens = 30 - return generation_config - -def get_multinomial_temperature_and_top_k() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.do_sample = True - generation_config.num_return_sequences = 1 - generation_config.temperature = 0.8 - generation_config.top_k = 2 - 
generation_config.max_new_tokens = 30 - return generation_config - -def get_multinomial_temperature_top_p_and_top_k() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.do_sample = True - generation_config.temperature = 0.8 - generation_config.top_p = 0.9 - generation_config.num_return_sequences = 1 - generation_config.top_k = 2 - generation_config.max_new_tokens = 30 - return generation_config - -def get_multinomial_temperature_and_repetition_penalty() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.do_sample = True - generation_config.num_return_sequences = 1 - generation_config.temperature = 0.8 - generation_config.repetition_penalty = 2.0 - generation_config.max_new_tokens = 30 - return generation_config - -def get_multinomial_all_parameters() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.do_sample = True - generation_config.num_return_sequences = 4 - generation_config.temperature = 0.9 - generation_config.top_p = 0.8 - generation_config.top_k = 20 - generation_config.repetition_penalty = 2.0 - generation_config.max_new_tokens = 30 - return generation_config - -def get_multinomial_temperature_and_frequence_penalty() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.do_sample = True - generation_config.temperature = 0.8 - generation_config.frequency_penalty = 0.5 - generation_config.num_return_sequences = 1 - generation_config.max_new_tokens = 30 - return generation_config - -def get_multinomial_temperature_and_presence_penalty() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.do_sample = True - generation_config.temperature = 0.8 - generation_config.presence_penalty = 0.1 - generation_config.num_return_sequences = 1 - generation_config.max_new_tokens = 30 - return generation_config - -def get_multinomial_max_and_min_token() -> GenerationConfig: - multinomial = GenerationConfig() - multinomial.do_sample = True - multinomial.temperature = 0.9 - multinomial.top_p = 0.9 - multinomial.top_k = 20 - multinomial.num_return_sequences = 3 - multinomial.presence_penalty = 0.01 - multinomial.frequency_penalty = 0.1 - multinomial.min_new_tokens = 15 - multinomial.max_new_tokens = 30 - return multinomial +TESTS_ROOT = Path(__file__).parent def get_test_dataset() -> Tuple[List[str], List[GenerationConfig]]: prompts = [ diff --git a/tests/python_tests/test_continuous_batching.py b/tests/python_tests/test_continuous_batching.py index 8afcc8061c..808b8682ae 100644 --- a/tests/python_tests/test_continuous_batching.py +++ b/tests/python_tests/test_continuous_batching.py @@ -11,9 +11,7 @@ from openvino_genai import ContinuousBatchingPipeline, LLMPipeline, GenerationConfig, SchedulerConfig, Tokenizer, draft_model from common import get_default_properties, get_hugging_face_models, convert_models, generate_and_compare_with_reference_text, \ - get_scheduler_config, get_greedy, run_cb_pipeline_with_ref, get_beam_search, get_greedy, \ - get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \ - get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p + get_scheduler_config, run_cb_pipeline_with_ref from test_sampling import RandomSamplingTestStruct, get_current_platform_ref_texts from ov_genai_test_utils import ( @@ -35,6 +33,10 @@ def read_models_list(file_name: str): from shutil import rmtree +from utils.generation_config import get_greedy, get_beam_search, \ + 
get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \ + get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p + # # e2e tests on random and real models # diff --git a/tests/python_tests/test_llm_pipeline_static.py b/tests/python_tests/test_llm_pipeline_static.py index ae5c475fd9..dd329eb131 100644 --- a/tests/python_tests/test_llm_pipeline_static.py +++ b/tests/python_tests/test_llm_pipeline_static.py @@ -14,7 +14,7 @@ ) from common import get_default_properties -from common import \ +from utils.generation_config import \ get_greedy, \ get_greedy_with_penalties, \ get_multinomial_all_parameters, \ diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py index fa445e96f1..edc5f1a29a 100644 --- a/tests/python_tests/test_sampling.py +++ b/tests/python_tests/test_sampling.py @@ -164,7 +164,7 @@ class RandomSamplingTestStruct: prompts: List[str] ref_texts: List[List[str]] -from common import get_multinomial_temperature, get_greedy_with_penalties, \ +from utils.generation_config import get_multinomial_temperature, get_greedy_with_penalties, \ get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, \ get_multinomial_temperature_top_p_and_top_k, get_multinomial_all_parameters, \ get_multinomial_temperature_and_num_return_sequence, get_multinomial_max_and_min_token, \ diff --git a/tests/python_tests/utils/__init__.py b/tests/python_tests/utils/__init__.py new file mode 100644 index 0000000000..6e922cea12 --- /dev/null +++ b/tests/python_tests/utils/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2018-2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 \ No newline at end of file diff --git a/tests/python_tests/utils/generation_config.py b/tests/python_tests/utils/generation_config.py new file mode 100644 index 0000000000..1a78eeaedd --- /dev/null +++ b/tests/python_tests/utils/generation_config.py @@ -0,0 +1,125 @@ +# Copyright (C) 2018-2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# todo: CVS-162108: remove this file to handle generation config directly in tests + +from openvino_genai import GenerationConfig + +def get_greedy() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.num_return_sequences = 1 + generation_config.max_new_tokens = 30 + return generation_config + +def get_greedy_with_penalties() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.num_return_sequences = 1 + generation_config.presence_penalty = 2.0 + generation_config.frequency_penalty = 0.2 + generation_config.max_new_tokens = 30 + return generation_config + +def get_beam_search() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.num_beam_groups = 3 + generation_config.num_beams = 6 + generation_config.diversity_penalty = 1 + generation_config.max_new_tokens = 30 + generation_config.num_return_sequences = 3 + generation_config.num_return_sequences = generation_config.num_beams + return generation_config + +def get_multinomial_temperature() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.do_sample = True + generation_config.temperature = 0.8 + generation_config.num_return_sequences = 1 + generation_config.max_new_tokens = 30 + return generation_config + +def get_multinomial_temperature_and_num_return_sequence() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.do_sample = True + generation_config.temperature = 
0.7 + generation_config.num_return_sequences = 3 + generation_config.max_new_tokens = 30 + return generation_config + +def get_multinomial_temperature_and_top_p() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.num_return_sequences = 1 + generation_config.do_sample = True + generation_config.temperature = 0.8 + generation_config.top_p = 0.9 + generation_config.max_new_tokens = 30 + return generation_config + +def get_multinomial_temperature_and_top_k() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.do_sample = True + generation_config.num_return_sequences = 1 + generation_config.temperature = 0.8 + generation_config.top_k = 2 + generation_config.max_new_tokens = 30 + return generation_config + +def get_multinomial_temperature_top_p_and_top_k() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.do_sample = True + generation_config.temperature = 0.8 + generation_config.top_p = 0.9 + generation_config.num_return_sequences = 1 + generation_config.top_k = 2 + generation_config.max_new_tokens = 30 + return generation_config + +def get_multinomial_temperature_and_repetition_penalty() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.do_sample = True + generation_config.num_return_sequences = 1 + generation_config.temperature = 0.8 + generation_config.repetition_penalty = 2.0 + generation_config.max_new_tokens = 30 + return generation_config + +def get_multinomial_all_parameters() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.do_sample = True + generation_config.num_return_sequences = 4 + generation_config.temperature = 0.9 + generation_config.top_p = 0.8 + generation_config.top_k = 20 + generation_config.repetition_penalty = 2.0 + generation_config.max_new_tokens = 30 + return generation_config + +def get_multinomial_temperature_and_frequence_penalty() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.do_sample = True + generation_config.temperature = 0.8 + generation_config.frequency_penalty = 0.5 + generation_config.num_return_sequences = 1 + generation_config.max_new_tokens = 30 + return generation_config + +def get_multinomial_temperature_and_presence_penalty() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.do_sample = True + generation_config.temperature = 0.8 + generation_config.presence_penalty = 0.1 + generation_config.num_return_sequences = 1 + generation_config.max_new_tokens = 30 + return generation_config + +def get_multinomial_max_and_min_token() -> GenerationConfig: + multinomial = GenerationConfig() + multinomial.do_sample = True + multinomial.temperature = 0.9 + multinomial.top_p = 0.9 + multinomial.top_k = 20 + multinomial.num_return_sequences = 3 + multinomial.presence_penalty = 0.01 + multinomial.frequency_penalty = 0.1 + multinomial.min_new_tokens = 15 + multinomial.max_new_tokens = 30 + return multinomial From f1b0237a1716cf1ef2d4c551abf16e268c169090 Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Sat, 8 Feb 2025 00:49:08 +0400 Subject: [PATCH 02/14] Hugging face --- tests/python_tests/common.py | 182 +----------------- tests/python_tests/ov_genai_test_utils.py | 14 +- .../python_tests/test_continuous_batching.py | 5 +- tests/python_tests/test_kv_cache_eviction.py | 14 +- tests/python_tests/test_llm_pipeline.py | 7 +- .../python_tests/test_llm_pipeline_static.py | 9 +- tests/python_tests/test_vlm_pipeline.py | 7 +- 
tests/python_tests/utils/constants.py | 10 + tests/python_tests/utils/hugging_face.py | 180 +++++++++++++++++ 9 files changed, 226 insertions(+), 202 deletions(-) create mode 100644 tests/python_tests/utils/constants.py create mode 100644 tests/python_tests/utils/hugging_face.py diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py index ca26923e48..3e79ee15e2 100644 --- a/tests/python_tests/common.py +++ b/tests/python_tests/common.py @@ -14,6 +14,8 @@ from typing import List, Tuple, Callable from utils.generation_config import get_greedy, get_beam_search +from utils.constants import default_ov_config +from utils.hugging_face import convert_models, get_hugging_face_models, run_hugging_face TESTS_ROOT = Path(__file__).parent @@ -55,144 +57,6 @@ def get_scheduler_config(scheduler_params: dict = None) -> SchedulerConfig: return scheduler_config -def convert_to_hf( - default_generation_config : HFGenerationConfig, - generation_config : GenerationConfig -) -> HFGenerationConfig: - if generation_config is None: - return - - kwargs = {} - kwargs['return_dict_in_generate'] = True - - # generic parameters - kwargs['max_length'] = generation_config.max_length - # has higher priority than 'max_length' - kwargs['max_new_tokens'] = generation_config.max_new_tokens - kwargs['min_new_tokens'] = generation_config.min_new_tokens - if generation_config.stop_strings: - kwargs['stop_strings'] = generation_config.stop_strings - - # copy default parameters - kwargs['bos_token_id'] = default_generation_config.bos_token_id - kwargs['pad_token_id'] = default_generation_config.pad_token_id - - if len(generation_config.stop_token_ids) > 0: - kwargs['eos_token_id'] = list(generation_config.stop_token_ids) - elif generation_config.eos_token_id != -1: - kwargs['eos_token_id'] = generation_config.eos_token_id - else: - kwargs['eos_token_id'] = default_generation_config.eos_token_id - - # copy penalties - kwargs['repetition_penalty'] = generation_config.repetition_penalty - - if generation_config.is_beam_search(): - # beam search case - kwargs['num_beam_groups'] = generation_config.num_beam_groups - kwargs['num_beams'] = generation_config.num_beams - kwargs['length_penalty'] = generation_config.length_penalty - kwargs['no_repeat_ngram_size'] = generation_config.no_repeat_ngram_size - kwargs['num_return_sequences'] = generation_config.num_return_sequences - kwargs['output_scores'] = True - - if generation_config.num_beam_groups > 1: - kwargs['diversity_penalty'] = generation_config.diversity_penalty - - # in OpenVINO GenAI this parameter is called stop_criteria, - # while in HF it's called early_stopping. 
- # HF values True, False and "never" correspond to OV GenAI values "EARLY", "HEURISTIC" and "NEVER" - STOP_CRITERIA_MAP = { - StopCriteria.NEVER: "never", - StopCriteria.EARLY: True, - StopCriteria.HEURISTIC: False - } - - kwargs['early_stopping'] = STOP_CRITERIA_MAP[generation_config.stop_criteria] - elif generation_config.is_multinomial(): - # mulitinomial - kwargs['temperature'] = generation_config.temperature - kwargs['top_k'] = generation_config.top_k - kwargs['top_p'] = generation_config.top_p - kwargs['do_sample'] = generation_config.do_sample - else: - # greedy - pass - - hf_generation_config = HFGenerationConfig(**kwargs) - return hf_generation_config - - -def run_hugging_face( - opt_model, - hf_tokenizer, - prompts: List[str], - generation_configs: List[GenerationConfig] | GenerationConfig, -) -> List[GenerationResult]: - generation_results = [] - - if type(generation_configs) is list: - # process prompt by promp as we have multiple generation configs - for prompt, generation_config in zip(prompts, generation_configs): - hf_generation_config = convert_to_hf(opt_model.generation_config, generation_config) - inputs = {} - if hf_tokenizer.chat_template and generation_config.apply_chat_template: - prompt = hf_tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) - inputs = hf_tokenizer(prompt, return_tensors="pt", add_special_tokens=False) - else: - inputs = hf_tokenizer(prompt, return_tensors="pt") - input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask'] - prompt_len = 0 if generation_config.echo else input_ids.numel() - - generate_outputs = opt_model.generate(input_ids=input_ids, attention_mask=attention_mask, generation_config=hf_generation_config, tokenizer=hf_tokenizer) - all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences], skip_special_tokens=True) - - generation_result = GenerationResult() - generation_result.m_generation_ids = all_text_batch - # sequences_scores are available only for beam search case - if generation_config.is_beam_search(): - generation_result.m_scores = [score for score in generate_outputs.sequences_scores] - generation_results.append(generation_result) - else: - inputs = {} - if hf_tokenizer.chat_template and generation_configs.apply_chat_template: - processed_prompts = [] - for prompt in prompts: - processed_prompts.append(hf_tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)) - # process all prompts as a single batch as we have a single generation config for all prompts - inputs = hf_tokenizer(processed_prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=False, padding_side='left') - else: - inputs = hf_tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, padding_side='left') - input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask'] - hf_generation_config = convert_to_hf(opt_model.generation_config, generation_configs) - hf_encoded_outputs = opt_model.generate(input_ids, attention_mask=attention_mask, generation_config=hf_generation_config, tokenizer=hf_tokenizer) - - generation_ids = [] - scores = [] - - for idx, hf_encoded_out in enumerate(hf_encoded_outputs.sequences): - prompt_idx = idx // hf_generation_config.num_return_sequences - prompt_len = 0 if generation_configs.echo else input_ids[prompt_idx].numel() - decoded_text = hf_tokenizer.decode(hf_encoded_out[prompt_len:], 
skip_special_tokens=True) - generation_ids.append(decoded_text) - if generation_configs.is_beam_search(): - scores.append(hf_encoded_outputs.sequences_scores[idx]) - - # if we need to move to next generation result - if (idx + 1) // hf_generation_config.num_return_sequences != prompt_idx: - generation_result = GenerationResult() - generation_result.m_generation_ids = generation_ids - generation_result.m_scores = scores - generation_results.append(generation_result) - generation_ids = [] - scores = [] - - del hf_tokenizer - del opt_model - - return generation_results - - def run_continuous_batching( models_path : Path, scheduler_config : SchedulerConfig, @@ -202,7 +66,7 @@ def run_continuous_batching( if type(generation_configs) is not list: generation_configs = [generation_configs] * len(prompts) - cb_pipe = ContinuousBatchingPipeline(models_path, scheduler_config=scheduler_config, device='CPU', tokenizer_properties={}, properties=get_default_properties()) + cb_pipe = ContinuousBatchingPipeline(models_path, scheduler_config=scheduler_config, device='CPU', tokenizer_properties={}, properties=default_ov_config) output = cb_pipe.generate(prompts, generation_configs) del cb_pipe @@ -211,15 +75,6 @@ def run_continuous_batching( return output -def get_default_properties(): - import openvino.properties.hint as hints - import openvino as ov - - return { - hints.inference_precision : ov.Type.f32, - hints.kv_cache_precision : ov.Type.f16, - } - def get_models_list_from_path(file_name: str): models = [] with open(file_name) as f: @@ -259,7 +114,7 @@ def run_llm_pipeline( use_cb : bool = False, streamer: StreamerWithResults | Callable | StreamerBase = None ) -> List[GenerationResult]: - properties = get_default_properties() + properties = default_ov_config if use_cb: properties['scheduler_config'] = SchedulerConfig() ov_pipe = LLMPipeline(models_path, device='CPU', **properties) @@ -328,35 +183,6 @@ def compare_generation_results(prompts: List[str], hf_results: List[GenerationRe print(f"Prompt = {prompt}\nReference result = {ref_result}\nOpenVINO result = {ov_result.m_generation_ids}") compare_generation_result(ref_result, ov_result, generation_config) - -def get_hugging_face_models(model_id: str): - hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, compile=False, load_in_8bit=False, trust_remote_code=True, ov_config=get_default_properties()) - return opt_model, hf_tokenizer - - -def convert_models(opt_model : OVModelForCausalLM, hf_tokenizer : AutoTokenizer, models_path: Path): - opt_model.save_pretrained(models_path) - - # to store tokenizer config jsons with special tokens - hf_tokenizer.save_pretrained(models_path) - - # save generation config - opt_model.generation_config.save_pretrained(models_path) - - # convert tokenizers as well - convert_and_save_tokenizer(hf_tokenizer, models_path) - - -def convert_and_save_tokenizer(hf_tokenizer : AutoTokenizer, models_path: Path): - from openvino_tokenizers import convert_tokenizer - from openvino import save_model - - tokenizer, detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True) - save_model(tokenizer, models_path / "openvino_tokenizer.xml") - save_model(detokenizer, models_path / "openvino_detokenizer.xml") - - def run_llm_pipeline_with_ref(model_id: str, prompts: List[str], generation_config: GenerationConfig | dict, diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py index 
19628b2f70..5c9be11942 100644 --- a/tests/python_tests/ov_genai_test_utils.py +++ b/tests/python_tests/ov_genai_test_utils.py @@ -14,7 +14,9 @@ import json import openvino_genai as ov_genai -from common import get_default_properties, delete_rt_info +from common import delete_rt_info + +from utils.constants import default_ov_config def get_models_list(): precommit_models = [ @@ -92,7 +94,7 @@ def read_model(params, **tokenizer_kwargs): if (models_path / "openvino_model.xml").exists(): opt_model = OVModelForCausalLM.from_pretrained(models_path, trust_remote_code=True, - compile=False, device='CPU', ov_config=get_default_properties()) + compile=False, device='CPU', ov_config=default_ov_config) else: ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(hf_tokenizer, with_detokenizer=True, @@ -104,7 +106,7 @@ def read_model(params, **tokenizer_kwargs): hf_tokenizer.save_pretrained(models_path) opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, - compile=False, device='CPU', load_in_8bit=False, ov_config=get_default_properties()) + compile=False, device='CPU', load_in_8bit=False, ov_config=default_ov_config) opt_model.generation_config.save_pretrained(models_path) opt_model.config.save_pretrained(models_path) opt_model.save_pretrained(models_path) @@ -114,7 +116,7 @@ def read_model(params, **tokenizer_kwargs): models_path, hf_tokenizer, opt_model, - ov_genai.LLMPipeline(models_path, 'CPU', ENABLE_MMAP=False, **get_default_properties()), + ov_genai.LLMPipeline(models_path, 'CPU', ENABLE_MMAP=False, **default_ov_config), ) @@ -179,7 +181,7 @@ def load_genai_pipe_with_configs(configs: List[Tuple], temp_path): with (temp_path / config_name).open('w') as f: json.dump(config_json, f) - ov_pipe = ov_genai.LLMPipeline(temp_path, 'CPU', **get_default_properties()) + ov_pipe = ov_genai.LLMPipeline(temp_path, 'CPU', **default_ov_config) for _, config_name in configs: os.remove(temp_path / config_name) @@ -189,4 +191,4 @@ def load_genai_pipe_with_configs(configs: List[Tuple], temp_path): @functools.lru_cache(1) def get_continuous_batching(path): - return ov_genai.LLMPipeline(path, 'CPU', scheduler_config=ov_genai.SchedulerConfig(), **get_default_properties()) + return ov_genai.LLMPipeline(path, 'CPU', scheduler_config=ov_genai.SchedulerConfig(), **default_ov_config) diff --git a/tests/python_tests/test_continuous_batching.py b/tests/python_tests/test_continuous_batching.py index 808b8682ae..187628b191 100644 --- a/tests/python_tests/test_continuous_batching.py +++ b/tests/python_tests/test_continuous_batching.py @@ -10,7 +10,7 @@ from pathlib import Path from openvino_genai import ContinuousBatchingPipeline, LLMPipeline, GenerationConfig, SchedulerConfig, Tokenizer, draft_model -from common import get_default_properties, get_hugging_face_models, convert_models, generate_and_compare_with_reference_text, \ +from common import get_hugging_face_models, convert_models, generate_and_compare_with_reference_text, \ get_scheduler_config, run_cb_pipeline_with_ref from test_sampling import RandomSamplingTestStruct, get_current_platform_ref_texts @@ -36,6 +36,7 @@ def read_models_list(file_name: str): from utils.generation_config import get_greedy, get_beam_search, \ get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \ get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p +from utils.constants import default_ov_config # # e2e tests on random and real models @@ -160,7 +161,7 @@ 
def test_post_oom_health(tmp_path, sampling_config): models_path : Path = tmp_path / model_id convert_models(opt_model, hf_tokenizer, models_path) - cb_pipe = ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU", **get_default_properties()) + cb_pipe = ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU", **default_ov_config) # First run should return incomplete response output = cb_pipe.generate(["What is OpenVINO?"], [generation_config]) diff --git a/tests/python_tests/test_kv_cache_eviction.py b/tests/python_tests/test_kv_cache_eviction.py index 6dd6c57511..ae01e94a75 100644 --- a/tests/python_tests/test_kv_cache_eviction.py +++ b/tests/python_tests/test_kv_cache_eviction.py @@ -17,9 +17,11 @@ from openvino import serialize from transformers import AutoTokenizer -from common import TESTS_ROOT, run_cb_pipeline_with_ref, get_default_properties +from common import TESTS_ROOT, run_cb_pipeline_with_ref from utils_longbench import dataset2maxlen, evaluate, preprocess_prompt, post_process_pred +from utils.constants import default_ov_config + def load_prompts_dataset(file_name : str) -> Dict[str, List[str]]: file_path = TESTS_ROOT / 'data' / file_name @@ -45,7 +47,7 @@ class ConvertedModel: @pytest.fixture(scope='module') def converted_model(tmp_path_factory): model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" - model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, load_in_8bit=False, compile=False, ov_config=get_default_properties()) + model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, load_in_8bit=False, compile=False, ov_config=default_ov_config) tokenizer = AutoTokenizer.from_pretrained(model_id) models_path = tmp_path_factory.mktemp("cacheopt_test_models") / model_id model.save_pretrained(models_path) @@ -124,8 +126,8 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t scheduler_config_opt.enable_prefix_caching = enable_prefix_caching models_path = converted_model.models_path - model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU", {}, get_default_properties()) - model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, "CPU", {}, get_default_properties()) + model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU", {}, default_ov_config) + model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, "CPU", {}, default_ov_config) tokenizer = converted_model.tokenizer @@ -237,8 +239,8 @@ def test_optimized_generation_longbench(qwen2_converted_model, device, test_stru if scheduler_config_opt.use_cache_eviction: scheduler_config_opt.cache_eviction_config = LONGBENCH_CACHE_EVICTION_CONFIG - model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, device, {}, get_default_properties()) - model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, device, {}, get_default_properties()) + model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, device, {}, default_ov_config) + model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, device, {}, default_ov_config) model_name = "/".join(models_path.parts[-2:]) subset = test_struct.subset diff --git a/tests/python_tests/test_llm_pipeline.py b/tests/python_tests/test_llm_pipeline.py index 276aff7251..5fc5453200 100644 --- a/tests/python_tests/test_llm_pipeline.py +++ b/tests/python_tests/test_llm_pipeline.py @@ -11,7 +11,7 @@ 
from pathlib import Path import torch -from common import run_llm_pipeline_with_ref, convert_to_hf +from common import run_llm_pipeline_with_ref from ov_genai_test_utils import ( get_models_list, read_model, @@ -19,6 +19,7 @@ get_chat_models_list, model_tmp_path, ) +from utils.hugging_face import generation_config_to_hf # # e2e work @@ -50,7 +51,7 @@ def test_encoded_inputs(model_descr, inputs): model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model(model_descr) ov_generation_config = GenerationConfig(max_new_tokens=20) - hf_generation_config = convert_to_hf(opt_model.generation_config, ov_generation_config) + hf_generation_config = generation_config_to_hf(opt_model.generation_config, ov_generation_config) input_ids, attention_mask = inputs prompt_len = input_ids.shape[1] @@ -132,7 +133,7 @@ def test_chat_scenario(model_descr, generation_config_kwargs: Dict): model_id, path, tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1])) ov_generation_config = GenerationConfig(**generation_config_kwargs) - hf_generation_config = convert_to_hf(opt_model.generation_config, ov_generation_config) + hf_generation_config = generation_config_to_hf(opt_model.generation_config, ov_generation_config) ov_pipe.start_chat() for prompt in questions: diff --git a/tests/python_tests/test_llm_pipeline_static.py b/tests/python_tests/test_llm_pipeline_static.py index dd329eb131..d8b24d825e 100644 --- a/tests/python_tests/test_llm_pipeline_static.py +++ b/tests/python_tests/test_llm_pipeline_static.py @@ -12,8 +12,7 @@ get_chat_models_list, read_model ) -from common import get_default_properties - +from utils.constants import default_ov_config from utils.generation_config import \ get_greedy, \ get_greedy_with_penalties, \ @@ -33,7 +32,7 @@ 'NPUW_ONLINE_PIPELINE': 'NONE', 'PREFILL_CONFIG': { }, 'GENERATE_CONFIG': { } - } | get_default_properties() + } | default_ov_config def generate_chat_history(model_path, device, pipeline_config, questions): @@ -55,7 +54,7 @@ def test_generation_compare_with_stateful(generation_config): prompt = 'What is OpenVINO?' 
model_path = read_model(get_models_list()[0])[1] - stateful_pipe = ov_genai.LLMPipeline(model_path, "CPU", **get_default_properties()) + stateful_pipe = ov_genai.LLMPipeline(model_path, "CPU", **default_ov_config) ref_out = stateful_pipe.generate(prompt, generation_config) static_pipe = ov_genai.LLMPipeline(model_path, "NPU", **common_config) @@ -221,7 +220,7 @@ def test_chat_generation(): model_path = read_model(get_chat_models_list()[0])[1] - chat_history_stateful = generate_chat_history(model_path, "CPU", get_default_properties(), questions) + chat_history_stateful = generate_chat_history(model_path, "CPU", default_ov_config, questions) chat_history_static = generate_chat_history(model_path, "NPU", common_config, questions) print('npu chat: \n{chat_history_static}\n') diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index 3d1b0dccdc..3fe4272fcf 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -7,7 +7,10 @@ import transformers from optimum.intel.openvino import OVModelForVisualCausalLM from openvino_genai import VLMPipeline, GenerationConfig -from common import get_image_by_link, get_beam_search, get_multinomial_all_parameters, get_default_properties +from common import get_image_by_link + +from utils.generation_config import get_beam_search, get_multinomial_all_parameters +from utils.constants import default_ov_config def get_ov_model(model_id, cache): model_dir = cache.mkdir(model_id.split('/')[-1]) @@ -18,7 +21,7 @@ def get_ov_model(model_id, cache): ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(processor.tokenizer, with_detokenizer=True) openvino.save_model(ov_tokenizer, model_dir / "openvino_tokenizer.xml") openvino.save_model(ov_detokenizer, model_dir / "openvino_detokenizer.xml") - model = OVModelForVisualCausalLM.from_pretrained(model_id, compile=False, device="CPU", export=True, load_in_8bit=False, trust_remote_code=True, ov_config=get_default_properties()) + model = OVModelForVisualCausalLM.from_pretrained(model_id, compile=False, device="CPU", export=True, load_in_8bit=False, trust_remote_code=True, ov_config=default_ov_config) processor.chat_template = processor.tokenizer.chat_template # It seems that tiny-random-phi3-vision is saved incorrectly. That line works this around. 
processor.save_pretrained(model_dir) model.save_pretrained(model_dir) diff --git a/tests/python_tests/utils/constants.py b/tests/python_tests/utils/constants.py new file mode 100644 index 0000000000..b67ccca20f --- /dev/null +++ b/tests/python_tests/utils/constants.py @@ -0,0 +1,10 @@ +# Copyright (C) 2018-2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import openvino.properties.hint as hints +import openvino as ov + +default_ov_config = { + hints.inference_precision : ov.Type.f32, + hints.kv_cache_precision : ov.Type.f16, +} \ No newline at end of file diff --git a/tests/python_tests/utils/hugging_face.py b/tests/python_tests/utils/hugging_face.py new file mode 100644 index 0000000000..8f8987d647 --- /dev/null +++ b/tests/python_tests/utils/hugging_face.py @@ -0,0 +1,180 @@ +# Copyright (C) 2018-2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from pathlib import Path +from typing import List + +from transformers import AutoTokenizer +from transformers import GenerationConfig as HFGenerationConfig + +from optimum.intel import OVModelForCausalLM +from openvino import save_model +from openvino_genai import GenerationResult, GenerationConfig, StopCriteria +from openvino_tokenizers import convert_tokenizer + +from utils.constants import default_ov_config + +def generation_config_to_hf( + default_generation_config : HFGenerationConfig, + generation_config : GenerationConfig +) -> HFGenerationConfig: + if generation_config is None: + return + + kwargs = {} + kwargs['return_dict_in_generate'] = True + + # generic parameters + kwargs['max_length'] = generation_config.max_length + # has higher priority than 'max_length' + kwargs['max_new_tokens'] = generation_config.max_new_tokens + kwargs['min_new_tokens'] = generation_config.min_new_tokens + if generation_config.stop_strings: + kwargs['stop_strings'] = generation_config.stop_strings + + # copy default parameters + kwargs['bos_token_id'] = default_generation_config.bos_token_id + kwargs['pad_token_id'] = default_generation_config.pad_token_id + + if len(generation_config.stop_token_ids) > 0: + kwargs['eos_token_id'] = list(generation_config.stop_token_ids) + elif generation_config.eos_token_id != -1: + kwargs['eos_token_id'] = generation_config.eos_token_id + else: + kwargs['eos_token_id'] = default_generation_config.eos_token_id + + # copy penalties + kwargs['repetition_penalty'] = generation_config.repetition_penalty + + if generation_config.is_beam_search(): + # beam search case + kwargs['num_beam_groups'] = generation_config.num_beam_groups + kwargs['num_beams'] = generation_config.num_beams + kwargs['length_penalty'] = generation_config.length_penalty + kwargs['no_repeat_ngram_size'] = generation_config.no_repeat_ngram_size + kwargs['num_return_sequences'] = generation_config.num_return_sequences + kwargs['output_scores'] = True + + if generation_config.num_beam_groups > 1: + kwargs['diversity_penalty'] = generation_config.diversity_penalty + + # in OpenVINO GenAI this parameter is called stop_criteria, + # while in HF it's called early_stopping. 
+ # HF values True, False and "never" correspond to OV GenAI values "EARLY", "HEURISTIC" and "NEVER" + STOP_CRITERIA_MAP = { + StopCriteria.NEVER: "never", + StopCriteria.EARLY: True, + StopCriteria.HEURISTIC: False + } + + kwargs['early_stopping'] = STOP_CRITERIA_MAP[generation_config.stop_criteria] + elif generation_config.is_multinomial(): + # multinomial + kwargs['temperature'] = generation_config.temperature + kwargs['top_k'] = generation_config.top_k + kwargs['top_p'] = generation_config.top_p + kwargs['do_sample'] = generation_config.do_sample + else: + # greedy + pass + + hf_generation_config = HFGenerationConfig(**kwargs) + return hf_generation_config + +def run_hugging_face( + opt_model, + hf_tokenizer, + prompts: List[str], + generation_configs: List[GenerationConfig] | GenerationConfig, +) -> List[GenerationResult]: + generation_results = [] + + if type(generation_configs) is list: + # process prompt by prompt as we have multiple generation configs + for prompt, generation_config in zip(prompts, generation_configs): + hf_generation_config = generation_config_to_hf(opt_model.generation_config, generation_config) + inputs = {} + if hf_tokenizer.chat_template and generation_config.apply_chat_template: + prompt = hf_tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) + inputs = hf_tokenizer(prompt, return_tensors="pt", add_special_tokens=False) + else: + inputs = hf_tokenizer(prompt, return_tensors="pt") + input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask'] + prompt_len = 0 if generation_config.echo else input_ids.numel() + + generate_outputs = opt_model.generate(input_ids=input_ids, attention_mask=attention_mask, generation_config=hf_generation_config, tokenizer=hf_tokenizer) + all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences], skip_special_tokens=True) + + generation_result = GenerationResult() + generation_result.m_generation_ids = all_text_batch + # sequences_scores are available only for beam search case + if generation_config.is_beam_search(): + generation_result.m_scores = [score for score in generate_outputs.sequences_scores] + generation_results.append(generation_result) + else: + inputs = {} + if hf_tokenizer.chat_template and generation_configs.apply_chat_template: + processed_prompts = [] + for prompt in prompts: + processed_prompts.append(hf_tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)) + # process all prompts as a single batch as we have a single generation config for all prompts + inputs = hf_tokenizer(processed_prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=False, padding_side='left') + else: + inputs = hf_tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, padding_side='left') + input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask'] + hf_generation_config = generation_config_to_hf(opt_model.generation_config, generation_configs) + hf_encoded_outputs = opt_model.generate(input_ids, attention_mask=attention_mask, generation_config=hf_generation_config, tokenizer=hf_tokenizer) + + generation_ids = [] + scores = [] + + for idx, hf_encoded_out in enumerate(hf_encoded_outputs.sequences): + prompt_idx = idx // hf_generation_config.num_return_sequences + prompt_len = 0 if generation_configs.echo else input_ids[prompt_idx].numel() + decoded_text = 
hf_tokenizer.decode(hf_encoded_out[prompt_len:], skip_special_tokens=True) + generation_ids.append(decoded_text) + if generation_configs.is_beam_search(): + scores.append(hf_encoded_outputs.sequences_scores[idx]) + + # if we need to move to next generation result + if (idx + 1) // hf_generation_config.num_return_sequences != prompt_idx: + generation_result = GenerationResult() + generation_result.m_generation_ids = generation_ids + generation_result.m_scores = scores + generation_results.append(generation_result) + generation_ids = [] + scores = [] + + del hf_tokenizer + del opt_model + + return generation_results + + +def get_hugging_face_models(model_id: str): + hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) + opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, compile=False, load_in_8bit=False, trust_remote_code=True, ov_config=default_ov_config) + return opt_model, hf_tokenizer + + +def convert_and_save_tokenizer(hf_tokenizer : AutoTokenizer, + models_path: Path): + + tokenizer, detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True) + save_model(tokenizer, models_path / "openvino_tokenizer.xml") + save_model(detokenizer, models_path / "openvino_detokenizer.xml") + + +def convert_models(opt_model : OVModelForCausalLM, + hf_tokenizer : AutoTokenizer, + models_path: Path): + opt_model.save_pretrained(models_path) + + # to store tokenizer config jsons with special tokens + hf_tokenizer.save_pretrained(models_path) + + # save generation config + opt_model.generation_config.save_pretrained(models_path) + + # convert tokenizers as well + convert_and_save_tokenizer(hf_tokenizer, models_path) From 3a42c1cec7307da1a4450629d6ec8d2888bf9e6e Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Mon, 10 Feb 2025 15:53:20 +0400 Subject: [PATCH 03/14] Test --- tests/python_tests/test_continuous_batching.py | 3 ++- tests/python_tests/test_sampling.py | 3 ++- tests/python_tests/test_tokenizer.py | 4 +++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/python_tests/test_continuous_batching.py b/tests/python_tests/test_continuous_batching.py index 187628b191..68660ef993 100644 --- a/tests/python_tests/test_continuous_batching.py +++ b/tests/python_tests/test_continuous_batching.py @@ -10,7 +10,7 @@ from pathlib import Path from openvino_genai import ContinuousBatchingPipeline, LLMPipeline, GenerationConfig, SchedulerConfig, Tokenizer, draft_model -from common import get_hugging_face_models, convert_models, generate_and_compare_with_reference_text, \ +from common import generate_and_compare_with_reference_text, \ get_scheduler_config, run_cb_pipeline_with_ref from test_sampling import RandomSamplingTestStruct, get_current_platform_ref_texts @@ -37,6 +37,7 @@ def read_models_list(file_name: str): get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \ get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p from utils.constants import default_ov_config +from utils.hugging_face import get_hugging_face_models, convert_models # # e2e tests on random and real models diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py index edc5f1a29a..86a7635f4b 100644 --- a/tests/python_tests/test_sampling.py +++ b/tests/python_tests/test_sampling.py @@ -9,8 +9,9 @@ from openvino_genai import GenerationConfig, StopCriteria from typing import List, TypedDict -from common import get_hugging_face_models, convert_models, 
run_llm_pipeline_with_ref, run_llm_pipeline, compare_generation_results, StreamerWithResults +from common import run_llm_pipeline_with_ref, run_llm_pipeline +from utils.hugging_face import get_hugging_face_models, convert_models @pytest.mark.precommit @pytest.mark.parametrize("generation_config,prompt", diff --git a/tests/python_tests/test_tokenizer.py b/tests/python_tests/test_tokenizer.py index c1122fab7f..e866a8c9c1 100644 --- a/tests/python_tests/test_tokenizer.py +++ b/tests/python_tests/test_tokenizer.py @@ -9,7 +9,7 @@ from typing import Dict, Tuple, List import openvino_genai import json -from common import delete_rt_info, convert_and_save_tokenizer +from common import delete_rt_info from ov_genai_test_utils import ( get_models_list, get_chat_models_list, @@ -17,6 +17,8 @@ model_tmp_path, ) +from utils.hugging_face import convert_and_save_tokenizer + def load_genai_tokenizer_with_configs(configs: List[Tuple], temp_path): delete_rt_info(configs, temp_path) From d2fc50c97a793c564d25665bd4129d2888edb32c Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Mon, 10 Feb 2025 16:06:23 +0400 Subject: [PATCH 04/14] comparation --- tests/python_tests/common.py | 29 +------------------------ tests/python_tests/utils/comparation.py | 0 2 files changed, 1 insertion(+), 28 deletions(-) create mode 100644 tests/python_tests/utils/comparation.py diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py index 3e79ee15e2..f50400073a 100644 --- a/tests/python_tests/common.py +++ b/tests/python_tests/common.py @@ -16,6 +16,7 @@ from utils.generation_config import get_greedy, get_beam_search from utils.constants import default_ov_config from utils.hugging_face import convert_models, get_hugging_face_models, run_hugging_face +from utils.comparation import compare_generation_results TESTS_ROOT = Path(__file__).parent @@ -155,34 +156,6 @@ def run_llm_pipeline( return generation_results -def compare_generation_result(hf_result: GenerationResult, ov_result: GenerationResult, generation_config: GenerationConfig): - if generation_config.is_beam_search(): - assert len(hf_result.m_scores) == len(ov_result.m_scores) - for hf_score, ov_score in zip(hf_result.m_scores, ov_result.m_scores): - # Note, that for fp32 / fp16 models scores are different less than 0.001 - assert abs(hf_score - ov_score) < 0.02 - - if not generation_config.include_stop_str_in_output and len(generation_config.stop_strings) > 0: - assert len(hf_result.m_generation_ids) >= len(ov_result.m_generation_ids) - for hf_text, ov_text in zip(hf_result.m_generation_ids, ov_result.m_generation_ids): - assert ov_text in hf_text - else: - assert len(hf_result.m_generation_ids) == len(ov_result.m_generation_ids) - for hf_text, ov_text in zip(hf_result.m_generation_ids, ov_result.m_generation_ids): - assert hf_text == ov_text - - -def compare_generation_results(prompts: List[str], hf_results: List[GenerationResult], ov_results: List[GenerationResult], generation_configs: List[GenerationConfig] | GenerationConfig): - if type(generation_configs) is not list: - generation_configs = [generation_configs] - - assert len(prompts) == len(hf_results) - assert len(prompts) == len(ov_results) - - for prompt, ref_result, ov_result, generation_config in zip(prompts, hf_results, ov_results, generation_configs): - print(f"Prompt = {prompt}\nReference result = {ref_result}\nOpenVINO result = {ov_result.m_generation_ids}") - compare_generation_result(ref_result, ov_result, generation_config) - def run_llm_pipeline_with_ref(model_id: str, prompts: List[str], 
generation_config: GenerationConfig | dict, diff --git a/tests/python_tests/utils/comparation.py b/tests/python_tests/utils/comparation.py new file mode 100644 index 0000000000..e69de29bb2 From 4e03adc9c83a3d6eebe95359981825b7b0c032d4 Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Mon, 10 Feb 2025 16:07:34 +0400 Subject: [PATCH 05/14] move get_image from utils --- tests/python_tests/test_vlm_pipeline.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index 6bab5e706f..babf4a1c93 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -7,7 +7,6 @@ import transformers from optimum.intel.openvino import OVModelForVisualCausalLM from openvino_genai import VLMPipeline, GenerationConfig -from common import get_image_by_link from utils.generation_config import get_beam_search, get_multinomial_all_parameters from utils.constants import default_ov_config @@ -54,6 +53,20 @@ def get_ov_model(model_id, cache): "katuni4ka/tiny-random-qwen2vl", ] + +def get_image_by_link(link): + from PIL import Image + import requests + from openvino import Tensor + import numpy as np + + image = Image.open(requests.get(link, stream=True).raw) + if image.mode != 'RGB': + image = image.convert('RGB') + image_data = np.array((np.array(image.getdata()) - 128).astype(np.byte)).reshape(1, image.size[1], image.size[0], 3) + return Tensor(image_data) + + @pytest.mark.precommit @pytest.mark.nightly @pytest.mark.parametrize("model_id", model_ids) From 2ade2dcd888a9a131e29964c674420f58d8466be Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Mon, 10 Feb 2025 16:13:14 +0400 Subject: [PATCH 06/14] Tokenizer config --- tests/python_tests/data/__init__.py | 2 ++ tests/python_tests/{ => data}/tokenizer_configs.py | 2 ++ tests/python_tests/test_tokenizer.py | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) create mode 100644 tests/python_tests/data/__init__.py rename tests/python_tests/{ => data}/tokenizer_configs.py (99%) diff --git a/tests/python_tests/data/__init__.py b/tests/python_tests/data/__init__.py new file mode 100644 index 0000000000..6e922cea12 --- /dev/null +++ b/tests/python_tests/data/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2018-2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 \ No newline at end of file diff --git a/tests/python_tests/tokenizer_configs.py b/tests/python_tests/data/tokenizer_configs.py similarity index 99% rename from tests/python_tests/tokenizer_configs.py rename to tests/python_tests/data/tokenizer_configs.py index 2b51dc2b0d..a0bfd7be15 100644 --- a/tests/python_tests/tokenizer_configs.py +++ b/tests/python_tests/data/tokenizer_configs.py @@ -1,3 +1,5 @@ +# Copyright (C) 2018-2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 def get_tokenizer_configs(): return { diff --git a/tests/python_tests/test_tokenizer.py b/tests/python_tests/test_tokenizer.py index 6e7f53c79d..726a3163ce 100644 --- a/tests/python_tests/test_tokenizer.py +++ b/tests/python_tests/test_tokenizer.py @@ -94,7 +94,7 @@ def get_chat_templates(): "BramVanroy/Llama-2-13b-chat-dutch" } - from tokenizer_configs import get_tokenizer_configs + from data.tokenizer_configs import get_tokenizer_configs return [(k, v) for k, v in get_tokenizer_configs().items() if k not in skipped_models] From ef8283f80242893708b56b726f56b55001cf6b95 Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Mon, 10 Feb 2025 16:17:30 +0400 Subject: [PATCH 07/14] longbench 
--- tests/python_tests/common.py | 14 -------------- tests/python_tests/test_kv_cache_eviction.py | 2 +- .../{utils_longbench.py => utils/longbench.py} | 0 3 files changed, 1 insertion(+), 15 deletions(-) rename tests/python_tests/{utils_longbench.py => utils/longbench.py} (100%) diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py index f50400073a..f6a1dbdb32 100644 --- a/tests/python_tests/common.py +++ b/tests/python_tests/common.py @@ -211,20 +211,6 @@ def generate_and_compare_with_reference_text(models_path: Path, prompts: List[st for ref_text, ov_text in zip(ref_texts_for_this_prompt, ov_result.m_generation_ids): assert ref_text == ov_text - -def get_image_by_link(link): - from PIL import Image - import requests - from openvino import Tensor - import numpy as np - - image = Image.open(requests.get(link, stream=True).raw) - if image.mode != 'RGB': - image = image.convert('RGB') - image_data = np.array((np.array(image.getdata()) - 128).astype(np.byte)).reshape(1, image.size[1], image.size[0], 3) - return Tensor(image_data) - - """rt_info has the highest priority. Delete it to respect configs.""" def delete_rt_info(configs: List[Tuple], temp_path): core = openvino.Core() diff --git a/tests/python_tests/test_kv_cache_eviction.py b/tests/python_tests/test_kv_cache_eviction.py index ae01e94a75..81ae04bc3f 100644 --- a/tests/python_tests/test_kv_cache_eviction.py +++ b/tests/python_tests/test_kv_cache_eviction.py @@ -18,7 +18,7 @@ from transformers import AutoTokenizer from common import TESTS_ROOT, run_cb_pipeline_with_ref -from utils_longbench import dataset2maxlen, evaluate, preprocess_prompt, post_process_pred +from utils.longbench import dataset2maxlen, evaluate, preprocess_prompt, post_process_pred from utils.constants import default_ov_config diff --git a/tests/python_tests/utils_longbench.py b/tests/python_tests/utils/longbench.py similarity index 100% rename from tests/python_tests/utils_longbench.py rename to tests/python_tests/utils/longbench.py From 940ac3ec6dedd37a15e2a7c501b8a8e0e33b42de Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Mon, 10 Feb 2025 16:19:17 +0400 Subject: [PATCH 08/14] comp --- tests/python_tests/utils/comparation.py | 38 +++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/tests/python_tests/utils/comparation.py b/tests/python_tests/utils/comparation.py index e69de29bb2..6293d30397 100644 --- a/tests/python_tests/utils/comparation.py +++ b/tests/python_tests/utils/comparation.py @@ -0,0 +1,38 @@ +# Copyright (C) 2018-2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino_genai import GenerationResult, GenerationConfig +from typing import List + +def compare_generation_result(hf_result: GenerationResult, + ov_result: GenerationResult, + generation_config: GenerationConfig): + if generation_config.is_beam_search(): + assert len(hf_result.m_scores) == len(ov_result.m_scores) + for hf_score, ov_score in zip(hf_result.m_scores, ov_result.m_scores): + # Note, that for fp32 / fp16 models scores are different less than 0.001 + assert abs(hf_score - ov_score) < 0.02 + + if not generation_config.include_stop_str_in_output and len(generation_config.stop_strings) > 0: + assert len(hf_result.m_generation_ids) >= len(ov_result.m_generation_ids) + for hf_text, ov_text in zip(hf_result.m_generation_ids, ov_result.m_generation_ids): + assert ov_text in hf_text + else: + assert len(hf_result.m_generation_ids) == len(ov_result.m_generation_ids) + for hf_text, ov_text in zip(hf_result.m_generation_ids, 
ov_result.m_generation_ids): + assert hf_text == ov_text + + +def compare_generation_results(prompts: List[str], + hf_results: List[GenerationResult], + ov_results: List[GenerationResult], + generation_configs: List[GenerationConfig] | GenerationConfig): + if type(generation_configs) is not list: + generation_configs = [generation_configs] + + assert len(prompts) == len(hf_results) + assert len(prompts) == len(ov_results) + + for prompt, ref_result, ov_result, generation_config in zip(prompts, hf_results, ov_results, generation_configs): + print(f"Prompt = {prompt}\nReference result = {ref_result}\nOpenVINO result = {ov_result.m_generation_ids}") + compare_generation_result(ref_result, ov_result, generation_config) \ No newline at end of file From 333845f9631c61cfcc4725ab4c7f0637cd3b32af Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Mon, 10 Feb 2025 16:36:30 +0400 Subject: [PATCH 09/14] remove extra init --- tests/python_tests/__init__.py | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 tests/python_tests/__init__.py diff --git a/tests/python_tests/__init__.py b/tests/python_tests/__init__.py deleted file mode 100644 index 6e922cea12..0000000000 --- a/tests/python_tests/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (C) 2018-2025 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 \ No newline at end of file From 34b7303608b6188d5452a053a052f02ee64a82a5 Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Mon, 10 Feb 2025 16:38:06 +0400 Subject: [PATCH 10/14] remove extra --- tests/python_tests/__init__.py | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 tests/python_tests/__init__.py diff --git a/tests/python_tests/__init__.py b/tests/python_tests/__init__.py deleted file mode 100644 index 6e922cea12..0000000000 --- a/tests/python_tests/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (C) 2018-2025 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 \ No newline at end of file From 82ebb74e7a13ab1b4c811068bffb82e5383cccad Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Mon, 10 Feb 2025 19:29:00 +0400 Subject: [PATCH 11/14] upper case --- tests/python_tests/ov_genai_test_utils.py | 12 ++++++------ tests/python_tests/test_continuous_batching.py | 4 ++-- tests/python_tests/test_kv_cache_eviction.py | 12 ++++++------ tests/python_tests/test_llm_pipeline.py | 3 +-- tests/python_tests/test_llm_pipeline_static.py | 8 ++++---- tests/python_tests/test_vlm_pipeline.py | 4 ++-- tests/python_tests/utils/constants.py | 2 +- tests/python_tests/utils/hugging_face.py | 4 ++-- 8 files changed, 24 insertions(+), 25 deletions(-) diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py index e971ab14eb..b7aa0a5212 100644 --- a/tests/python_tests/ov_genai_test_utils.py +++ b/tests/python_tests/ov_genai_test_utils.py @@ -16,7 +16,7 @@ import openvino_genai as ov_genai from common import delete_rt_info -from utils.constants import default_ov_config +from utils.constants import DEFAULT_OV_CONFIG def get_models_list(): precommit_models = [ @@ -96,7 +96,7 @@ def read_model(params, **tokenizer_kwargs): if (models_path / "openvino_model.xml").exists(): opt_model = OVModelForCausalLM.from_pretrained(models_path, trust_remote_code=True, - compile=False, device='CPU', ov_config=default_ov_config) + compile=False, device='CPU', ov_config=DEFAULT_OV_CONFIG) else: ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(hf_tokenizer, with_detokenizer=True, @@ -108,7 +108,7 @@ def read_model(params, **tokenizer_kwargs): 
hf_tokenizer.save_pretrained(models_path) opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, - compile=False, device='CPU', load_in_8bit=False, ov_config=default_ov_config) + compile=False, device='CPU', load_in_8bit=False, ov_config=DEFAULT_OV_CONFIG) opt_model.generation_config.save_pretrained(models_path) opt_model.config.save_pretrained(models_path) opt_model.save_pretrained(models_path) @@ -118,7 +118,7 @@ def read_model(params, **tokenizer_kwargs): models_path, hf_tokenizer, opt_model, - ov_genai.LLMPipeline(models_path, 'CPU', ENABLE_MMAP=False, **default_ov_config), + ov_genai.LLMPipeline(models_path, 'CPU', ENABLE_MMAP=False, **DEFAULT_OV_CONFIG), ) @@ -183,7 +183,7 @@ def load_genai_pipe_with_configs(configs: List[Tuple], temp_path): with (temp_path / config_name).open('w') as f: json.dump(config_json, f) - ov_pipe = ov_genai.LLMPipeline(temp_path, 'CPU', **default_ov_config) + ov_pipe = ov_genai.LLMPipeline(temp_path, 'CPU', **DEFAULT_OV_CONFIG) for _, config_name in configs: os.remove(temp_path / config_name) @@ -193,4 +193,4 @@ def load_genai_pipe_with_configs(configs: List[Tuple], temp_path): @functools.lru_cache(1) def get_continuous_batching(path): - return ov_genai.LLMPipeline(path, 'CPU', scheduler_config=ov_genai.SchedulerConfig(), **default_ov_config) + return ov_genai.LLMPipeline(path, 'CPU', scheduler_config=ov_genai.SchedulerConfig(), **DEFAULT_OV_CONFIG) diff --git a/tests/python_tests/test_continuous_batching.py b/tests/python_tests/test_continuous_batching.py index 6272dec6df..d6b4bacce1 100644 --- a/tests/python_tests/test_continuous_batching.py +++ b/tests/python_tests/test_continuous_batching.py @@ -36,7 +36,7 @@ def read_models_list(file_name: str): from utils.generation_config import get_greedy, get_beam_search, \ get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \ get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p -from utils.constants import default_ov_config +from utils.constants import DEFAULT_OV_CONFIG from utils.hugging_face import get_hugging_face_models, convert_models # @@ -162,7 +162,7 @@ def test_post_oom_health(tmp_path, sampling_config): models_path : Path = tmp_path / model_id convert_models(opt_model, hf_tokenizer, models_path) - cb_pipe = ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU", **default_ov_config) + cb_pipe = ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU", **DEFAULT_OV_CONFIG) # First run should return incomplete response output = cb_pipe.generate(["What is OpenVINO?"], [generation_config]) diff --git a/tests/python_tests/test_kv_cache_eviction.py b/tests/python_tests/test_kv_cache_eviction.py index ae01e94a75..9fbed7fc77 100644 --- a/tests/python_tests/test_kv_cache_eviction.py +++ b/tests/python_tests/test_kv_cache_eviction.py @@ -20,7 +20,7 @@ from common import TESTS_ROOT, run_cb_pipeline_with_ref from utils_longbench import dataset2maxlen, evaluate, preprocess_prompt, post_process_pred -from utils.constants import default_ov_config +from utils.constants import DEFAULT_OV_CONFIG def load_prompts_dataset(file_name : str) -> Dict[str, List[str]]: @@ -47,7 +47,7 @@ class ConvertedModel: @pytest.fixture(scope='module') def converted_model(tmp_path_factory): model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" - model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, load_in_8bit=False, 
compile=False, ov_config=default_ov_config) + model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, load_in_8bit=False, compile=False, ov_config=DEFAULT_OV_CONFIG) tokenizer = AutoTokenizer.from_pretrained(model_id) models_path = tmp_path_factory.mktemp("cacheopt_test_models") / model_id model.save_pretrained(models_path) @@ -126,8 +126,8 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t scheduler_config_opt.enable_prefix_caching = enable_prefix_caching models_path = converted_model.models_path - model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU", {}, default_ov_config) - model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, "CPU", {}, default_ov_config) + model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU", {}, DEFAULT_OV_CONFIG) + model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, "CPU", {}, DEFAULT_OV_CONFIG) tokenizer = converted_model.tokenizer @@ -239,8 +239,8 @@ def test_optimized_generation_longbench(qwen2_converted_model, device, test_stru if scheduler_config_opt.use_cache_eviction: scheduler_config_opt.cache_eviction_config = LONGBENCH_CACHE_EVICTION_CONFIG - model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, device, {}, default_ov_config) - model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, device, {}, default_ov_config) + model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, device, {}, DEFAULT_OV_CONFIG) + model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, device, {}, DEFAULT_OV_CONFIG) model_name = "/".join(models_path.parts[-2:]) subset = test_struct.subset diff --git a/tests/python_tests/test_llm_pipeline.py b/tests/python_tests/test_llm_pipeline.py index 6e7790db13..52885dc280 100644 --- a/tests/python_tests/test_llm_pipeline.py +++ b/tests/python_tests/test_llm_pipeline.py @@ -20,7 +20,6 @@ model_tmp_path, ) from utils.hugging_face import generation_config_to_hf - # # e2e work # @@ -288,7 +287,7 @@ def test_chat_scenario_callback_cancel(model_descr): model_id, path, tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) ov_generation_config = GenerationConfig(**generation_config_kwargs) - hf_generation_config = convert_to_hf(opt_model.generation_config, ov_generation_config) + hf_generation_config = generation_config_to_hf(opt_model.generation_config, ov_generation_config) current_iter = 0 num_iters = 3 diff --git a/tests/python_tests/test_llm_pipeline_static.py b/tests/python_tests/test_llm_pipeline_static.py index d8b24d825e..0d2db33598 100644 --- a/tests/python_tests/test_llm_pipeline_static.py +++ b/tests/python_tests/test_llm_pipeline_static.py @@ -12,7 +12,7 @@ get_chat_models_list, read_model ) -from utils.constants import default_ov_config +from utils.constants import DEFAULT_OV_CONFIG from utils.generation_config import \ get_greedy, \ get_greedy_with_penalties, \ @@ -32,7 +32,7 @@ 'NPUW_ONLINE_PIPELINE': 'NONE', 'PREFILL_CONFIG': { }, 'GENERATE_CONFIG': { } - } | default_ov_config + } | DEFAULT_OV_CONFIG def generate_chat_history(model_path, device, pipeline_config, questions): @@ -54,7 +54,7 @@ def test_generation_compare_with_stateful(generation_config): prompt = 'What is OpenVINO?' 
model_path = read_model(get_models_list()[0])[1] - stateful_pipe = ov_genai.LLMPipeline(model_path, "CPU", **default_ov_config) + stateful_pipe = ov_genai.LLMPipeline(model_path, "CPU", **DEFAULT_OV_CONFIG) ref_out = stateful_pipe.generate(prompt, generation_config) static_pipe = ov_genai.LLMPipeline(model_path, "NPU", **common_config) @@ -220,7 +220,7 @@ def test_chat_generation(): model_path = read_model(get_chat_models_list()[0])[1] - chat_history_stateful = generate_chat_history(model_path, "CPU", default_ov_config, questions) + chat_history_stateful = generate_chat_history(model_path, "CPU", DEFAULT_OV_CONFIG, questions) chat_history_static = generate_chat_history(model_path, "NPU", common_config, questions) print('npu chat: \n{chat_history_static}\n') diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index 6bab5e706f..ee20f9133e 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -10,7 +10,7 @@ from common import get_image_by_link from utils.generation_config import get_beam_search, get_multinomial_all_parameters -from utils.constants import default_ov_config +from utils.constants import DEFAULT_OV_CONFIG def get_ov_model(model_id, cache): model_dir = cache.mkdir(model_id.split('/')[-1]) @@ -21,7 +21,7 @@ def get_ov_model(model_id, cache): ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(processor.tokenizer, with_detokenizer=True) openvino.save_model(ov_tokenizer, model_dir / "openvino_tokenizer.xml") openvino.save_model(ov_detokenizer, model_dir / "openvino_detokenizer.xml") - model = OVModelForVisualCausalLM.from_pretrained(model_id, compile=False, device="CPU", export=True, load_in_8bit=False, trust_remote_code=True, ov_config=default_ov_config) + model = OVModelForVisualCausalLM.from_pretrained(model_id, compile=False, device="CPU", export=True, load_in_8bit=False, trust_remote_code=True, ov_config=DEFAULT_OV_CONFIG) if processor.tokenizer.chat_template is not None: processor.chat_template = processor.tokenizer.chat_template # It seems that tiny-random-phi3-vision is saved incorrectly. That line works this around. 
processor.save_pretrained(model_dir) diff --git a/tests/python_tests/utils/constants.py b/tests/python_tests/utils/constants.py index b67ccca20f..a7f7e9db52 100644 --- a/tests/python_tests/utils/constants.py +++ b/tests/python_tests/utils/constants.py @@ -4,7 +4,7 @@ import openvino.properties.hint as hints import openvino as ov -default_ov_config = { +DEFAULT_OV_CONFIG = { hints.inference_precision : ov.Type.f32, hints.kv_cache_precision : ov.Type.f16, } \ No newline at end of file diff --git a/tests/python_tests/utils/hugging_face.py b/tests/python_tests/utils/hugging_face.py index e7087c0818..41636e701a 100644 --- a/tests/python_tests/utils/hugging_face.py +++ b/tests/python_tests/utils/hugging_face.py @@ -12,7 +12,7 @@ from openvino_genai import GenerationResult, GenerationConfig, StopCriteria from openvino_tokenizers import convert_tokenizer -from utils.constants import default_ov_config +from utils.constants import DEFAULT_OV_CONFIG def generation_config_to_hf( default_generation_config : HFGenerationConfig, @@ -156,7 +156,7 @@ def run_hugging_face( def get_hugging_face_models(model_id: str): hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, compile=False, load_in_8bit=False, trust_remote_code=True, ov_config=default_ov_config) + opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, compile=False, load_in_8bit=False, trust_remote_code=True, ov_config=DEFAULT_OV_CONFIG) return opt_model, hf_tokenizer From dba2a870c3fe2bd8dff0115fa1797d7103627bfc Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Mon, 10 Feb 2025 20:05:51 +0400 Subject: [PATCH 12/14] fix tests --- tests/python_tests/common.py | 6 +++--- tests/python_tests/ov_genai_test_utils.py | 12 ++++++------ tests/python_tests/test_continuous_batching.py | 4 ++-- tests/python_tests/test_kv_cache_eviction.py | 12 ++++++------ tests/python_tests/test_llm_pipeline_static.py | 8 ++++---- tests/python_tests/test_vlm_pipeline.py | 4 ++-- tests/python_tests/utils/constants.py | 9 +++++---- tests/python_tests/utils/hugging_face.py | 4 ++-- 8 files changed, 30 insertions(+), 29 deletions(-) diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py index 3e79ee15e2..db6e21fea3 100644 --- a/tests/python_tests/common.py +++ b/tests/python_tests/common.py @@ -14,7 +14,7 @@ from typing import List, Tuple, Callable from utils.generation_config import get_greedy, get_beam_search -from utils.constants import default_ov_config +from utils.constants import get_default_llm_propeties from utils.hugging_face import convert_models, get_hugging_face_models, run_hugging_face TESTS_ROOT = Path(__file__).parent @@ -66,7 +66,7 @@ def run_continuous_batching( if type(generation_configs) is not list: generation_configs = [generation_configs] * len(prompts) - cb_pipe = ContinuousBatchingPipeline(models_path, scheduler_config=scheduler_config, device='CPU', tokenizer_properties={}, properties=default_ov_config) + cb_pipe = ContinuousBatchingPipeline(models_path, scheduler_config=scheduler_config, device='CPU', tokenizer_properties={}, properties=get_default_llm_propeties()) output = cb_pipe.generate(prompts, generation_configs) del cb_pipe @@ -114,7 +114,7 @@ def run_llm_pipeline( use_cb : bool = False, streamer: StreamerWithResults | Callable | StreamerBase = None ) -> List[GenerationResult]: - properties = default_ov_config + properties = get_default_llm_propeties() if use_cb: properties['scheduler_config'] = SchedulerConfig() 
ov_pipe = LLMPipeline(models_path, device='CPU', **properties) diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py index b7aa0a5212..67bf51a9e4 100644 --- a/tests/python_tests/ov_genai_test_utils.py +++ b/tests/python_tests/ov_genai_test_utils.py @@ -16,7 +16,7 @@ import openvino_genai as ov_genai from common import delete_rt_info -from utils.constants import DEFAULT_OV_CONFIG +from utils.constants import get_default_llm_propeties def get_models_list(): precommit_models = [ @@ -96,7 +96,7 @@ def read_model(params, **tokenizer_kwargs): if (models_path / "openvino_model.xml").exists(): opt_model = OVModelForCausalLM.from_pretrained(models_path, trust_remote_code=True, - compile=False, device='CPU', ov_config=DEFAULT_OV_CONFIG) + compile=False, device='CPU', ov_config=get_default_llm_propeties()) else: ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(hf_tokenizer, with_detokenizer=True, @@ -108,7 +108,7 @@ def read_model(params, **tokenizer_kwargs): hf_tokenizer.save_pretrained(models_path) opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, - compile=False, device='CPU', load_in_8bit=False, ov_config=DEFAULT_OV_CONFIG) + compile=False, device='CPU', load_in_8bit=False, ov_config=get_default_llm_propeties()) opt_model.generation_config.save_pretrained(models_path) opt_model.config.save_pretrained(models_path) opt_model.save_pretrained(models_path) @@ -118,7 +118,7 @@ def read_model(params, **tokenizer_kwargs): models_path, hf_tokenizer, opt_model, - ov_genai.LLMPipeline(models_path, 'CPU', ENABLE_MMAP=False, **DEFAULT_OV_CONFIG), + ov_genai.LLMPipeline(models_path, 'CPU', ENABLE_MMAP=False, **get_default_llm_propeties()), ) @@ -183,7 +183,7 @@ def load_genai_pipe_with_configs(configs: List[Tuple], temp_path): with (temp_path / config_name).open('w') as f: json.dump(config_json, f) - ov_pipe = ov_genai.LLMPipeline(temp_path, 'CPU', **DEFAULT_OV_CONFIG) + ov_pipe = ov_genai.LLMPipeline(temp_path, 'CPU', **get_default_llm_propeties()) for _, config_name in configs: os.remove(temp_path / config_name) @@ -193,4 +193,4 @@ def load_genai_pipe_with_configs(configs: List[Tuple], temp_path): @functools.lru_cache(1) def get_continuous_batching(path): - return ov_genai.LLMPipeline(path, 'CPU', scheduler_config=ov_genai.SchedulerConfig(), **DEFAULT_OV_CONFIG) + return ov_genai.LLMPipeline(path, 'CPU', scheduler_config=ov_genai.SchedulerConfig(), **get_default_llm_propeties()) diff --git a/tests/python_tests/test_continuous_batching.py b/tests/python_tests/test_continuous_batching.py index d6b4bacce1..516f1234f9 100644 --- a/tests/python_tests/test_continuous_batching.py +++ b/tests/python_tests/test_continuous_batching.py @@ -36,7 +36,7 @@ def read_models_list(file_name: str): from utils.generation_config import get_greedy, get_beam_search, \ get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \ get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p -from utils.constants import DEFAULT_OV_CONFIG +from utils.constants import get_default_llm_propeties from utils.hugging_face import get_hugging_face_models, convert_models # @@ -162,7 +162,7 @@ def test_post_oom_health(tmp_path, sampling_config): models_path : Path = tmp_path / model_id convert_models(opt_model, hf_tokenizer, models_path) - cb_pipe = ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU", **DEFAULT_OV_CONFIG) + cb_pipe = 
ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU", **get_default_llm_propeties()) # First run should return incomplete response output = cb_pipe.generate(["What is OpenVINO?"], [generation_config]) diff --git a/tests/python_tests/test_kv_cache_eviction.py b/tests/python_tests/test_kv_cache_eviction.py index 9fbed7fc77..9e0631266d 100644 --- a/tests/python_tests/test_kv_cache_eviction.py +++ b/tests/python_tests/test_kv_cache_eviction.py @@ -20,7 +20,7 @@ from common import TESTS_ROOT, run_cb_pipeline_with_ref from utils_longbench import dataset2maxlen, evaluate, preprocess_prompt, post_process_pred -from utils.constants import DEFAULT_OV_CONFIG +from utils.constants import get_default_llm_propeties def load_prompts_dataset(file_name : str) -> Dict[str, List[str]]: @@ -47,7 +47,7 @@ class ConvertedModel: @pytest.fixture(scope='module') def converted_model(tmp_path_factory): model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" - model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, load_in_8bit=False, compile=False, ov_config=DEFAULT_OV_CONFIG) + model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, load_in_8bit=False, compile=False, ov_config=get_default_llm_propeties()) tokenizer = AutoTokenizer.from_pretrained(model_id) models_path = tmp_path_factory.mktemp("cacheopt_test_models") / model_id model.save_pretrained(models_path) @@ -126,8 +126,8 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t scheduler_config_opt.enable_prefix_caching = enable_prefix_caching models_path = converted_model.models_path - model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU", {}, DEFAULT_OV_CONFIG) - model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, "CPU", {}, DEFAULT_OV_CONFIG) + model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU", {}, get_default_llm_propeties()) + model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, "CPU", {}, get_default_llm_propeties()) tokenizer = converted_model.tokenizer @@ -239,8 +239,8 @@ def test_optimized_generation_longbench(qwen2_converted_model, device, test_stru if scheduler_config_opt.use_cache_eviction: scheduler_config_opt.cache_eviction_config = LONGBENCH_CACHE_EVICTION_CONFIG - model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, device, {}, DEFAULT_OV_CONFIG) - model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, device, {}, DEFAULT_OV_CONFIG) + model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, device, {}, get_default_llm_propeties()) + model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, device, {}, get_default_llm_propeties()) model_name = "/".join(models_path.parts[-2:]) subset = test_struct.subset diff --git a/tests/python_tests/test_llm_pipeline_static.py b/tests/python_tests/test_llm_pipeline_static.py index 0d2db33598..15f5324331 100644 --- a/tests/python_tests/test_llm_pipeline_static.py +++ b/tests/python_tests/test_llm_pipeline_static.py @@ -12,7 +12,7 @@ get_chat_models_list, read_model ) -from utils.constants import DEFAULT_OV_CONFIG +from utils.constants import get_default_llm_propeties from utils.generation_config import \ get_greedy, \ get_greedy_with_penalties, \ @@ -32,7 +32,7 @@ 'NPUW_ONLINE_PIPELINE': 'NONE', 'PREFILL_CONFIG': { }, 'GENERATE_CONFIG': { } - } | DEFAULT_OV_CONFIG + } | get_default_llm_propeties() def 
generate_chat_history(model_path, device, pipeline_config, questions): @@ -54,7 +54,7 @@ def test_generation_compare_with_stateful(generation_config): prompt = 'What is OpenVINO?' model_path = read_model(get_models_list()[0])[1] - stateful_pipe = ov_genai.LLMPipeline(model_path, "CPU", **DEFAULT_OV_CONFIG) + stateful_pipe = ov_genai.LLMPipeline(model_path, "CPU", **get_default_llm_propeties()) ref_out = stateful_pipe.generate(prompt, generation_config) static_pipe = ov_genai.LLMPipeline(model_path, "NPU", **common_config) @@ -220,7 +220,7 @@ def test_chat_generation(): model_path = read_model(get_chat_models_list()[0])[1] - chat_history_stateful = generate_chat_history(model_path, "CPU", DEFAULT_OV_CONFIG, questions) + chat_history_stateful = generate_chat_history(model_path, "CPU", get_default_llm_propeties(), questions) chat_history_static = generate_chat_history(model_path, "NPU", common_config, questions) print('npu chat: \n{chat_history_static}\n') diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index ee20f9133e..ea0fd36a90 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -10,7 +10,7 @@ from common import get_image_by_link from utils.generation_config import get_beam_search, get_multinomial_all_parameters -from utils.constants import DEFAULT_OV_CONFIG +from utils.constants import get_default_llm_propeties def get_ov_model(model_id, cache): model_dir = cache.mkdir(model_id.split('/')[-1]) @@ -21,7 +21,7 @@ def get_ov_model(model_id, cache): ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(processor.tokenizer, with_detokenizer=True) openvino.save_model(ov_tokenizer, model_dir / "openvino_tokenizer.xml") openvino.save_model(ov_detokenizer, model_dir / "openvino_detokenizer.xml") - model = OVModelForVisualCausalLM.from_pretrained(model_id, compile=False, device="CPU", export=True, load_in_8bit=False, trust_remote_code=True, ov_config=DEFAULT_OV_CONFIG) + model = OVModelForVisualCausalLM.from_pretrained(model_id, compile=False, device="CPU", export=True, load_in_8bit=False, trust_remote_code=True, ov_config=get_default_llm_propeties()) if processor.tokenizer.chat_template is not None: processor.chat_template = processor.tokenizer.chat_template # It seems that tiny-random-phi3-vision is saved incorrectly. That line works this around. 
processor.save_pretrained(model_dir) diff --git a/tests/python_tests/utils/constants.py b/tests/python_tests/utils/constants.py index a7f7e9db52..ef4c4b32e2 100644 --- a/tests/python_tests/utils/constants.py +++ b/tests/python_tests/utils/constants.py @@ -4,7 +4,8 @@ import openvino.properties.hint as hints import openvino as ov -DEFAULT_OV_CONFIG = { - hints.inference_precision : ov.Type.f32, - hints.kv_cache_precision : ov.Type.f16, -} \ No newline at end of file +def get_default_llm_propeties(): + return { + hints.inference_precision : ov.Type.f32, + hints.kv_cache_precision : ov.Type.f16, + } \ No newline at end of file diff --git a/tests/python_tests/utils/hugging_face.py b/tests/python_tests/utils/hugging_face.py index 41636e701a..d7f4f7e060 100644 --- a/tests/python_tests/utils/hugging_face.py +++ b/tests/python_tests/utils/hugging_face.py @@ -12,7 +12,7 @@ from openvino_genai import GenerationResult, GenerationConfig, StopCriteria from openvino_tokenizers import convert_tokenizer -from utils.constants import DEFAULT_OV_CONFIG +from utils.constants import get_default_llm_propeties def generation_config_to_hf( default_generation_config : HFGenerationConfig, @@ -156,7 +156,7 @@ def run_hugging_face( def get_hugging_face_models(model_id: str): hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, compile=False, load_in_8bit=False, trust_remote_code=True, ov_config=DEFAULT_OV_CONFIG) + opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, compile=False, load_in_8bit=False, trust_remote_code=True, ov_config=get_default_llm_propeties()) return opt_model, hf_tokenizer From 5029c671ea4da739e66a80f819435855c9e72166 Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Wed, 12 Feb 2025 11:58:56 +0400 Subject: [PATCH 13/14] tmp --- tests/python_tests/common.py | 6 +++--- tests/python_tests/ov_genai_test_utils.py | 12 ++++++------ tests/python_tests/test_continuous_batching.py | 4 ++-- tests/python_tests/test_kv_cache_eviction.py | 12 ++++++------ tests/python_tests/test_llm_pipeline_static.py | 8 ++++---- tests/python_tests/test_vlm_pipeline.py | 4 ++-- tests/python_tests/utils/constants.py | 2 +- tests/python_tests/utils/hugging_face.py | 4 ++-- 8 files changed, 26 insertions(+), 26 deletions(-) diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py index 7bc51a25ec..2cad0fa432 100644 --- a/tests/python_tests/common.py +++ b/tests/python_tests/common.py @@ -14,7 +14,7 @@ from typing import List, Tuple, Callable from utils.generation_config import get_greedy, get_beam_search -from utils.constants import get_default_llm_propeties +from utils.constants import get_default_llm_properties from utils.hugging_face import convert_models, get_hugging_face_models, run_hugging_face from utils.comparation import compare_generation_results @@ -67,7 +67,7 @@ def run_continuous_batching( if type(generation_configs) is not list: generation_configs = [generation_configs] * len(prompts) - cb_pipe = ContinuousBatchingPipeline(models_path, scheduler_config=scheduler_config, device='CPU', tokenizer_properties={}, properties=get_default_llm_propeties()) + cb_pipe = ContinuousBatchingPipeline(models_path, scheduler_config=scheduler_config, device='CPU', tokenizer_properties={}, properties=get_default_llm_properties()) output = cb_pipe.generate(prompts, generation_configs) del cb_pipe @@ -115,7 +115,7 @@ def run_llm_pipeline( use_cb : bool = False, streamer: StreamerWithResults | Callable | 
StreamerBase = None ) -> List[GenerationResult]: - properties = get_default_llm_propeties() + properties = get_default_llm_properties() if use_cb: properties['scheduler_config'] = SchedulerConfig() ov_pipe = LLMPipeline(models_path, device='CPU', **properties) diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py index 67bf51a9e4..03e95daf19 100644 --- a/tests/python_tests/ov_genai_test_utils.py +++ b/tests/python_tests/ov_genai_test_utils.py @@ -16,7 +16,7 @@ import openvino_genai as ov_genai from common import delete_rt_info -from utils.constants import get_default_llm_propeties +from utils.constants import get_default_llm_properties def get_models_list(): precommit_models = [ @@ -96,7 +96,7 @@ def read_model(params, **tokenizer_kwargs): if (models_path / "openvino_model.xml").exists(): opt_model = OVModelForCausalLM.from_pretrained(models_path, trust_remote_code=True, - compile=False, device='CPU', ov_config=get_default_llm_propeties()) + compile=False, device='CPU', ov_config=get_default_llm_properties()) else: ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(hf_tokenizer, with_detokenizer=True, @@ -108,7 +108,7 @@ def read_model(params, **tokenizer_kwargs): hf_tokenizer.save_pretrained(models_path) opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, - compile=False, device='CPU', load_in_8bit=False, ov_config=get_default_llm_propeties()) + compile=False, device='CPU', load_in_8bit=False, ov_config=get_default_llm_properties()) opt_model.generation_config.save_pretrained(models_path) opt_model.config.save_pretrained(models_path) opt_model.save_pretrained(models_path) @@ -118,7 +118,7 @@ def read_model(params, **tokenizer_kwargs): models_path, hf_tokenizer, opt_model, - ov_genai.LLMPipeline(models_path, 'CPU', ENABLE_MMAP=False, **get_default_llm_propeties()), + ov_genai.LLMPipeline(models_path, 'CPU', ENABLE_MMAP=False, **get_default_llm_properties()), ) @@ -183,7 +183,7 @@ def load_genai_pipe_with_configs(configs: List[Tuple], temp_path): with (temp_path / config_name).open('w') as f: json.dump(config_json, f) - ov_pipe = ov_genai.LLMPipeline(temp_path, 'CPU', **get_default_llm_propeties()) + ov_pipe = ov_genai.LLMPipeline(temp_path, 'CPU', **get_default_llm_properties()) for _, config_name in configs: os.remove(temp_path / config_name) @@ -193,4 +193,4 @@ def load_genai_pipe_with_configs(configs: List[Tuple], temp_path): @functools.lru_cache(1) def get_continuous_batching(path): - return ov_genai.LLMPipeline(path, 'CPU', scheduler_config=ov_genai.SchedulerConfig(), **get_default_llm_propeties()) + return ov_genai.LLMPipeline(path, 'CPU', scheduler_config=ov_genai.SchedulerConfig(), **get_default_llm_properties()) diff --git a/tests/python_tests/test_continuous_batching.py b/tests/python_tests/test_continuous_batching.py index b260e8ec09..ba3817c071 100644 --- a/tests/python_tests/test_continuous_batching.py +++ b/tests/python_tests/test_continuous_batching.py @@ -38,7 +38,7 @@ def read_models_list(file_name: str): from utils.generation_config import get_greedy, get_beam_search, \ get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \ get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p -from utils.constants import get_default_llm_propeties +from utils.constants import get_default_llm_properties from utils.hugging_face import get_hugging_face_models, convert_models # @@ -164,7 +164,7 @@ def 
test_post_oom_health(tmp_path, sampling_config): models_path : Path = tmp_path / model_id convert_models(opt_model, hf_tokenizer, models_path) - cb_pipe = ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU", **get_default_llm_propeties()) + cb_pipe = ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU", **get_default_llm_properties()) # First run should return incomplete response output = cb_pipe.generate(["What is OpenVINO?"], [generation_config]) diff --git a/tests/python_tests/test_kv_cache_eviction.py b/tests/python_tests/test_kv_cache_eviction.py index b4312fe579..dd9717b22e 100644 --- a/tests/python_tests/test_kv_cache_eviction.py +++ b/tests/python_tests/test_kv_cache_eviction.py @@ -20,7 +20,7 @@ from common import TESTS_ROOT, run_cb_pipeline_with_ref from utils.longbench import dataset2maxlen, evaluate, preprocess_prompt, post_process_pred -from utils.constants import get_default_llm_propeties +from utils.constants import get_default_llm_properties def load_prompts_dataset(file_name : str) -> Dict[str, List[str]]: @@ -47,7 +47,7 @@ class ConvertedModel: @pytest.fixture(scope='module') def converted_model(tmp_path_factory): model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" - model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, load_in_8bit=False, compile=False, ov_config=get_default_llm_propeties()) + model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, load_in_8bit=False, compile=False, ov_config=get_default_llm_properties()) tokenizer = AutoTokenizer.from_pretrained(model_id) models_path = tmp_path_factory.mktemp("cacheopt_test_models") / model_id model.save_pretrained(models_path) @@ -126,8 +126,8 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t scheduler_config_opt.enable_prefix_caching = enable_prefix_caching models_path = converted_model.models_path - model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU", {}, get_default_llm_propeties()) - model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, "CPU", {}, get_default_llm_propeties()) + model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU", {}, get_default_llm_properties()) + model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, "CPU", {}, get_default_llm_properties()) tokenizer = converted_model.tokenizer @@ -239,8 +239,8 @@ def test_optimized_generation_longbench(qwen2_converted_model, device, test_stru if scheduler_config_opt.use_cache_eviction: scheduler_config_opt.cache_eviction_config = LONGBENCH_CACHE_EVICTION_CONFIG - model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, device, {}, get_default_llm_propeties()) - model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, device, {}, get_default_llm_propeties()) + model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, device, {}, get_default_llm_properties()) + model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, device, {}, get_default_llm_properties()) model_name = "/".join(models_path.parts[-2:]) subset = test_struct.subset diff --git a/tests/python_tests/test_llm_pipeline_static.py b/tests/python_tests/test_llm_pipeline_static.py index 15f5324331..431d9d88c4 100644 --- a/tests/python_tests/test_llm_pipeline_static.py +++ b/tests/python_tests/test_llm_pipeline_static.py @@ -12,7 +12,7 @@ get_chat_models_list, 
read_model ) -from utils.constants import get_default_llm_propeties +from utils.constants import get_default_llm_properties from utils.generation_config import \ get_greedy, \ get_greedy_with_penalties, \ @@ -32,7 +32,7 @@ 'NPUW_ONLINE_PIPELINE': 'NONE', 'PREFILL_CONFIG': { }, 'GENERATE_CONFIG': { } - } | get_default_llm_propeties() + } | get_default_llm_properties() def generate_chat_history(model_path, device, pipeline_config, questions): @@ -54,7 +54,7 @@ def test_generation_compare_with_stateful(generation_config): prompt = 'What is OpenVINO?' model_path = read_model(get_models_list()[0])[1] - stateful_pipe = ov_genai.LLMPipeline(model_path, "CPU", **get_default_llm_propeties()) + stateful_pipe = ov_genai.LLMPipeline(model_path, "CPU", **get_default_llm_properties()) ref_out = stateful_pipe.generate(prompt, generation_config) static_pipe = ov_genai.LLMPipeline(model_path, "NPU", **common_config) @@ -220,7 +220,7 @@ def test_chat_generation(): model_path = read_model(get_chat_models_list()[0])[1] - chat_history_stateful = generate_chat_history(model_path, "CPU", get_default_llm_propeties(), questions) + chat_history_stateful = generate_chat_history(model_path, "CPU", get_default_llm_properties(), questions) chat_history_static = generate_chat_history(model_path, "NPU", common_config, questions) print('npu chat: \n{chat_history_static}\n') diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index 0271218903..a8f5f86360 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -9,7 +9,7 @@ from openvino_genai import VLMPipeline, GenerationConfig from utils.generation_config import get_beam_search, get_multinomial_all_parameters -from utils.constants import get_default_llm_propeties +from utils.constants import get_default_llm_properties def get_ov_model(model_id, cache): model_dir = cache.mkdir(model_id.split('/')[-1]) @@ -20,7 +20,7 @@ def get_ov_model(model_id, cache): ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(processor.tokenizer, with_detokenizer=True) openvino.save_model(ov_tokenizer, model_dir / "openvino_tokenizer.xml") openvino.save_model(ov_detokenizer, model_dir / "openvino_detokenizer.xml") - model = OVModelForVisualCausalLM.from_pretrained(model_id, compile=False, device="CPU", export=True, load_in_8bit=False, trust_remote_code=True, ov_config=get_default_llm_propeties()) + model = OVModelForVisualCausalLM.from_pretrained(model_id, compile=False, device="CPU", export=True, load_in_8bit=False, trust_remote_code=True, ov_config=get_default_llm_properties()) if processor.tokenizer.chat_template is not None: processor.chat_template = processor.tokenizer.chat_template # It seems that tiny-random-phi3-vision is saved incorrectly. That line works this around. 
processor.save_pretrained(model_dir) diff --git a/tests/python_tests/utils/constants.py b/tests/python_tests/utils/constants.py index ef4c4b32e2..d33b6b7bf1 100644 --- a/tests/python_tests/utils/constants.py +++ b/tests/python_tests/utils/constants.py @@ -4,7 +4,7 @@ import openvino.properties.hint as hints import openvino as ov -def get_default_llm_propeties(): +def get_default_llm_properties(): return { hints.inference_precision : ov.Type.f32, hints.kv_cache_precision : ov.Type.f16, diff --git a/tests/python_tests/utils/hugging_face.py b/tests/python_tests/utils/hugging_face.py index d7f4f7e060..eddb43412c 100644 --- a/tests/python_tests/utils/hugging_face.py +++ b/tests/python_tests/utils/hugging_face.py @@ -12,7 +12,7 @@ from openvino_genai import GenerationResult, GenerationConfig, StopCriteria from openvino_tokenizers import convert_tokenizer -from utils.constants import get_default_llm_propeties +from utils.constants import get_default_llm_properties def generation_config_to_hf( default_generation_config : HFGenerationConfig, @@ -156,7 +156,7 @@ def run_hugging_face( def get_hugging_face_models(model_id: str): hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, compile=False, load_in_8bit=False, trust_remote_code=True, ov_config=get_default_llm_propeties()) + opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, compile=False, load_in_8bit=False, trust_remote_code=True, ov_config=get_default_llm_properties()) return opt_model, hf_tokenizer From 56ef645f25298fe8f8d83eaf5d3ca013427bdb2e Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Wed, 12 Feb 2025 13:01:03 +0400 Subject: [PATCH 14/14] Fix llm test --- tests/python_tests/test_llm_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_tests/test_llm_pipeline.py b/tests/python_tests/test_llm_pipeline.py index 721d2900b3..7dd4d98708 100644 --- a/tests/python_tests/test_llm_pipeline.py +++ b/tests/python_tests/test_llm_pipeline.py @@ -172,7 +172,7 @@ def test_chat_scenario_several_chats_in_series(): generation_config_kwargs, _ = chat_intpus[0] ov_generation_config = GenerationConfig(**generation_config_kwargs) - hf_generation_config = convert_to_hf(opt_model.generation_config, ov_generation_config) + hf_generation_config = generation_config_to_hf(opt_model.generation_config, ov_generation_config) for i in range(2): chat_history_hf = []
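
Taken together, the patches above finish moving the shared test helpers into tests/python_tests/utils (constants, generation_config, hugging_face, longbench, comparation). A minimal sketch of how these pieces are meant to compose after the refactor follows; it is illustrative only — model_id and models_path are placeholders, and the run_hugging_face signature is assumed from its use elsewhere in common.py rather than taken verbatim from these diffs.

# Illustrative sketch, not part of the patch series: exercises the relocated
# helpers end to end for a single greedy-decoding prompt.
from pathlib import Path

from openvino_genai import ContinuousBatchingPipeline, SchedulerConfig

from utils.constants import get_default_llm_properties
from utils.generation_config import get_greedy
from utils.hugging_face import get_hugging_face_models, convert_models, run_hugging_face
from utils.comparation import compare_generation_results

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"     # placeholder model
models_path = Path("/tmp/ov_models") / model_id     # placeholder output dir

prompts = ["What is OpenVINO?"]
generation_config = get_greedy()

# Export the HF model and convert tokenizer + IR into models_path.
opt_model, hf_tokenizer = get_hugging_face_models(model_id)
convert_models(opt_model, hf_tokenizer, models_path)

# Reference generation via HF/optimum; the exact signature is an assumption here.
hf_results = run_hugging_face(opt_model, hf_tokenizer, prompts, generation_config)

# OpenVINO GenAI generation with the shared defaults (f32 inference, f16 KV cache).
cb_pipe = ContinuousBatchingPipeline(models_path,
                                     scheduler_config=SchedulerConfig(),
                                     device='CPU',
                                     tokenizer_properties={},
                                     properties=get_default_llm_properties())
ov_results = cb_pipe.generate(prompts, [generation_config] * len(prompts))

# Text (and, for beam search, score) comparison now lives in utils/comparation.py.
compare_generation_results(prompts, hf_results, ov_results, generation_config)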