Fix Tokenizer for several added special tokens #1659

Merged
33 changes: 21 additions & 12 deletions src/cpp/src/make_tokenizer_stateful.cpp
@@ -14,34 +14,43 @@
 using namespace ov;
 using namespace ov::op;
 
 bool ov::genai::MakeCombineSegmentsSatateful::run_on_model(const std::shared_ptr<ov::Model>& model) {
 
     std::shared_ptr<ov::Node> combine_seg_node;
     for (auto node: model->get_ordered_ops()) {
         if (strcmp(node->get_type_info().name, "CombineSegments") == 0) {
             combine_seg_node = node;
         }
     }
-    if (!combine_seg_node || combine_seg_node->input_value(1).get_element_type() != ov::element::i32) {
+    if (!combine_seg_node) {

[Review comment, Contributor Author: "Input type is always i32, this is ensured in validate_and_infer_types()"]

         return false;
     }
 
-    std::shared_ptr<v0::Constant> input_1_const = std::dynamic_pointer_cast<v0::Constant>(combine_seg_node->get_input_node_shared_ptr(1));
-    if (!input_1_const) {
-        return false;
-    }
+    size_t num_segments = (combine_seg_node->get_input_size() - 1) / 3;
+    std::vector<Input<Node>> const_inputs;
+    const_inputs.reserve(num_segments);
+
+    for (size_t i = 0; i < num_segments; i++) {
+        // If input is constant then it's special tokens, otherwise it's tokens from input text.
+        auto const_input = std::dynamic_pointer_cast<v0::Constant>(combine_seg_node->get_input_node_shared_ptr(3*i + 1));
+        if (const_input) {
+            const_inputs.emplace_back(combine_seg_node->input(3*i + 1));
+        }
+    }
+    if (const_inputs.empty()) {
+        return false;
+    }
 
-    op::util::VariableInfo var_info{ov::Shape{}, ov::element::boolean, ADD_SPECIAL_TOKENS_VAR_ID};
-    auto variable = std::make_shared<op::util::Variable>(var_info);
-
     // Default mode is add_special_tokens.
     auto default_mode_const = std::make_shared<v0::Constant>(ov::element::boolean, ov::Shape{}, std::vector{true});
+    auto variable = std::make_shared<op::util::Variable>(op::util::VariableInfo{Shape{}, element::boolean, ADD_SPECIAL_TOKENS_VAR_ID});
     auto read_value = std::make_shared<v6::ReadValue>(default_mode_const, variable);
     auto zero_constant = std::make_shared<v0::Constant>(ov::element::i32, ov::Shape{}, std::vector{0});
-    auto select_node = std::make_shared<v1::Select>(read_value, input_1_const, zero_constant);
-    combine_seg_node->input(1).replace_source_output(select_node->output(0));
+
+    for (size_t i = 0; i < const_inputs.size(); i++) {
+        auto select_node = std::make_shared<v1::Select>(read_value, const_inputs[i].get_source_output(), zero_constant);
+        const_inputs[i].replace_source_output(select_node);
+    }
 
     auto assign = std::make_shared<v6::Assign>(read_value, variable);
 
     model->add_sinks({assign});
     model->add_variables({variable});
     return true;
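
In effect, the transformation now walks every segment of CombineSegments and routes each constant input (index 3*i + 1, i.e. a special-token segment) through a Select gated by one shared ADD_SPECIAL_TOKENS state variable, so disabling the flag empties all special-token segments instead of only the first one. A minimal usage sketch from the Python side, assuming a tokenizer already converted to a local directory (the path and prompt below are illustrative, not from this PR):

import openvino_genai

# Loads openvino_tokenizer.xml produced by openvino_tokenizers conversion.
tokenizer = openvino_genai.Tokenizer("path/to/tokenizer_dir")

# Each call sets the boolean state variable that the inserted Select nodes read.
ids_with_special = tokenizer.encode("Hello world", add_special_tokens=True).input_ids.data
ids_plain = tokenizer.encode("Hello world", add_special_tokens=False).input_ids.data

# With this fix the two results differ for tokenizers that add several special
# tokens (e.g. both BOS and EOS), not only for those that add a single one.
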
14 changes: 9 additions & 5 deletions tests/python_tests/common.py
@@ -8,7 +8,7 @@

 from optimum.intel import OVModelForCausalLM
 from pathlib import Path
-from openvino_genai import ContinuousBatchingPipeline, LLMPipeline, SchedulerConfig, GenerationResult, GenerationConfig, DecodedResults, StopCriteria, StreamerBase
+from openvino_genai import ContinuousBatchingPipeline, LLMPipeline, SchedulerConfig, GenerationResult, GenerationConfig, DecodedResults, StopCriteria, StreamerBase, Tokenizer
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from transformers import GenerationConfig as HFGenerationConfig
 from typing import List, Tuple, Callable
@@ -462,13 +462,17 @@ def convert_models(opt_model : OVModelForCausalLM, hf_tokenizer : AutoTokenizer,
     opt_model.generation_config.save_pretrained(models_path)
 
     # convert tokenizers as well
+    convert_and_save_tokenizer(hf_tokenizer, models_path)
+
+
+def convert_and_save_tokenizer(hf_tokenizer : AutoTokenizer, models_path: Path):
     from openvino_tokenizers import convert_tokenizer
-    from openvino import serialize
+    from openvino import save_model
 
     tokenizer, detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True)
-    serialize(tokenizer, models_path / "openvino_tokenizer.xml")
-    serialize(detokenizer, models_path / "openvino_detokenizer.xml")
+    save_model(tokenizer, models_path / "openvino_tokenizer.xml")
+    save_model(detokenizer, models_path / "openvino_detokenizer.xml")
 
 
 def run_llm_pipeline_with_ref(model_id: str,
                               prompts: List[str],
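
For context, the extracted helper can also be used on its own; a sketch under the assumption that transformers and openvino_tokenizers are installed (the model id and output directory are only examples, and openvino.save_model is the recommended replacement for the deprecated openvino.serialize):

from pathlib import Path
from transformers import AutoTokenizer
from common import convert_and_save_tokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
convert_and_save_tokenizer(hf_tokenizer, Path("converted"))
# Writes converted/openvino_tokenizer.xml and converted/openvino_detokenizer.xml
# (plus the accompanying .bin files).
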
2 changes: 1 addition & 1 deletion tests/python_tests/test_sampling.py
@@ -72,7 +72,7 @@ def test_stop_strings(tmp_path, generation_config):
 def test_greedy(tmp_path, generation_config, prompt, use_cb):
     model_id : str = "katuni4ka/tiny-random-phi3"
     if sys.platform.startswith('win') and prompt.startswith('你'):
-        pytest.skip("For unknown reason this prompt fails on Win")
+        pytest.skip("CVS-160780 - Fails on Win with 'RuntimeError: No mapping for the Unicode character exists in the target multi-byte code page'")
 
     run_llm_pipeline_with_ref(model_id=model_id,
                               prompts=[prompt],
35 changes: 22 additions & 13 deletions tests/python_tests/test_tokenizer.py
@@ -2,19 +2,19 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import os
+import sys
 import pytest
 import numpy as np
 from transformers import AutoTokenizer
 from typing import Dict, Tuple, List
 import openvino_genai
 import json
 
-from common import delete_rt_info
+from common import delete_rt_info, convert_and_save_tokenizer
 from ov_genai_test_utils import (
     get_models_list,
     get_chat_models_list,
     read_model,
-    model_tmp_path
+    model_tmp_path,
 )


@@ -220,22 +220,31 @@ def test_set_chat_template():
     'Why is the Sun yellow?',
     'What was my first question?',
     ['Why is the Sun yellow?'],
+    "若我有一亿美元,在人工智能盛行的今天,我怎样投资才能收益最大化?",
+    "如果您有任何疑问,请联系我们,我们将予以解答。",
+    "מחרוזת בדיקה",
     "Multiline\nstring!\nWow!",
 ]
+@pytest.mark.parametrize("model_id", [
+    "katuni4ka/tiny-random-phi3",
+    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    # ("black-forest-labs/FLUX.1-dev", dict(subfolder="tokenizer")), # FLUX.1-dev has tokenizer in subfolder
+])
 @pytest.mark.precommit
 @pytest.mark.nightly
 @pytest.mark.parametrize("prompt", prompts)
-def test_encode_decode_with_special_tokens_option(prompt):
-    import numpy as np
-    model_descr = get_models_list()[0]
-    model_id, path, hf_tokenizer, model_opt, ov_pipe = read_model((model_descr[0], model_descr[1]))
-    ov_tokenzier = ov_pipe.get_tokenizer()
+def test_special_tokens(tmp_path, prompt, model_id):
+    if sys.platform.startswith('win') and isinstance(prompt, str) and (prompt.startswith('如') or prompt.endswith('ה')):
+        pytest.skip("CVS-160780 - Fails on Win with 'RuntimeError: No mapping for the Unicode character exists in the target multi-byte code page'")
+
+    model_id, hf_tok_load_params = (model_id[0], model_id[1]) if isinstance(model_id, tuple) else (model_id, {})
+
+    hf_tokenizer = AutoTokenizer.from_pretrained(model_id, **hf_tok_load_params)
+    convert_and_save_tokenizer(hf_tokenizer, tmp_path)
+    ov_tokenizer = openvino_genai.Tokenizer(tmp_path)
 
     # Calling encode with 'add_special_tokens' will set state flag.
-    ov_res_add_spec = ov_tokenzier.encode(prompt, add_special_tokens=True).input_ids.data
-    ov_res_no_spec = ov_tokenzier.encode(prompt, add_special_tokens=False).input_ids.data
+    ov_res_add_spec = ov_tokenizer.encode(prompt, add_special_tokens=True).input_ids.data
+    ov_res_no_spec = ov_tokenizer.encode(prompt, add_special_tokens=False).input_ids.data
     hf_res_add_spec = hf_tokenizer(prompt, return_tensors="np", add_special_tokens=True)["input_ids"]
     hf_res_no_spec = hf_tokenizer(prompt, return_tensors="np", add_special_tokens=False)["input_ids"]
     assert np.all(ov_res_add_spec == hf_res_add_spec)

@@ -246,8 +255,8 @@ def test_encode_decode_with_special_tokens_option(prompt):
     assert hf_res_add_spec.size != hf_res_no_spec.size
 
     # Decode with 'skip_special_tokens'
-    decoded_genai_skip_spec = ov_tokenzier.decode(hf_res_add_spec, skip_special_tokens=True)[0]
-    decoded_genai_no_skip = ov_tokenzier.decode(hf_res_add_spec, skip_special_tokens=False)[0]
+    decoded_genai_skip_spec = ov_tokenizer.decode(hf_res_add_spec, skip_special_tokens=True)[0]
+    decoded_genai_no_skip = ov_tokenizer.decode(hf_res_add_spec, skip_special_tokens=False)[0]
     decoded_hf_skip_spec = hf_tokenizer.decode(hf_res_add_spec[0], skip_special_tokens=True)
     decoded_hf_no_skip = hf_tokenizer.decode(hf_res_add_spec[0], skip_special_tokens=False)
     assert decoded_genai_skip_spec == decoded_hf_skip_spec