From b0937e2e4c541c80da680c75269a15d4cb198079 Mon Sep 17 00:00:00 2001
From: Robin Picard <83579270+RobinPicard@users.noreply.github.com>
Date: Fri, 21 Feb 2025 22:36:52 +0100
Subject: [PATCH] Refactor the Transformers and TransformersVision models (#1430)

The objective of this commit is to adapt `Transformers` and
`TransformersVision` to the model interface used in v1.0. The main changes are:
- make the models inherit from the base class `Model` and create a
  `ModelTypeAdapter` for them
- modify the signature of the `generate` method and make it the only way of
  generating text with the model
- modify the handling of model/tokenizer optional arguments so that we no
  longer manipulate them ourselves (the user is responsible for choosing the
  ones they need)
- delete the tests for those models in the `generate` directory and test them
  instead in the `models` directory
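As an illustration of the resulting interface, here is a minimal usage sketch.
It is pieced together from the description above and from the test imports
below rather than taken from this patch: the exact `generate` signature, the
`Regex` output type, and the pass-through of `max_new_tokens` to the
underlying `transformers` call are assumptions and may differ from the actual
implementation.

import transformers

import outlines
from outlines.types import Regex

# The user loads the model and tokenizer themselves and passes them in;
# the refactored class no longer adds or removes optional arguments.
model_name = "microsoft/Phi-3-mini-4k-instruct"
model = outlines.from_transformers(
    transformers.AutoModelForCausalLM.from_pretrained(model_name),
    transformers.AutoTokenizer.from_pretrained(model_name),
)

# `generate` is the single entry point; inference keyword arguments are
# forwarded to the underlying `transformers` generation call.
text = model.generate("Describe the weather in one sentence.", None, max_new_tokens=30)

# With an output type, the model's type adapter constrains generation,
# here to a three-digit number.
number = model.generate("Pick a number: ", Regex(r"[0-9]{3}"), max_new_tokens=10)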
---
 docs/reference/models/transformers.md    |  4 +++-
 outlines/models/transformers.py          |  2 +-
 outlines/types/__init__.py               |  2 +-
 tests/generate/test_generate.py          | 30 ++++++++++++------------
 tests/models/test_transformers.py        | 11 +++++++++
 tests/models/test_transformers_vision.py |  1 +
 6 files changed, 32 insertions(+), 18 deletions(-)

diff --git a/docs/reference/models/transformers.md b/docs/reference/models/transformers.md
index 724a724c3..dae38dd1f 100644
--- a/docs/reference/models/transformers.md
+++ b/docs/reference/models/transformers.md
@@ -10,8 +10,10 @@
 
 You can use `outlines.from_transformers` to load a `transformers` model and tokenizer:
 
+You can also provide keyword arguments in an optional `model_kwargs` parameter. Those will be passed to the `from_pretrained` method of the model class. One such argument is `device_map`, which allows you to specify the device on which the model will be loaded.
+
+For instance:
 ```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
 from outlines import models
 
 model_name = "microsoft/Phi-3-mini-4k-instruct"
diff --git a/outlines/models/transformers.py b/outlines/models/transformers.py
index b65d0cefb..c52f33bd8 100644
--- a/outlines/models/transformers.py
+++ b/outlines/models/transformers.py
@@ -316,7 +316,7 @@ def __init__(self, model: "PreTrainedModel", processor):
         self.processor.padding_side = "left"
         self.processor.pad_token = "[PAD]"
 
-        tokenizer: "PreTrainedTokenizer" = self.processor.tokenizer
+        tokenizer = self.processor.tokenizer
 
         super().__init__(model, tokenizer)
 
diff --git a/outlines/types/__init__.py b/outlines/types/__init__.py
index daf8cd354..bbab3fde9 100644
--- a/outlines/types/__init__.py
+++ b/outlines/types/__init__.py
@@ -5,9 +5,9 @@
 
 from jsonschema import Draft202012Validator as Validator
 from jsonschema.exceptions import SchemaError
+from outlines_core.fsm.json_schema import build_regex_from_schema
 from pydantic import BaseModel, TypeAdapter
 from typing_extensions import _TypedDictMeta  # type: ignore
-from outlines_core.fsm.json_schema import build_regex_from_schema
 
 from . import airports, countries, locale
 from outlines.types.dsl import (
diff --git a/tests/generate/test_generate.py b/tests/generate/test_generate.py
index 1bf3f2a41..2e6ab1308 100644
--- a/tests/generate/test_generate.py
+++ b/tests/generate/test_generate.py
@@ -302,21 +302,21 @@ def test_generate_json(request, model_fixture, sample_schema):
     generator(**get_inputs(model_fixture), max_tokens=100)
 
 
-# TODO: add support for genson in the Regex type of v1.0
-# def test_integrate_genson_generate_json(request):
-#     from genson import SchemaBuilder
-#
-#     builder = SchemaBuilder()
-#     builder.add_schema({"type": "object", "properties": {}})
-#     builder.add_object({"name": "Toto", "age": 5})
-#
-#     model = request.getfixturevalue("model_transformers_opt125m")
-#
-#     generator = generate.json(model, builder)
-#     res = generator("Return a json of a young boy")
-#
-#     assert "name" in res
-#     assert "age" in res
+@pytest.mark.xfail(reason="Genson has not been added to JsonType.")
+def test_integrate_genson_generate_json(request):
+    from genson import SchemaBuilder
+
+    builder = SchemaBuilder()
+    builder.add_schema({"type": "object", "properties": {}})
+    builder.add_object({"name": "Toto", "age": 5})
+
+    model = request.getfixturevalue("model_transformers_opt125m")
+
+    generator = generate.json(model, builder)
+    res = generator("Return a json of a young boy")
+
+    assert "name" in res
+    assert "age" in res
 
 
 @pytest.mark.parametrize("model_fixture", ALL_MODEL_FIXTURES)
diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py
index f2fb90424..8edd9979e 100644
--- a/tests/models/test_transformers.py
+++ b/tests/models/test_transformers.py
@@ -38,6 +38,17 @@ def test_transformers_instantiate_mamba():
     assert isinstance(model, Transformers)
 
 
+def test_transformers_instantiate_tokenizer_kwargs():
+    model = outlines.from_transformers(
+        transformers.AutoModelForCausalLM.from_pretrained(TEST_MODEL),
+        transformers.AutoTokenizer.from_pretrained(
+            TEST_MODEL, additional_special_tokens=["", ""]
+        ),
+    )
+    assert "" in model.tokenizer.special_tokens
+    assert "" in model.tokenizer.special_tokens
+
+
 @pytest.fixture
 def model():
     model = outlines.from_transformers(
diff --git a/tests/models/test_transformers_vision.py b/tests/models/test_transformers_vision.py
index 57dedcb69..6a534eaa1 100644
--- a/tests/models/test_transformers_vision.py
+++ b/tests/models/test_transformers_vision.py
@@ -18,6 +18,7 @@
 from outlines.models.transformers import TransformersVision
 from outlines.types import Choice, JsonType, Regex
 
+
 TEST_MODEL = "trl-internal-testing/tiny-LlavaForConditionalGeneration"
 TEST_CLIP_MODEL = "openai/clip-vit-base-patch32"
 IMAGE_URLS = [