diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8fe83f89a..4f3e307b4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -28,5 +28,5 @@ repos: rev: v1.2.0 hooks: - id: mypy - args: [--allow-redefinition] + args: [--allow-redefinition, --install-types, --non-interactive] exclude: ^examples/ diff --git a/outlines/fsm/json_schema.py b/outlines/fsm/json_schema.py index 891f3d9c9..dd9a51e12 100644 --- a/outlines/fsm/json_schema.py +++ b/outlines/fsm/json_schema.py @@ -2,20 +2,40 @@ import itertools as it import json import re -from typing import Callable, Union +from typing import Any, Callable, Dict, Optional, Type, Union from jsonschema.protocols import Validator +from jsonschema.validators import validator_for from pydantic import BaseModel, create_model from referencing import Registry, Resource from referencing._core import Resolver -from referencing.jsonschema import DRAFT202012 +from referencing.jsonschema import DRAFT202012, Schema + +DEFINITIONS = { + "__whitespace__": r"(?:[ \t\n\r]*)", + "__json_object__": r"\{\s*(?&__members__)?\s*\}", + "__members__": r"(?&__member__)(\s*,\s*(?&__member__))*", + "__member__": r"(?&__string__)\s*:\s*(?&__json_value__)", + "__json_array__": r"\[\s*((?&__json_value__)(\s*,\s*(?&__json_value__))*)?\s*\]", + "__string_inner__": r"""(?:[^"\\\x00-\x1f\x7f-\x9f]|\\.)""", + "__string__": r'"(?&__string_inner__)*"', + "__number__": r"(?&__integer__)(?&__fraction__)?(?&__exponent__)?", + "__integer__": r"-?(?:0|[1-9][0-9]*)", + "__fraction__": r"\.[0-9]*", + "__exponent__": r"[eE][-+]?[0-9]+", + "__boolean__": r"true|false", + "__null__": r"null", + "__json_value__": r"(?&__string__)|(?&__number__)|(?&__json_object__)|(?&__json_array__)|(?&__boolean__)|(?&__null__)", +} -STRING_INNER = r'(?:[^"\\\x00-\x1f\x7f-\x9f]|\\.)' -STRING = f'"{STRING_INNER}*"' -INTEGER = r"(0|[1-9][0-9]*)" -NUMBER = rf"(-)?({INTEGER})(\.[0-9]+)?([eE][+-][0-9]+)?" -BOOLEAN = r"(true|false)" -NULL = r"null" +WHITESPACE = r"(?&__whitespace__)" +STRING_INNER = r"(?&__string_inner__)" +STRING = r"(?&__string__)" +INTEGER = r"(?&__integer__)" +NUMBER = r"(?&__number__)" +BOOLEAN = r"(?&__boolean__)" +NULL = r"(?&__null__)" +JSON_VALUE = r"(?&__json_value__)" type_to_regex = { "string": STRING, @@ -26,7 +46,9 @@ } -def build_regex_from_object(object: Union[str, Callable, BaseModel]): +def build_regex_from_object( + object: Union[str, Callable, BaseModel, Type[BaseModel]] +) -> str: """Turn a JSON schema into a regex that matches any JSON object that follows this schema. @@ -41,7 +63,8 @@ def build_regex_from_object(object: Union[str, Callable, BaseModel]): Parameters ---------- schema - A string that represents a JSON Schema. + A JSON schema or a Python object that can be converted to a JSON schema. + See [0] for more information. Returns ------- @@ -55,26 +78,32 @@ def build_regex_from_object(object: Union[str, Callable, BaseModel]): """ - if isinstance(object, type(BaseModel)): + # Extract the schema from the object + schema: Schema + if isinstance(object, BaseModel): schema = object.model_json_schema() + elif isinstance(object, type) and issubclass(object, BaseModel): + schema = object.model_json_schema() # type: ignore elif callable(object): schema = get_schema_from_signature(object) else: schema = json.loads(object) - Validator.check_schema(schema) + # Validate the schema against the JSON Schema specification + validator: Validator = validator_for(schema) + validator.check_schema(schema) # type: ignore # Build reference resolver - schema = Resource(contents=schema, specification=DRAFT202012) - uri = schema.id() if schema.id() is not None else "" - registry = Registry().with_resource(uri=uri, resource=schema) + resource = Resource(contents=schema, specification=DRAFT202012) + uri = resource.id() or "" + registry = Registry().with_resource(uri=uri, resource=resource) resolver = registry.resolver() - content = schema.contents + content = resource.contents return to_regex(resolver, content) -def to_regex(resolver: Resolver, instance: dict): +def to_regex(resolver: Optional[Resolver], instance: Schema) -> str: """Translate a JSON Schema instance into a regex that validates the schema. Note @@ -97,158 +126,234 @@ def to_regex(resolver: Resolver, instance: dict): instance The instance to translate """ - whitespace = r"[\n ]*" - - if "properties" in instance: - regex = "" - regex += r"\{" - for i, (name, value) in enumerate(instance["properties"].items()): - regex += f'{whitespace}"{name}"{whitespace}:{whitespace}' - regex += to_regex(resolver, value) - - # No comma after the last key-value pair in JSON - if i < len(instance["properties"]) - 1: - regex += f"{whitespace}," - - regex += f"{whitespace}" + r"\}" - - return regex - - # To validate against allOf, the given data must be valid against all of the - # given subschemas. - elif "allOf" in instance: - subregexes = [to_regex(resolver, t) for t in instance["allOf"]] - subregexes_str = [f"{subregex}" for subregex in subregexes] - return rf"({''.join(subregexes_str)})" - - # To validate against `anyOf`, the given data must be valid against - # any (one or more) of the given subschemas. - elif "anyOf" in instance: - subregexes = [to_regex(resolver, t) for t in instance["anyOf"]] - combinations = [ - "(" + "".join(c) + ")" - for r in range(1, len(subregexes) + 1) - for c in it.permutations(subregexes, r) - ] - - return rf"({'|'.join(combinations)})" - - # To validate against oneOf, the given data must be valid against exactly - # one of the given subschemas. - elif "oneOf" in instance: - subregexes = [to_regex(resolver, t) for t in instance["oneOf"]] - - xor_patterns = [] - # json schema validation ensured there is no overlapping schemas in oneOf - for subregex in subregexes: - other_subregexes = filter(lambda r: r != subregex, subregexes) - other_subregexes_str = "|".join([f"{s}" for s in other_subregexes]) - negative_lookahead = f"(?!.*({other_subregexes_str}))" - xor_patterns.append(f"({subregex}){negative_lookahead}") - - return rf"({'|'.join(xor_patterns)})" - - # The enum keyword is used to restrict a value to a fixed set of values. It - # must be an array with at least one element, where each element is unique. - elif "enum" in instance: - choices = [] - for choice in instance["enum"]: - if type(choice) in [int, float, bool, None]: - choices.append(re.escape(str(choice))) - elif type(choice) == str: - choices.append(f'"{re.escape(choice)}"') - - return f"({'|'.join(choices)})" - - elif "$ref" in instance: - path = f"{instance['$ref']}" - instance = resolver.lookup(path).contents - return to_regex(resolver, instance) - - # The type keyword may either be a string or an array: - # - If it's a string, it is the name of one of the basic types. - # - If it is an array, it must be an array of strings, where each string is - # the name of one of the basic types, and each element is unique. In this - # case, the JSON snippet is valid if it matches any of the given types. - elif "type" in instance: - instance_type = instance["type"] - if instance_type == "string": - if "maxLength" in instance or "minLength" in instance: - max_items = instance.get("maxLength", "") - min_items = instance.get("minLength", "") - try: - if int(max_items) < int(min_items): + + class Path(str): + pass + + class Regex(str): + pass + + definitions: Dict[str, Union[Path, Regex]] = { + name: Regex(regex) for name, regex in DEFINITIONS.items() + } + + def go(instance: Schema) -> str: + if isinstance(instance, bool): + if instance: + # True means any JSON object is valid + return JSON_VALUE + else: + # False means no JSON object is valid + return r"" + + if instance == {}: + # Empty object means any JSON object is valid + return JSON_VALUE + + if "properties" in instance: + regex = "" + regex += r"\{" + for i, (name, value) in enumerate(instance["properties"].items()): + regex += f'{WHITESPACE}"{name}"{WHITESPACE}:{WHITESPACE}' + regex += go(value) + + # No comma after the last key-value pair in JSON + if i < len(instance["properties"]) - 1: + regex += f"{WHITESPACE}," + + regex += f"{WHITESPACE}" + r"\}" + + return regex + + # To validate against allOf, the given data must be valid against all of the + # given subschemas. + elif "allOf" in instance: + subregexes = [go(t) for t in instance["allOf"]] + subregexes_str = [f"{subregex}" for subregex in subregexes] + return rf"({''.join(subregexes_str)})" + + # To validate against `anyOf`, the given data must be valid against + # any (one or more) of the given subschemas. + elif "anyOf" in instance: + subregexes = [go(t) for t in instance["anyOf"]] + combinations = [ + "(" + "".join(c) + ")" + for r in range(1, len(subregexes) + 1) + for c in it.permutations(subregexes, r) + ] + + return rf"({'|'.join(combinations)})" + + # To validate against oneOf, the given data must be valid against exactly + # one of the given subschemas. + elif "oneOf" in instance: + subregexes = [go(t) for t in instance["oneOf"]] + + xor_patterns = [] + # json schema validation ensured there is no overlapping schemas in oneOf + for subregex in subregexes: + other_subregexes = filter(lambda r: r != subregex, subregexes) + other_subregexes_str = "|".join([f"{s}" for s in other_subregexes]) + negative_lookahead = f"(?!.*({other_subregexes_str}))" + xor_patterns.append(f"({subregex}){negative_lookahead}") + + return rf"({'|'.join(xor_patterns)})" + + # The enum keyword is used to restrict a value to a fixed set of values. It + # must be an array with at least one element, where each element is unique. + elif "enum" in instance: + choices = [] + for choice in instance["enum"]: + if type(choice) in [int, float, bool, None]: + choices.append(re.escape(str(choice))) + elif type(choice) == str: + choices.append(f'"{re.escape(choice)}"') + + return f"({'|'.join(choices)})" + + elif "$ref" in instance: + path = f"{instance['$ref']}" + name = re.escape(path.replace("/", "_").replace("#", "").replace("$", "_")) + assert resolver is not None, "Cannot resolve references without a resolver" + if name not in definitions: + definitions[name] = Path(path) + return f"(?&{name})" + + # The type keyword may either be a string or an array: + # - If it's a string, it is the name of one of the basic types. + # - If it is an array, it must be an array of strings, where each string is + # the name of one of the basic types, and each element is unique. In this + # case, the JSON snippet is valid if it matches any of the given types. + elif "type" in instance: + instance_type = instance["type"] + if instance_type == "string": + if "maxLength" in instance or "minLength" in instance: + max_items = instance.get("maxLength", "") + min_items = instance.get("minLength", "") + try: + if int(max_items) < int(min_items): + raise ValueError( + "maxLength must be greater than or equal to minLength" + ) + except ValueError: + pass + return f'"{STRING_INNER}{{{min_items},{max_items}}}"' + elif "pattern" in instance: + pattern = instance["pattern"] + if pattern[0] == "^" and pattern[-1] == "$": + return rf'(^"{pattern[1:-1]}"$)' + else: + return rf'("{pattern}")' + else: + return type_to_regex["string"] + + elif instance_type == "number": + return type_to_regex["number"] + + elif instance_type == "integer": + return type_to_regex["integer"] + + elif instance_type == "array": + if "items" in instance: + items_regex = go(instance["items"]) + else: + # Here we need to make the choice to exclude generating list of objects + # if the specification of the object is not given, even though a JSON + # object that contains an object here would be valid under the specification. + types = [ + {"type": "boolean"}, + {"type": "null"}, + {"type": "number"}, + {"type": "integer"}, + {"type": "string"}, + ] + items_regex = rf"({'|'.join(go(t) for t in types)})" + + min_items = instance.get("minItems") + min_items = int(min_items) if min_items is not None else 0 + max_items = instance.get("maxItems") + max_items = int(max_items) if max_items is not None else None + + if min_items == 0 and max_items is None: + middle = rf"({items_regex}(,{items_regex})*)?" + + elif min_items > 0 and max_items is None: + middle = ( + rf"{items_regex}(,{items_regex})" + + r"{" + + rf"{min_items-1}," + + r"}" + ) + + elif min_items == 0 and max_items is not None: + if max_items == 0: + middle = r"" + else: + middle = ( + rf"({items_regex}(,{items_regex})" + + r"{" + + rf"0,{max_items-1}" + + r"})?" + ) + + elif min_items > 0 and max_items is not None: + if max_items >= min_items: + middle = ( + rf"{items_regex}(,{items_regex})" + + r"{" + + rf"{min_items-1},{max_items-1}" + + r"}" + ) + else: raise ValueError( - "maxLength must be greater than or equal to minLength" + "max_items must be greater than or equal to min_items" ) - except ValueError: - pass - return f'"{STRING_INNER}{{{min_items},{max_items}}}"' - elif "pattern" in instance: - pattern = instance["pattern"] - if pattern[0] == "^" and pattern[-1] == "$": - return rf'(^"{pattern[1:-1]}"$)' + else: - return rf'("{pattern}")' - else: - return type_to_regex["string"] + raise ValueError("min_items must be greater than or equal to 0") - elif instance_type == "number": - return type_to_regex["number"] + return rf"\[{middle}\]" - elif instance_type == "integer": - return type_to_regex["integer"] + elif instance_type == "boolean": + return type_to_regex["boolean"] - elif instance_type == "array": - min_items = instance.get("minItems", "0") - max_items = instance.get("maxItems", "") - if min_items == max_items: - num_repeats = "{" + str(int(min_items) - 1) + "}" - else: - num_repeats = "*" + elif instance_type == "null": + return type_to_regex["null"] - if "items" in instance: - items_regex = to_regex(resolver, instance["items"]) - return rf"\[({items_regex})(,({items_regex})){num_repeats}\]" - else: - # Here we need to make the choice to exclude generating list of objects - # if the specification of the object is not given, even though a JSON + elif isinstance(instance_type, list): + # Here we need to make the choice to exclude generating an object + # if the specification of the object is not give, even though a JSON # object that contains an object here would be valid under the specification. - types = [ - {"type": "boolean"}, - {"type": "null"}, - {"type": "number"}, - {"type": "integer"}, - {"type": "string"}, - ] - regexes = [to_regex(resolver, t) for t in types] - return ( - rf"\[({'|'.join(regexes)})(,({'|'.join(regexes)})){num_repeats}\]" - ) - - elif instance_type == "boolean": - return type_to_regex["boolean"] - - elif instance_type == "null": - return type_to_regex["null"] - - elif isinstance(instance_type, list): - # Here we need to make the choice to exclude generating an object - # if the specification of the object is not give, even though a JSON - # object that contains an object here would be valid under the specification. - regexes = [ - to_regex(resolver, {"type": t}) for t in instance_type if t != "object" - ] - return rf"({'|'.join(regexes)})" + regexes = [go({"type": t}) for t in instance_type if t != "object"] + return rf"({'|'.join(regexes)})" + + raise NotImplementedError( + f"""Could not translate the instance {instance} to a + regular expression. Make sure it is valid to the JSON Schema specification. If + it is, please open an issue on the Outlines repository""" + ) + + definitions["__self__"] = Regex(go(instance)) + + while any(isinstance(v, Path) for v in definitions.values()): + for name, value in definitions.items(): + if isinstance(value, Path): + assert ( + resolver is not None + ), "Cannot resolve references without a resolver" + definitions[name] = Regex(go(resolver.lookup(value).contents)) + + regex = r"(?:" + for name, value in definitions.items(): + assert isinstance(value, Regex) + regex += rf"(?P<{name}>{value})" + regex += r"){0}" + regex += r"(?&__self__)" - raise NotImplementedError( - f"""Could not translate the instance {instance} to a - regular expression. Make sure it is valid to the JSON Schema specification. If - it is, please open an issue on the Outlines repository""" - ) + return regex -def get_schema_from_signature(fn: Callable) -> str: +def get_schema_from_signature(fn: Callable) -> Dict[str, Any]: """Turn a function signature into a JSON schema. Every JSON object valid to the output JSON Schema can be passed diff --git a/tests/fsm/test_json_schema.py b/tests/fsm/test_json_schema.py index 454d120c7..362902c50 100644 --- a/tests/fsm/test_json_schema.py +++ b/tests/fsm/test_json_schema.py @@ -1,17 +1,19 @@ import json -import re from typing import List import pytest +import regex as re from pydantic import BaseModel, constr from outlines.fsm.json_schema import ( BOOLEAN, INTEGER, + JSON_VALUE, NULL, NUMBER, STRING, STRING_INNER, + WHITESPACE, build_regex_from_object, get_schema_from_signature, to_regex, @@ -47,8 +49,7 @@ class User(BaseModel): value: float is_true: bool - schema = json.dumps(User.model_json_schema()) - schedule = build_regex_from_object(schema) + schedule = build_regex_from_object(User) assert isinstance(schedule, str) @@ -57,7 +58,7 @@ class User(BaseModel): [ ({"integer": "0"}, True), ({"integer": "1"}, True), - ({"integer": "-1"}, False), + ({"integer": "-1"}, True), ({"integer": "01"}, False), ({"integer": "1.3"}, False), ({"integer": "t"}, False), @@ -66,11 +67,12 @@ class User(BaseModel): def test_match_integer(pattern, does_match): step = {"title": "Foo", "type": "integer"} regex = to_regex(None, step) - assert regex == INTEGER + assert regex.endswith(INTEGER) value = pattern["integer"] match = re.fullmatch(regex, value) if does_match: + assert match is not None assert match[0] == value assert match.span() == (0, len(value)) else: @@ -86,18 +88,20 @@ def test_match_integer(pattern, does_match): ({"number": ".3"}, False), ({"number": "1.3"}, True), ({"number": "-1.3"}, True), - ({"number": "1.3e9"}, False), + ({"number": "1.3e9"}, True), ({"number": "1.3e+9"}, True), ], ) def test_match_number(pattern, does_match): - step = {"title": "Foo", "type": "number"} - regex = to_regex(None, step) - assert regex == NUMBER + schema = {"title": "Foo", "type": "number"} + regex = to_regex(None, schema) + assert regex.endswith(NUMBER) + print(regex) value = pattern["number"] match = re.fullmatch(regex, value) if does_match: + assert match is not None assert match[0] == value assert match.span() == (0, len(value)) else: @@ -105,42 +109,68 @@ def test_match_number(pattern, does_match): @pytest.mark.parametrize( - "schema,regex,examples", + "schema,definitions,examples", [ + # Empty schema + ( + {}, + {"__self__": rf"{JSON_VALUE}"}, + [ + ("null", True), + ("true", True), + ("false", True), + ("0", True), + ('{"foo": "bar"}', True), + ('["foo", "bar"]', True), + ('{"foo"}', False), + ("", False), + ("1.3", True), + ('"foo"', True), + ("[]", True), + ("[,]", False), + ("{}", True), + ("[1,2]", True), + ("[1,2,]", False), + ('{"foo": "bar", "spam": "eggs"}', True), + ('{"foo": "bar", "spam": "eggs",}', False), + ('{"foo": "bar", "spam": {"eggs": "ham"}}', True), + ('{"foo": "bar", "spam": {"eggs": "ham",}}', False), + ], + ), # String ( {"title": "Foo", "type": "string"}, - STRING, + {"__self__": STRING}, [("unquotedstring", False), ('"quoted_string"', True)], ), # String with maximum length ( {"title": "Foo", "type": "string", "maxLength": 3}, - f'"{STRING_INNER}{{,3}}"', + {"__self__": rf'"{STRING_INNER}{{,3}}"'}, [('"ab"', True), ('"a""', False), ('"abcd"', False)], ), # String with minimum length ( {"title": "Foo", "type": "string", "minLength": 3}, - f'"{STRING_INNER}{{3,}}"', + {"__self__": rf'"{STRING_INNER}{{3,}}"'}, [('"ab"', False), ('"abcd"', True), ('"abc""', False)], ), # String with both minimum and maximum length ( {"title": "Foo", "type": "string", "minLength": 3, "maxLength": 5}, - f'"{STRING_INNER}{{3,5}}"', + {"__self__": rf'"{STRING_INNER}{{3,5}}"'}, [('"ab"', False), ('"abcd"', True), ('"abcdef""', False)], ), # String defined by a regular expression ( {"title": "Foo", "type": "string", "pattern": r"^[a-z]$"}, - r'(^"[a-z]"$)', + {"__self__": r'(^"[a-z]"$)'}, [('"a"', True), ('"1"', False)], ), # Boolean ( {"title": "Foo", "type": "boolean"}, - BOOLEAN, + {"__self__": BOOLEAN}, [ ("true", True), ("false", True), @@ -151,7 +181,7 @@ def test_match_number(pattern, does_match): # Null ( {"title": "Foo", "type": "null"}, - NULL, + {"__self__": NULL}, [ ("null", True), ("true", False), @@ -161,19 +191,19 @@ def test_match_number(pattern, does_match): # Enum string ( {"title": "Foo", "enum": ["Marc", "Jean"], "type": "string"}, - '("Marc"|"Jean")', + {"__self__": r'("Marc"|"Jean")'}, [('"Marc"', True), ('"Jean"', True), ('"John"', False)], ), # Make sure strings are escaped ( {"title": "Foo", "enum": [".*", r"\s*"], "type": "string"}, - r'("\.\*"|"\\s\*")', + {"__self__": r'("\.\*"|"\\s\*")'}, [('".*"', True), (r'"\s*"', True), (r'"\.\*"', False)], ), # Enum integer ( {"title": "Foo", "enum": [0, 1], "type": "integer"}, - "(0|1)", + {"__self__": r"(0|1)"}, [("0", True), ("1", True), ("a", False)], ), # integer @@ -183,14 +213,38 @@ def test_match_number(pattern, does_match): "type": "object", "properties": {"count": {"title": "Count", "type": "integer"}}, }, - '\\{[\\n ]*"count"[\\n ]*:[\\n ]*(0|[1-9][0-9]*)[\\n ]*\\}', + { + "__self__": rf'\{{{WHITESPACE}"count"{WHITESPACE}:{WHITESPACE}{INTEGER}{WHITESPACE}\}}' + }, [('{\n "count": 100\n}', True)], ), # array ( {"title": "Foo", "type": "array", "items": {"type": "number"}}, - rf"\[({NUMBER})(,({NUMBER}))*\]", - [("[1e+9,1.3]", True)], + {"__self__": rf"\[({NUMBER}(,{NUMBER})*)?\]"}, + [("[1e+9,1.3]", True), ("[1e+9,1.3,]", False), ("[]", True)], + ), + # array with a minimum length of 1 + ( + { + "title": "Foo", + "type": "array", + "items": {"type": "integer"}, + "minItems": 1, + }, + {"__self__": rf"\[{INTEGER}(,{INTEGER}){{0,}}\]"}, + [("[1]", True), ("[]", False), ("[1,2]", True)], + ), + # array with a maximum length of 1 + ( + { + "title": "Foo", + "type": "array", + "items": {"type": "integer"}, + "maxItems": 1, + }, + {"__self__": rf"\[({INTEGER}(,{INTEGER}){{0,0}})?\]"}, + [("[1]", True), ("[]", True), ("[1,2]", False)], ), # array with a set length of 1 ( @@ -201,7 +255,7 @@ def test_match_number(pattern, does_match): "minItems": 1, "maxItems": 1, }, - rf"\[({INTEGER})(,({INTEGER})){{0}}\]", + {"__self__": rf"\[{INTEGER}(,{INTEGER}){{0,0}}\]"}, [("[1]", True), ("[1,2]", False), ('["a"]', False), ("[]", False)], ), # array with a set length greather than 1 @@ -213,16 +267,30 @@ def test_match_number(pattern, does_match): "minItems": 3, "maxItems": 3, }, - rf"\[({INTEGER})(,({INTEGER})){{2}}\]", + {"__self__": rf"\[{INTEGER}(,{INTEGER}){{2,2}}\]"}, [("[1]", False), ("[]", False), ("[1,2,3]", True), ("[1,2,3,4]", False)], ), + # array with a length between 1 and 3 + ( + { + "title": "Foo", + "type": "array", + "items": {"type": "integer"}, + "minItems": 1, + "maxItems": 3, + }, + {"__self__": rf"\[{INTEGER}(,{INTEGER}){{0,2}}\]"}, + [("[1]", True), ("[]", False), ("[1,2,3]", True), ("[1,2,3,4]", False)], + ), # oneOf ( { "title": "Foo", "oneOf": [{"type": "string"}, {"type": "number"}, {"type": "boolean"}], }, - rf"(({STRING})(?!.*({NUMBER}|{BOOLEAN}))|({NUMBER})(?!.*({STRING}|{BOOLEAN}))|({BOOLEAN})(?!.*({STRING}|{NUMBER})))", + { + "__self__": rf"(({STRING})(?!.*({NUMBER}|{BOOLEAN}))|({NUMBER})(?!.*({STRING}|{BOOLEAN}))|({BOOLEAN})(?!.*({STRING}|{NUMBER})))" + }, [ ("12.3", True), ("true", True), @@ -240,7 +308,9 @@ def test_match_number(pattern, does_match): "title": "Foo", "anyOf": [{"type": "string"}, {"type": "integer"}], }, - r'(("(?:[^"\\\x00-\x1f\x7f-\x9f]|\\.)*")|((0|[1-9][0-9]*))|("(?:[^"\\\x00-\x1f\x7f-\x9f]|\\.)*"(0|[1-9][0-9]*))|((0|[1-9][0-9]*)"(?:[^"\\\x00-\x1f\x7f-\x9f]|\\.)*"))', + { + "__self__": rf"(({STRING})|({INTEGER})|({STRING}{INTEGER})|({INTEGER}{STRING}))" + }, [("12", True), ('"a"', True), ('1"a"', True)], ), # allOf @@ -249,7 +319,7 @@ def test_match_number(pattern, does_match): "title": "Foo", "allOf": [{"type": "string"}, {"type": "integer"}], }, - rf"({STRING}{INTEGER})", + {"__self__": rf"({STRING}{INTEGER})"}, [('"a"1', True), ('"a"', False), ('"1"', False)], ), # Nested schema @@ -265,7 +335,9 @@ def test_match_number(pattern, does_match): } }, }, - f'\\{{[\\n ]*"fuzz"[\\n ]*:[\\n ]*\\{{[\\n ]*"spam"[\\n ]*:[\\n ]*{INTEGER}[\\n ]*\\}}[\\n ]*\\}}', + { + "__self__": rf'\{{{WHITESPACE}"fuzz"{WHITESPACE}:{WHITESPACE}\{{{WHITESPACE}"spam"{WHITESPACE}:{WHITESPACE}{INTEGER}{WHITESPACE}\}}{WHITESPACE}\}}' + }, [('{\n "fuzz": {\n "spam": 100\n }\n}', True)], ), # Schema with a reference @@ -280,7 +352,10 @@ def test_match_number(pattern, does_match): }, "required": ["user_id", "name"], }, - f'\\{{[\\n ]*"user_id"[\\n ]*:[\\n ]*{INTEGER}[\\n ]*,[\\n ]*"name"[\\n ]*:[\\n ]*{STRING}[\\n ]*,[\\n ]*"a"[\\n ]*:[\\n ]*{STRING}[\\n ]*\\}}', + { + "_properties_name": "(?&__string__)", + "__self__": rf'\{{{WHITESPACE}"user_id"{WHITESPACE}:{WHITESPACE}{INTEGER}{WHITESPACE},{WHITESPACE}"name"{WHITESPACE}:{WHITESPACE}{STRING}{WHITESPACE},{WHITESPACE}"a"{WHITESPACE}:{WHITESPACE}(?&_properties_name){WHITESPACE}\}}', + }, [('{"user_id": 100, "name": "John", "a": "Marc"}', True)], ), ( @@ -295,7 +370,10 @@ def test_match_number(pattern, does_match): }, "required": ["user_id", "name"], }, - f'\\{{[\\n ]*"user_id"[\\n ]*:[\\n ]*{INTEGER}[\\n ]*,[\\n ]*"name"[\\n ]*:[\\n ]*{STRING}[\\n ]*,[\\n ]*"name2"[\\n ]*:[\\n ]*{STRING}[\\n ]*\\}}', + { + "__defs_name": "(?&__string__)", + "__self__": rf'\{{{WHITESPACE}"user_id"{WHITESPACE}:{WHITESPACE}{INTEGER}{WHITESPACE},{WHITESPACE}"name"{WHITESPACE}:{WHITESPACE}{STRING}{WHITESPACE},{WHITESPACE}"name2"{WHITESPACE}:{WHITESPACE}(?&__defs_name){WHITESPACE}\}}', + }, [('{"user_id": 100, "name": "John", "name2": "Marc"}', True)], ), ( @@ -335,7 +413,10 @@ def test_match_number(pattern, does_match): } }, }, - f'\\{{[\\n ]*"name"[\\n ]*:[\\n ]*{STRING}[\\n ]*,[\\n ]*"last_name"[\\n ]*:[\\n ]*{STRING}[\\n ]*,[\\n ]*"address"[\\n ]*:[\\n ]*\\{{[\\n ]*"city"[\\n ]*:[\\n ]*{STRING}[\\n ]*\\}}[\\n ]*\\}}', + { + "customer__defs_address": rf'\{{{WHITESPACE}"city"{WHITESPACE}:{WHITESPACE}{STRING}{WHITESPACE}\}}', + "__self__": rf'\{{{WHITESPACE}"name"{WHITESPACE}:{WHITESPACE}{STRING}{WHITESPACE},{WHITESPACE}"last_name"{WHITESPACE}:{WHITESPACE}{STRING}{WHITESPACE},{WHITESPACE}"address"{WHITESPACE}:{WHITESPACE}(?&customer__defs_address){WHITESPACE}\}}', + }, [ ( '{"name": "John", "last_name": "Doe", "address": {"city": "Paris"}}', @@ -343,16 +424,40 @@ def test_match_number(pattern, does_match): ) ], ), + # Recursive schema + ( + { + "$id": "tree", + "title": "Rose Tree", + "type": "object", + "properties": { + "value": {"type": "integer"}, + "children": {"type": "array", "items": {"$ref": "tree"}}, + }, + }, + { + "tree": rf'\{{{WHITESPACE}"value"{WHITESPACE}:{WHITESPACE}{INTEGER}{WHITESPACE},{WHITESPACE}"children"{WHITESPACE}:{WHITESPACE}\[((?&tree)(,(?&tree))*)?\]{WHITESPACE}\}}', + "__self__": rf'\{{{WHITESPACE}"value"{WHITESPACE}:{WHITESPACE}{INTEGER}{WHITESPACE},{WHITESPACE}"children"{WHITESPACE}:{WHITESPACE}\[((?&tree)(,(?&tree))*)?\]{WHITESPACE}\}}', + }, + [ + ( + '{"value": 1, "children": [{"value": 2, "children": []}]}', + True, + ) + ], + ), ], ) -def test_match(schema, regex, examples): +def test_match(schema, definitions, examples): schema = json.dumps(schema) test_regex = build_regex_from_object(schema) - assert test_regex == regex + for name, value in definitions.items(): + assert f"(?P<{name}>{value})" in test_regex for string, does_match in examples: match = re.fullmatch(test_regex, string) if does_match: + assert match is not None assert match[0] == string assert match.span() == (0, len(string)) else: