Skip to content

Commit

Permalink
Merge pull request #511 from python-jsonschema/use-regress-for-patterns
Browse files Browse the repository at this point in the history
Use `regress` to implement JS regex usage for `pattern` and `patternProperties` + use unicode mode regexes by default
  • Loading branch information
sirosen authored Jan 8, 2025
2 parents 28714ff + 4414601 commit 8da1fef
Show file tree
Hide file tree
Showing 21 changed files with 449 additions and 97 deletions.
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
exclude = .git,.tox,__pycache__,dist,.venv*,docs,build
max-line-length = 90
# black related: W503/W504 conflict, black causes E203
ignore = W503,W504,E203,B019
extend-ignore = W503,W504,E203,B019
2 changes: 1 addition & 1 deletion .pre-commit-hooks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
- id: check-azure-pipelines
name: Validate Azure Pipelines
description: 'Validate Azure Pipelines config against the schema provided by Microsoft'
entry: check-jsonschema --builtin-schema vendor.azure-pipelines --data-transform azure-pipelines
entry: check-jsonschema --builtin-schema vendor.azure-pipelines --data-transform azure-pipelines --regex-variant nonunicode
language: python
files: ^(\.)?azure-pipelines\.(yml|yaml)$
types: [yaml]
Expand Down
14 changes: 13 additions & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,21 @@ Unreleased
----------

.. vendor-insert-here
- Update vendored schemas (2024-12-22)
- Drop support for Python 3.8
- Rename ``--format-regex`` to ``--regex-variant`` and convert
``--format-regex`` to a deprecated alias.
The deprecated ``--format-regex`` alias will be removed in a future release.
- Regular expression interpretation in ``"pattern"``, ``"patternProperties"``, and
``"format": "regex"`` usages now uses unicode-mode JS regular expressions by
default. (:issue:`353`)

- Use ``--regex-variant nonunicode`` to get non-unicode JS regular
expressions, the default behavior from previous versions.
- Custom validators may be impacted by the new regular expression
features. Validators are now always modified with the ``jsonschema``
library's ``extend()`` API to control the ``pattern`` and
``patternProperties`` keywords.

0.30.0
------
Expand Down
13 changes: 8 additions & 5 deletions docs/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -183,11 +183,12 @@ Example usage:
# disables all three of time, date-time, and iri
--disable-formats time,date-time --disable-formats iri
``--format-regex``
``--regex-variant``
~~~~~~~~~~~~~~~~~~~

Set a mode for handling of the ``"regex"`` value for ``"format"``. The modes are as
follows:
Set a mode for handling of the ``"regex"`` value for ``"format"`` and the mode
for ``"pattern"`` and ``"patternProperties"`` interpretation.
The modes are as follows:

.. list-table:: Regex Options
:widths: 15 30
Expand All @@ -196,9 +197,11 @@ follows:
* - mode
- description
* - default
- Require the regex to be valid in ECMAScript regex syntax.
- Use ECMAScript regex syntax.
* - nonunicode
- Use ECMAScript regex syntax, but without unicode escapes enabled.
* - python
- Require the regex to be valid in Python regex syntax.
- Use Python regex syntax.

Other Options
--------------
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ dependencies = [
'tomli>=2.0;python_version<"3.11"',
"ruamel.yaml==0.18.7",
"jsonschema>=4.18.0,<5.0",
"regress>=0.4.0",
"regress>=2024.11.1",
"requests<3.0",
"click>=8,<9",
]
Expand Down
7 changes: 6 additions & 1 deletion src/check_jsonschema/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,12 @@ def _githubusercontent_url(owner: str, repo: str, ref: str, path: str) -> str:
"Validate Azure Pipelines config against the schema provided "
"by Microsoft"
),
"add_args": ["--data-transform", "azure-pipelines"],
"add_args": [
"--data-transform",
"azure-pipelines",
"--regex-variant",
"nonunicode",
],
"files": r"^(\.)?azure-pipelines\.(yml|yaml)$",
"types": "yaml",
},
Expand Down
11 changes: 7 additions & 4 deletions src/check_jsonschema/checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from .formats import FormatOptions
from .instance_loader import InstanceLoader
from .parsers import ParseError
from .regex_variants import RegexImplementation
from .reporter import Reporter
from .result import CheckResult
from .schema_loader import SchemaLoaderBase, SchemaParseError, UnsupportedUrlScheme
Expand All @@ -28,15 +29,17 @@ def __init__(
instance_loader: InstanceLoader,
reporter: Reporter,
*,
format_opts: FormatOptions | None = None,
format_opts: FormatOptions,
regex_impl: RegexImplementation,
traceback_mode: str = "short",
fill_defaults: bool = False,
) -> None:
self._schema_loader = schema_loader
self._instance_loader = instance_loader
self._reporter = reporter

self._format_opts = format_opts if format_opts is not None else FormatOptions()
self._format_opts = format_opts
self._regex_impl = regex_impl
self._traceback_mode = traceback_mode
self._fill_defaults = fill_defaults

Expand All @@ -51,12 +54,12 @@ def get_validator(
) -> jsonschema.protocols.Validator:
try:
return self._schema_loader.get_validator(
path, doc, self._format_opts, self._fill_defaults
path, doc, self._format_opts, self._regex_impl, self._fill_defaults
)
except SchemaParseError as e:
self._fail("Error: schemafile could not be parsed as JSON", e)
except jsonschema.SchemaError as e:
self._fail(f"Error: schemafile was not valid: {e}\n", e)
self._fail("Error: schemafile was not valid\n", e)
except UnsupportedUrlScheme as e:
self._fail(f"Error: {e}\n", e)
except Exception as e:
Expand Down
32 changes: 22 additions & 10 deletions src/check_jsonschema/cli/main_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,10 @@

from ..catalog import CUSTOM_SCHEMA_NAMES, SCHEMA_CATALOG
from ..checker import SchemaChecker
from ..formats import KNOWN_FORMATS, RegexVariantName
from ..formats import KNOWN_FORMATS
from ..instance_loader import InstanceLoader
from ..parsers import SUPPORTED_FILE_FORMATS
from ..regex_variants import RegexImplementation, RegexVariantName
from ..reporter import REPORTER_BY_NAME, Reporter
from ..schema_loader import (
BuiltinSchemaLoader,
Expand Down Expand Up @@ -68,10 +69,11 @@ def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str:
date, date-time, email, ipv4, ipv6, regex, uuid
\b
For the "regex" format, there are multiple modes which can be specified with
'--format-regex':
default | check that the string is a valid ECMAScript regex
python | check that the string is a valid python regex
For handling of regexes, there are multiple modes which can be specified with
'--regex-variant':
default | use ECMAScript regex syntax (via regress)
nonunicode | use ECMAScript regex syntax, but in non-unicode mode (via regress)
python | use python regex syntax
\b
The '--builtin-schema' flag supports the following schema names:
Expand Down Expand Up @@ -138,11 +140,18 @@ def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str:
)
@click.option(
"--format-regex",
hidden=True,
help="Legacy name for `--regex-variant`.",
default=None,
type=click.Choice([x.value for x in RegexVariantName], case_sensitive=False),
)
@click.option(
"--regex-variant",
help=(
"Set the mode of format validation for regexes. "
"If `--disable-formats regex` is used, this option has no effect."
"Name of which regex dialect should be used for format checking "
"and 'pattern' matching."
),
default=RegexVariantName.default.value,
default=None,
type=click.Choice([x.value for x in RegexVariantName], case_sensitive=False),
)
@click.option(
Expand Down Expand Up @@ -230,7 +239,8 @@ def main(
no_cache: bool,
cache_filename: str | None,
disable_formats: tuple[list[str], ...],
format_regex: t.Literal["python", "default"],
format_regex: t.Literal["python", "nonunicode", "default"] | None,
regex_variant: t.Literal["python", "nonunicode", "default"] | None,
default_filetype: t.Literal["json", "yaml", "toml", "json5"],
traceback_mode: t.Literal["full", "short"],
data_transform: t.Literal["azure-pipelines", "gitlab-ci"] | None,
Expand All @@ -243,6 +253,8 @@ def main(
) -> None:
args = ParseResult()

args.set_regex_variant(regex_variant, legacy_opt=format_regex)

args.set_schema(schemafile, builtin_schema, check_metaschema)
args.set_validator(validator_class)

Expand All @@ -257,7 +269,6 @@ def main(
else:
args.disable_formats = normalized_disable_formats

args.format_regex = RegexVariantName(format_regex)
args.disable_cache = no_cache
args.default_filetype = default_filetype
args.fill_defaults = fill_defaults
Expand Down Expand Up @@ -318,6 +329,7 @@ def build_checker(args: ParseResult) -> SchemaChecker:
instance_loader,
reporter,
format_opts=args.format_opts,
regex_impl=RegexImplementation(args.regex_variant),
traceback_mode=args.traceback_mode,
fill_defaults=args.fill_defaults,
)
Expand Down
19 changes: 16 additions & 3 deletions src/check_jsonschema/cli/parse_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
import click
import jsonschema

from ..formats import FormatOptions, RegexVariantName
from ..formats import FormatOptions
from ..regex_variants import RegexImplementation, RegexVariantName
from ..transforms import Transform


Expand Down Expand Up @@ -36,12 +37,24 @@ def __init__(self) -> None:
# regex format options
self.disable_all_formats: bool = False
self.disable_formats: tuple[str, ...] = ()
self.format_regex: RegexVariantName = RegexVariantName.default
self.regex_variant: RegexVariantName = RegexVariantName.default
# error and output controls
self.verbosity: int = 1
self.traceback_mode: str = "short"
self.output_format: str = "text"

def set_regex_variant(
    self,
    variant_opt: t.Literal["python", "nonunicode", "default"] | None,
    *,
    legacy_opt: t.Literal["python", "nonunicode", "default"] | None = None,
) -> None:
    """Store the selected regex variant on this parse result.

    ``variant_opt`` wins whenever it is set; ``legacy_opt`` is only
    consulted as a fallback when ``variant_opt`` is falsy. If neither
    value is provided, ``self.regex_variant`` is left unchanged.
    """
    chosen = variant_opt if variant_opt else legacy_opt
    if not chosen:
        # nothing was passed via either option: keep the current value
        return
    self.regex_variant = RegexVariantName(chosen)

def set_schema(
self, schemafile: str | None, builtin_schema: str | None, check_metaschema: bool
) -> None:
Expand Down Expand Up @@ -82,7 +95,7 @@ def set_validator(
@property
def format_opts(self) -> FormatOptions:
return FormatOptions(
regex_impl=RegexImplementation(self.regex_variant),
enabled=not self.disable_all_formats,
regex_variant=self.format_regex,
disabled_formats=self.disable_formats,
)
60 changes: 21 additions & 39 deletions src/check_jsonschema/formats/__init__.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
from __future__ import annotations

import copy
import enum
import re
import typing as t

import jsonschema
import jsonschema.validators
import regress

from ..regex_variants import RegexImplementation
from .implementations import validate_rfc3339, validate_time

# all known format strings except for a selection from draft3 which have either
Expand Down Expand Up @@ -39,42 +36,16 @@
)


class RegexVariantName(enum.Enum):
    """Names of the regex variants that can be selected for regex checking."""

    # checked by constructing a `regress.Regex` (see RegexImplementation)
    default = "default"
    # checked with the stdlib `re.compile` (see RegexImplementation)
    python = "python"


class RegexImplementation:
    """Checker for the ``"regex"`` format under a chosen regex variant."""

    def __init__(self, variant: RegexVariantName) -> None:
        self.variant = variant

    def check_format(self, instance: t.Any) -> bool:
        """Return whether ``instance`` is acceptable as a regex.

        Non-string instances always pass. A string passes when it can be
        compiled by ``regress`` (for the default variant) or by ``re``
        (for any other variant).
        """
        if isinstance(instance, str):
            try:
                if self.variant != RegexVariantName.default:
                    re.compile(instance)
                else:
                    regress.Regex(instance)
            # RegressError does not appear in the published types of regress;
            # that still needs investigation, so the type error is ignored
            except (regress.RegressError, re.error):  # type: ignore[attr-defined]
                return False
        return True


class FormatOptions:
def __init__(
self,
*,
regex_impl: RegexImplementation,
enabled: bool = True,
regex_variant: RegexVariantName = RegexVariantName.default,
disabled_formats: tuple[str, ...] = (),
) -> None:
self.enabled = enabled
self.regex_variant = regex_variant
self.regex_impl = regex_impl
self.disabled_formats = disabled_formats


Expand All @@ -95,14 +66,10 @@ def make_format_checker(
if not opts.enabled:
return None

# copy the base checker
base_checker = get_base_format_checker(schema_dialect)
checker = copy.deepcopy(base_checker)
# customize around regex checking first
checker = format_checker_for_regex_impl(opts.regex_impl)

# replace the regex check
del checker.checkers["regex"]
regex_impl = RegexImplementation(opts.regex_variant)
checker.checks("regex")(regex_impl.check_format)
# add other custom format checks
checker.checks("date-time")(validate_rfc3339)
checker.checks("time")(validate_time)

Expand All @@ -113,3 +80,18 @@ def make_format_checker(
del checker.checkers[checkname]

return checker


def format_checker_for_regex_impl(
    regex_impl: RegexImplementation, schema_dialect: str | None = None
) -> jsonschema.FormatChecker:
    """Build a format checker whose ``"regex"`` check is ``regex_impl``.

    The checker returned by ``get_base_format_checker(schema_dialect)`` is
    deep-copied first, so the replacement below only affects the copy.
    """
    customized = copy.deepcopy(get_base_format_checker(schema_dialect))

    # swap the stock "regex" check for the implementation-specific one
    del customized.checkers["regex"]
    customized.checks("regex")(regex_impl.check_format)

    return customized
Loading

0 comments on commit 8da1fef

Please sign in to comment.