From 9389117f40fc158d0e15c3a99cdf184e55a3dc12 Mon Sep 17 00:00:00 2001
From: xxyzz
Date: Tue, 5 Dec 2023 15:24:44 +0800
Subject: [PATCH] Add a script to create all JSON schema files from pydantic models

---
 json_schema/generate_schema.py         | 40 ++++++++++++++++++++++++++
 src/wiktextract/extractor/es/models.py | 28 ++++--------------
 src/wiktextract/extractor/ru/models.py | 31 ++++----------------
 src/wiktextract/import_utils.py        |  1 +
 4 files changed, 52 insertions(+), 48 deletions(-)
 create mode 100644 json_schema/generate_schema.py

diff --git a/json_schema/generate_schema.py b/json_schema/generate_schema.py
new file mode 100644
index 000000000..edfaa4715
--- /dev/null
+++ b/json_schema/generate_schema.py
@@ -0,0 +1,40 @@
+import importlib
+import json
+from importlib.resources import files
+
+
+def main() -> None:
+    """
+    Run this script at the project root folder to write a JSON schema file
+    `json_schema/<lang_code>.json` for each extractor whose `models.py` file
+    defines the pydantic model `WordEntry`.
+    """
+
+    extractors_root = files("wiktextract") / "extractor"
+    for extractor_folder in filter(
+        lambda p: p.is_dir(), extractors_root.iterdir()
+    ):
+        if (extractor_folder / "models.py").is_file():
+            lang_code = extractor_folder.name  # `Traversable` has no `stem`
+            model_module = importlib.import_module(
+                f"wiktextract.extractor.{lang_code}.models"
+            )
+            model_schema = model_module.WordEntry.model_json_schema()
+            model_schema["$id"] = f"https://kaikki.org/{lang_code}.json"
+            model_schema[
+                "$schema"
+            ] = "https://json-schema.org/draft/2020-12/schema"
+            with open(
+                f"json_schema/{lang_code}.json", "w", encoding="utf-8"
+            ) as f:
+                json.dump(
+                    model_schema,
+                    f,
+                    indent=2,
+                    ensure_ascii=False,
+                    sort_keys=True,
+                )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/wiktextract/extractor/es/models.py b/src/wiktextract/extractor/es/models.py
index 63d2649fb..209d00094 100644
--- a/src/wiktextract/extractor/es/models.py
+++ b/src/wiktextract/extractor/es/models.py
@@ -1,8 +1,6 @@
-import json from typing import Optional from pydantic import BaseModel, ConfigDict, Field -from pydantic.json_schema import GenerateJsonSchema class BaseModelWrap(BaseModel): @@ -99,7 +97,11 @@ class Sound(BaseModelWrap): class WordEntry(BaseModelWrap): - """WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract.""" + """ + WordEntry is a dictionary containing lexical information of a single word + extracted from Wiktionary with wiktextract. + """ + model_config = ConfigDict(title="Spanish Wiktionary") word: str = Field(description="word string") pos: str = Field(default=None, description="Part of speech type") @@ -117,23 +119,3 @@ class WordEntry(BaseModelWrap): ) sounds: Optional[list[Sound]] = [] spellings: Optional[list[Spelling]] = [] - - -if __name__ == "__main__": - - class JsonSchemaGenerator(GenerateJsonSchema): - def generate(self, schema, mode="validation"): - json_schema = super().generate(schema, mode=mode) - json_schema["title"] = "Spanish Wiktionary" - json_schema["$id"] = "https://kaikki.org/es.json" - json_schema["$schema"] = self.schema_dialect - return json_schema - - with open("json_schema/es.json", "w") as f: - json.dump( - WordEntry.model_json_schema(schema_generator=JsonSchemaGenerator), - f, - indent=2, - ensure_ascii=False, - sort_keys=True, - ) diff --git a/src/wiktextract/extractor/ru/models.py b/src/wiktextract/extractor/ru/models.py index 3ad7b06ac..cc9f0a1f5 100644 --- a/src/wiktextract/extractor/ru/models.py +++ b/src/wiktextract/extractor/ru/models.py @@ -1,7 +1,4 @@ -import json - -from pydantic import BaseModel, ConfigDict, Field, model_validator -from pydantic.json_schema import GenerateJsonSchema +from pydantic import BaseModel, ConfigDict, Field class BaseModelWrap(BaseModel): @@ -9,7 +6,11 @@ class BaseModelWrap(BaseModel): class WordEntry(BaseModelWrap): - """WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with 
wiktextract.""" + """ + WordEntry is a dictionary containing lexical information of a single + word extracted from Wiktionary with wiktextract. + """ + model_config = ConfigDict(title="Russian Wiktionary") word: str = Field(description="word string") pos: str = Field(default=None, description="Part of speech type") @@ -24,23 +25,3 @@ class WordEntry(BaseModelWrap): default=[], description="list of non-disambiguated categories for the word", ) - - -if __name__ == "__main__": - - class JsonSchemaGenerator(GenerateJsonSchema): - def generate(self, schema, mode="validation"): - json_schema = super().generate(schema, mode=mode) - json_schema["title"] = "Russian Wiktionary" - json_schema["$id"] = "https://kaikki.org/ru.json" - json_schema["$schema"] = self.schema_dialect - return json_schema - - with open("json_schema/ru.json", "w") as f: - json.dump( - WordEntry.model_json_schema(schema_generator=JsonSchemaGenerator), - f, - indent=2, - ensure_ascii=False, - sort_keys=True, - ) diff --git a/src/wiktextract/import_utils.py b/src/wiktextract/import_utils.py index ee3d24d09..bf71a2d81 100644 --- a/src/wiktextract/import_utils.py +++ b/src/wiktextract/import_utils.py @@ -1,4 +1,5 @@ import importlib +import importlib.util import types