Skip to content

Commit

Permalink
Merge pull request #416 from xxyzz/json_schema
Browse files Browse the repository at this point in the history
Add a script to create all JSON schema files from pydantic models
  • Loading branch information
xxyzz authored Dec 5, 2023
2 parents 805e5e9 + 9389117 commit e683ba5
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 48 deletions.
40 changes: 40 additions & 0 deletions json_schema/generate_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import importlib
import json
from importlib.resources import files


def main() -> None:
    """
    Generate a JSON schema file for every extractor that defines `WordEntry`.

    Run this script at the project root folder: each sub-folder of
    `wiktextract/extractor` that contains a `models.py` file is imported and
    the schema of its pydantic model `WordEntry` is written to
    `json_schema/{lang_code}.json`, where `lang_code` is the folder name.
    The `json_schema` folder must already exist in the current directory.
    """
    # Resolve the extractor package folder once, under its own name, so the
    # loop variable does not shadow it.
    extractors_root = files("wiktextract") / "extractor"
    for extractor_folder in extractors_root.iterdir():
        if not extractor_folder.is_dir():
            continue
        if not (extractor_folder / "models.py").is_file():
            continue
        # `name` belongs to the `Traversable` interface returned by
        # `files()`; `stem` only exists on `pathlib.Path` objects.
        lang_code = extractor_folder.name
        model_module = importlib.import_module(
            f"wiktextract.extractor.{lang_code}.models"
        )
        model_schema = model_module.WordEntry.model_json_schema()
        model_schema["$id"] = f"https://kaikki.org/{lang_code}.json"
        model_schema["$schema"] = (
            "https://json-schema.org/draft/2020-12/schema"
        )
        with open(
            f"json_schema/{lang_code}.json", "w", encoding="utf-8"
        ) as f:
            json.dump(
                model_schema,
                f,
                indent=2,
                ensure_ascii=False,
                sort_keys=True,
            )


if __name__ == "__main__":
    main()
28 changes: 5 additions & 23 deletions src/wiktextract/extractor/es/models.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import json
from typing import Optional

from pydantic import BaseModel, ConfigDict, Field
from pydantic.json_schema import GenerateJsonSchema


class BaseModelWrap(BaseModel):
Expand Down Expand Up @@ -99,7 +97,11 @@ class Sound(BaseModelWrap):


class WordEntry(BaseModelWrap):
"""WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract."""
"""
WordEntry is a dictionary containing lexical information of a single word
extracted from Wiktionary with wiktextract.
"""
model_config = ConfigDict(title="Spanish Wiktionary")

word: str = Field(description="word string")
pos: str = Field(default=None, description="Part of speech type")
Expand All @@ -117,23 +119,3 @@ class WordEntry(BaseModelWrap):
)
sounds: Optional[list[Sound]] = []
spellings: Optional[list[Spelling]] = []


if __name__ == "__main__":

class JsonSchemaGenerator(GenerateJsonSchema):
def generate(self, schema, mode="validation"):
json_schema = super().generate(schema, mode=mode)
json_schema["title"] = "Spanish Wiktionary"
json_schema["$id"] = "https://kaikki.org/es.json"
json_schema["$schema"] = self.schema_dialect
return json_schema

with open("json_schema/es.json", "w") as f:
json.dump(
WordEntry.model_json_schema(schema_generator=JsonSchemaGenerator),
f,
indent=2,
ensure_ascii=False,
sort_keys=True,
)
31 changes: 6 additions & 25 deletions src/wiktextract/extractor/ru/models.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
import json

from pydantic import BaseModel, ConfigDict, Field, model_validator
from pydantic.json_schema import GenerateJsonSchema
from pydantic import BaseModel, ConfigDict, Field


# Base class for the models in this file: the shared config re-validates
# fields on assignment and rejects fields that are not declared on the model.
class BaseModelWrap(BaseModel):
    model_config = ConfigDict(validate_assignment=True, extra="forbid")


class WordEntry(BaseModelWrap):
"""WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract."""
"""
WordEntry is a dictionary containing lexical information of a single
word extracted from Wiktionary with wiktextract.
"""
model_config = ConfigDict(title="Russian Wiktionary")

word: str = Field(description="word string")
pos: str = Field(default=None, description="Part of speech type")
Expand All @@ -24,23 +25,3 @@ class WordEntry(BaseModelWrap):
default=[],
description="list of non-disambiguated categories for the word",
)


if __name__ == "__main__":

class JsonSchemaGenerator(GenerateJsonSchema):
def generate(self, schema, mode="validation"):
json_schema = super().generate(schema, mode=mode)
json_schema["title"] = "Russian Wiktionary"
json_schema["$id"] = "https://kaikki.org/ru.json"
json_schema["$schema"] = self.schema_dialect
return json_schema

with open("json_schema/ru.json", "w") as f:
json.dump(
WordEntry.model_json_schema(schema_generator=JsonSchemaGenerator),
f,
indent=2,
ensure_ascii=False,
sort_keys=True,
)
1 change: 1 addition & 0 deletions src/wiktextract/import_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import importlib
import importlib.util
import types


Expand Down

0 comments on commit e683ba5

Please sign in to comment.