From dd60ad210fce573ae2930794bbf8099ea736dccd Mon Sep 17 00:00:00 2001 From: Ilya Khait Date: Tue, 28 Nov 2023 15:58:03 +0100 Subject: [PATCH] AfO Register (#470) * Implement AfO Register serialization & repos (WiP) * Update resources & bind to context * Add tests & update * Fix test * Update search, factory & tests * Implement afo register suggestions & tests * 'Refactored by Sourcery' (#484) Co-authored-by: Sourcery AI <> * Update query & sorting * Update tests & format * Clean up * Update queries & sorting * Update & refactor query collation, use in afo register queries * Refactor * Add text + textNumber queries (search by traditionalReferences) * Add route & test * Use post * Update test * Extend fragment query for `traditionalReferences` --------- Co-authored-by: sourcery-ai[bot] <58596630+sourcery-ai[bot]@users.noreply.github.com> --- .../application/afo_register_repository.py | 29 +++ .../domain/afo_register_record.py | 19 ++ .../mongo_afo_register_repository.py | 123 ++++++++++++ ebl/afo_register/web/afo_register_records.py | 49 +++++ ebl/afo_register/web/bootstrap.py | 21 ++ ebl/app.py | 7 +- ebl/common/query/query_collation.py | 188 ++++++++++++++++++ ebl/context.py | 2 + .../application/dictionary_service.py | 2 +- ebl/dictionary/application/word_repository.py | 10 +- ebl/dictionary/domain/dictionary_query.py | 85 -------- .../infrastructure/word_repository.py | 10 +- .../fragment_search_aggregations.py | 8 + ebl/tests/afo_register/test_afo_register.py | 94 +++++++++ .../test_afo_register_repository.py | 82 ++++++++ .../afo_register/test_afo_register_route.py | 78 ++++++++ ebl/tests/conftest.py | 10 + ebl/tests/dictionary/test_word_repository.py | 6 +- ebl/tests/factories/afo_register.py | 66 ++++++ poetry.lock | 17 ++ pyproject.toml | 1 + 21 files changed, 807 insertions(+), 100 deletions(-) create mode 100644 ebl/afo_register/application/afo_register_repository.py create mode 100644 ebl/afo_register/domain/afo_register_record.py create mode 100644 
ebl/afo_register/infrastructure/mongo_afo_register_repository.py create mode 100644 ebl/afo_register/web/afo_register_records.py create mode 100644 ebl/afo_register/web/bootstrap.py create mode 100644 ebl/common/query/query_collation.py delete mode 100644 ebl/dictionary/domain/dictionary_query.py create mode 100644 ebl/tests/afo_register/test_afo_register.py create mode 100644 ebl/tests/afo_register/test_afo_register_repository.py create mode 100644 ebl/tests/afo_register/test_afo_register_route.py create mode 100644 ebl/tests/factories/afo_register.py diff --git a/ebl/afo_register/application/afo_register_repository.py b/ebl/afo_register/application/afo_register_repository.py new file mode 100644 index 000000000..cf375858b --- /dev/null +++ b/ebl/afo_register/application/afo_register_repository.py @@ -0,0 +1,29 @@ +from typing import Sequence +from abc import ABC, abstractmethod + +from ebl.afo_register.domain.afo_register_record import ( + AfoRegisterRecord, + AfoRegisterRecordSuggestion, +) + + +class AfoRegisterRepository(ABC): + @abstractmethod + def create(self, afo_register_record: AfoRegisterRecord) -> str: + ... + + @abstractmethod + def search(self, query, *args, **kwargs) -> Sequence[AfoRegisterRecord]: + ... + + @abstractmethod + def search_by_texts_and_numbers( + self, query_list: Sequence[str], *args, **kwargs + ) -> Sequence[AfoRegisterRecord]: + ... + + @abstractmethod + def search_suggestions( + self, text_query: str, *args, **kwargs + ) -> Sequence[AfoRegisterRecordSuggestion]: + ... 
diff --git a/ebl/afo_register/domain/afo_register_record.py b/ebl/afo_register/domain/afo_register_record.py new file mode 100644 index 000000000..9432e9475 --- /dev/null +++ b/ebl/afo_register/domain/afo_register_record.py @@ -0,0 +1,19 @@ +import attr +from typing import Sequence + + +@attr.s(frozen=True, auto_attribs=True) +class AfoRegisterRecord: + afo_number: str = "" + page: str = "" + text: str = "" + text_number: str = "" + lines_discussed: str = "" + discussed_by: str = "" + discussed_by_notes: str = "" + + +@attr.s(frozen=True, auto_attribs=True) +class AfoRegisterRecordSuggestion: + text: str = "" + text_numbers: Sequence[str] = tuple() diff --git a/ebl/afo_register/infrastructure/mongo_afo_register_repository.py b/ebl/afo_register/infrastructure/mongo_afo_register_repository.py new file mode 100644 index 000000000..cfdf9cb83 --- /dev/null +++ b/ebl/afo_register/infrastructure/mongo_afo_register_repository.py @@ -0,0 +1,123 @@ +from marshmallow import Schema, fields, post_load, EXCLUDE +from typing import cast, Sequence +from pymongo.database import Database +from natsort import natsorted +from ebl.mongo_collection import MongoCollection +from ebl.afo_register.domain.afo_register_record import ( + AfoRegisterRecord, + AfoRegisterRecordSuggestion, +) +from ebl.afo_register.application.afo_register_repository import AfoRegisterRepository +from ebl.common.query.query_collation import ( + make_query_params, +) + + +COLLECTION = "afo_register" + + +def create_search_query(query): + if "textNumber" not in query: + return query + text_number = query["textNumber"] + text_number_stripped = text_number.strip('"') + if text_number != text_number_stripped: + query["textNumber"] = text_number_stripped + else: + query["textNumber"] = {"$regex": f"^{text_number}.*", "$options": "i"} + return query + + +def cast_with_sorting( + records: Sequence[AfoRegisterRecord], +) -> Sequence[AfoRegisterRecord]: + return cast( + Sequence[AfoRegisterRecord], + natsorted(records, 
key=lambda record: f"${record.text} ${record.text_number}"), + ) + + +class AfoRegisterRecordSchema(Schema): + class Meta: + unknown = EXCLUDE + + afo_number = fields.String(required=True, data_key="afoNumber") + page = fields.String(required=True) + text = fields.String(required=True) + text_number = fields.String(required=True, data_key="textNumber") + lines_discussed = fields.String(data_key="linesDiscussed") + discussed_by = fields.String(data_key="discussedBy") + discussed_by_notes = fields.String(data_key="discussedByNotes") + + @post_load + def make_record(self, data, **kwargs): + return AfoRegisterRecord(**data) + + +class AfoRegisterRecordSuggestionSchema(Schema): + text = fields.String(required=True) + text_numbers = fields.List(fields.String(), required=True, data_key="textNumbers") + + @post_load + def make_suggestion(self, data, **kwargs): + data["text_numbers"] = natsorted(data["text_numbers"]) + return AfoRegisterRecordSuggestion(**data) + + +class MongoAfoRegisterRepository(AfoRegisterRepository): + def __init__(self, database: Database): + self._afo_register = MongoCollection(database, COLLECTION) + + def create(self, afo_register_record: AfoRegisterRecord) -> str: + return self._afo_register.insert_one( + AfoRegisterRecordSchema().dump(afo_register_record) + ) + + def search(self, query, *args, **kwargs) -> Sequence[AfoRegisterRecord]: + data = self._afo_register.find_many(create_search_query(query)) + records = AfoRegisterRecordSchema().load(data, many=True) + return cast_with_sorting(records) + + def search_by_texts_and_numbers( + self, query_list: Sequence[str], *args, **kwargs + ) -> Sequence[AfoRegisterRecord]: + pipeline = [ + { + "$addFields": { + "combined_field": {"$concat": ["$text", " ", "$textNumber"]} + } + }, + {"$match": {"combined_field": {"$in": query_list}}}, + {"$group": {"_id": "$_id", "document": {"$first": "$$ROOT"}}}, + {"$replaceRoot": {"newRoot": "$document"}}, + {"$project": {"combined_field": 0}}, + ] + data = 
self._afo_register.aggregate(pipeline) + records = AfoRegisterRecordSchema().load(data, many=True) + return cast_with_sorting(records) + + def search_suggestions( + self, text_query: str, *args, **kwargs + ) -> Sequence[AfoRegisterRecordSuggestion]: + collated_query = list(make_query_params({"text": text_query}, "afo-register"))[ + 0 + ] + pipeline = [ + {"$match": {"text": {"$regex": collated_query.value, "$options": "i"}}}, + {"$group": {"_id": "$text", "textNumbers": {"$addToSet": "$textNumber"}}}, + { + "$project": { + "text": "$_id", + "_id": 0, + "textNumbers": {"$setUnion": ["$textNumbers", []]}, + } + }, + {"$unwind": "$textNumbers"}, + {"$sort": {"textNumbers": 1}}, + {"$group": {"_id": "$text", "textNumbers": {"$push": "$textNumbers"}}}, + {"$project": {"text": "$_id", "textNumbers": "$textNumbers", "_id": 0}}, + ] + suggestions = AfoRegisterRecordSuggestionSchema().load( + self._afo_register.aggregate(pipeline), many=True + ) + return cast(Sequence[AfoRegisterRecordSuggestion], suggestions) diff --git a/ebl/afo_register/web/afo_register_records.py b/ebl/afo_register/web/afo_register_records.py new file mode 100644 index 000000000..77d87565e --- /dev/null +++ b/ebl/afo_register/web/afo_register_records.py @@ -0,0 +1,49 @@ +from falcon import Request, Response +from ebl.errors import NotFoundError + +from ebl.afo_register.application.afo_register_repository import AfoRegisterRepository +from ebl.afo_register.infrastructure.mongo_afo_register_repository import ( + AfoRegisterRecordSchema, + AfoRegisterRecordSuggestionSchema, +) + + +class AfoRegisterResource: + def __init__(self, afoRegisterRepository: AfoRegisterRepository): + self._afoRegisterRepository = afoRegisterRepository + + def on_get(self, req: Request, resp: Response) -> None: + try: + response = self._afoRegisterRepository.search(req.params) + except ValueError as error: + raise NotFoundError( + f"No AfO registry entries matching {str(req.params)} found." 
+ ) from error + resp.media = AfoRegisterRecordSchema().dump(response, many=True) + + +class AfoRegisterTextsAndNumbersResource: + def __init__(self, afoRegisterRepository: AfoRegisterRepository): + self._afoRegisterRepository = afoRegisterRepository + + def on_post(self, req: Request, resp: Response) -> None: + try: + response = self._afoRegisterRepository.search_by_texts_and_numbers( + req.media + ) + except ValueError as error: + raise NotFoundError( + f"No AfO registry entries matching {str(req.media)} found." + ) from error + resp.media = AfoRegisterRecordSchema().dump(response, many=True) + + +class AfoRegisterSuggestionsResource: + def __init__(self, afoRegisterRepository: AfoRegisterRepository): + self._afoRegisterRepository = afoRegisterRepository + + def on_get(self, req: Request, resp: Response) -> None: + response = self._afoRegisterRepository.search_suggestions( + req.params["text_query"] + ) + resp.media = AfoRegisterRecordSuggestionSchema().dump(response, many=True) diff --git a/ebl/afo_register/web/bootstrap.py b/ebl/afo_register/web/bootstrap.py new file mode 100644 index 000000000..ca7bbfe0e --- /dev/null +++ b/ebl/afo_register/web/bootstrap.py @@ -0,0 +1,21 @@ +import falcon +from ebl.context import Context + +from ebl.afo_register.web.afo_register_records import ( + AfoRegisterResource, + AfoRegisterTextsAndNumbersResource, + AfoRegisterSuggestionsResource, +) + + +def create_afo_register_routes(api: falcon.App, context: Context): + afo_register_search = AfoRegisterResource(context.afo_register_repository) + afo_register_search_texts_and_numbers = AfoRegisterTextsAndNumbersResource( + context.afo_register_repository + ) + afo_register_suggestions_search = AfoRegisterSuggestionsResource( + context.afo_register_repository + ) + api.add_route("/afo-register", afo_register_search) + api.add_route("/afo-register/texts-numbers", afo_register_search_texts_and_numbers) + api.add_route("/afo-register/suggestions", afo_register_suggestions_search) diff 
--git a/ebl/app.py b/ebl/app.py index 6bae88fa2..72de42234 100644 --- a/ebl/app.py +++ b/ebl/app.py @@ -42,10 +42,14 @@ from ebl.lemmatization.web.bootstrap import create_lemmatization_routes from ebl.signs.infrastructure.mongo_sign_repository import MongoSignRepository from ebl.signs.web.bootstrap import create_signs_routes +from ebl.afo_register.web.bootstrap import create_afo_register_routes from ebl.transliteration.application.parallel_line_injector import ParallelLineInjector from ebl.transliteration.infrastructure.mongo_parallel_repository import ( MongoParallelRepository, ) +from ebl.afo_register.infrastructure.mongo_afo_register_repository import ( + MongoAfoRegisterRepository, +) from ebl.users.domain.user import Guest from ebl.users.infrastructure.auth0 import Auth0Backend from ebl.fragmentarium.infrastructure.mongo_findspot_repository import ( @@ -93,6 +97,7 @@ def create_context(): text_repository=MongoTextRepository(database), annotations_repository=MongoAnnotationsRepository(database), lemma_repository=MongoLemmaRepository(database), + afo_register_repository=MongoAfoRegisterRepository(database), findspot_repository=MongoFindspotRepository(database), custom_cache=custom_cache, cache=cache, @@ -121,6 +126,7 @@ def create_app(context: Context, issuer: str = "", audience: str = ""): create_fragmentarium_routes(api, context) create_lemmatization_routes(api, context) create_markup_route(api, context) + create_afo_register_routes(api, context) return api @@ -128,5 +134,4 @@ def create_app(context: Context, issuer: str = "", audience: str = ""): def get_app(): sentry_sdk.init(dsn=os.environ["SENTRY_DSN"], integrations=[FalconIntegration()]) context = create_context() - return create_app(context, os.environ["AUTH0_ISSUER"], os.environ["AUTH0_AUDIENCE"]) diff --git a/ebl/common/query/query_collation.py b/ebl/common/query/query_collation.py new file mode 100644 index 000000000..5ea0b4956 --- /dev/null +++ b/ebl/common/query/query_collation.py @@ -0,0 +1,188 @@ 
+import re +import attr +from enum import Enum +from typing import Dict, Iterable, Literal, Sequence +from urllib.parse import parse_qsl + +DataType = Literal["dictionary", "afo-register"] + + +class Fields(Enum): + DICTIONARY = { + "COLLATED_FIELDS": ["word", "meaning", "root"], + "WILDCARD_FIELDS": ["word", "root"], + "MARKDOWN_FIELDS": [], + } + AFO_REGISTER = { + "COLLATED_FIELDS": ["text"], + "WILDCARD_FIELDS": [], + "MARKDOWN_FIELDS": ["text"], + } + + @staticmethod + def findByDataType(data_type: DataType) -> Dict[str, Sequence[str]]: + if data_type == "dictionary": + return Fields.DICTIONARY.value + elif data_type == "afo-register": + return Fields.AFO_REGISTER.value + else: + raise ValueError("Invalid data type") + + @staticmethod + def use_collations(data_type: DataType, field_name: str) -> bool: + return field_name in Fields.findByDataType(data_type)["COLLATED_FIELDS"] + + @staticmethod + def use_wildcards(data_type: DataType, field_name: str) -> bool: + return field_name in Fields.findByDataType(data_type)["WILDCARD_FIELDS"] + + @staticmethod + def use_markdown_escape(data_type: DataType, field_name: str) -> bool: + return field_name in Fields.findByDataType(data_type)["MARKDOWN_FIELDS"] + + +WILDCARD_AND_COLLATION_MATCHERS: Dict[str, Dict[str, str]] = { + "any sign": {"wildcard": r"\?", "regex": r"[^\s]"}, + "any sign+": {"wildcard": r"\*", "regex": r"[^\s]*"}, + "collation S": {"wildcard": r"[s|š|ṣ|ś|σ]", "regex": r"[s|š|ṣ|ś|σ]"}, + "collation SS": {"wildcard": r"[ss|ß]", "regex": r"[ss|ß]"}, + "collation T": {"wildcard": r"[t|ṭ|τ]", "regex": r"[t|ṭ|τ]"}, + "collation D": {"wildcard": r"[d|ᵈ]", "regex": r"[d|ᵈ]"}, + "collation H": {"wildcard": r"[h|ḫ|ḥ|ʕ|ʾ|ʿ]", "regex": r"[h|ḫ|ḥ|ʕ|ʾ|ʿ]"}, + "collation C": {"wildcard": r"[c|č|ç|ć]", "regex": r"[c|č|ç|ć]"}, + "collation G": {"wildcard": r"[g|ĝ|ğ]", "regex": r"[g|ĝ|ğ]"}, + "collation K": {"wildcard": r"[k|κ]", "regex": r"[k|κ]"}, + "collation L": {"wildcard": r"[l|ł]", "regex": r"[l|ł]"}, + "collation 
N": {"wildcard": r"[n|ń|ň|ν]", "regex": r"[n|ń|ň|ν]"}, + "collation R": {"wildcard": r"[r|ř|ρ]", "regex": r"[r|ř|ρ]"}, + "collation Y": {"wildcard": r"[y|ý|ÿ]", "regex": r"[y|ý|ÿ]"}, + "collation X": {"wildcard": r"[x|ₓ]", "regex": r"[x|ₓ]"}, + "collation A": {"wildcard": r"[a|ā|â|á|à|ä|α]", "regex": r"[a|ā|â|á|à|ä|α]"}, + "collation E": {"wildcard": r"[e|ē|ê|é|è]", "regex": r"[e|ē|ê|é|è]"}, + "collation I": {"wildcard": r"[i|ī|î|í|ì|ï|ı|ι]", "regex": r"[i|ī|î|í|ì|ï|ı|ι]"}, + "collation U": {"wildcard": r"[u|ū|û|ú|ù|ü|ů]", "regex": r"[u|ū|û|ú|ù|ü|ů]"}, + "collation O": { + "wildcard": r"[o|ò|ó|ô|ö|ø|ō|ő|ο]", + "regex": r"[o|ò|ó|ô|ö|ø|ō|ő|ο]", + }, + "collation 0": {"wildcard": r"[0|₀|⁰|ø]", "regex": r"[0|₀|⁰|ø]"}, + "collation 1": {"wildcard": r"[1|₁|¹]", "regex": r"[1|₁|¹]"}, + "collation 2": {"wildcard": r"[2|₂|²]", "regex": r"[2|₂|²]"}, + "collation 3": {"wildcard": r"[3|₃|³]", "regex": r"[3|₃|³]"}, + "collation 4": {"wildcard": r"[4|₄|⁴]", "regex": r"[4|₄|⁴]"}, + "collation 5": {"wildcard": r"[5|₅|⁵]", "regex": r"[5|₅|⁵]"}, + "collation 6": {"wildcard": r"[6|₆|⁶]", "regex": r"[6|₆|⁶]"}, + "collation 7": {"wildcard": r"[7|₇|⁷]", "regex": r"[7|₇|⁷]"}, + "collation 8": {"wildcard": r"[8|₈|⁸]", "regex": r"[8|₈|⁸]"}, + "collation 9": {"wildcard": r"[9|₉|⁹]", "regex": r"[9|₉|⁹]"}, + "collation +": {"wildcard": r"[+|₊]", "regex": r"[+|₊]"}, +} + +markdown_escape = r"(\*|\^)*" + + +@attr.s(auto_attribs=True) +class CollatedFieldQuery: + string: str + field: str + data_type: DataType = attr.ib(default="dictionary") + use_wildcards: bool = attr.ib(default=False) + use_collations: bool = attr.ib(default=False) + use_markdown_escape: bool = attr.ib(default=False) + regexp: str = attr.ib(default="") + + def __attrs_post_init__(self) -> None: + self.string = self.string.strip(" ") + self.use_collations = Fields.use_collations( + self.data_type, self.field + ) and not re.match(r'^".+"$', self.string) + self.use_wildcards = Fields.use_wildcards(self.data_type, self.field) + 
self.use_markdown_escape = Fields.use_markdown_escape( + self.data_type, self.field + ) + self.string = self.string.strip('"') + self.regexp = self._make_regexp() + + @property + def value(self) -> str: + return self.regexp or re.escape(self.string) + + @property + def all_wildcards(self) -> str: + return r"|".join( + expression["wildcard"] + for expression in WILDCARD_AND_COLLATION_MATCHERS.values() + ) + + def _make_regexp(self) -> str: + regexp = r"".join( + self._wildcards_to_regexp(segment) for segment in self._segmentize() + ).replace(markdown_escape + markdown_escape, markdown_escape) + return regexp if regexp != re.escape(self.string) else "" + + def _segmentize(self) -> Iterable[str]: + return ( + segment + for segment in re.split(rf"({self.all_wildcards})", self.string) + if segment + ) + + def _is_regex(self, segment: str, type: str, expression: Dict) -> bool: + return ( + bool( + ("collation" in type and self.use_collations) + or ("collation" not in type and self.use_wildcards) + ) + if re.match(expression["wildcard"], segment) + else False + ) + + def _wildcards_to_regexp(self, segment: str) -> str: + for type, expression in WILDCARD_AND_COLLATION_MATCHERS.items(): + if not self._is_regex(segment, type, expression): + continue + return self._process_expression(segment, expression) + + return self._escape_segment(segment) + + def _process_expression(self, segment: str, expression: Dict) -> str: + regex = expression["regex"] + return ( + self._markdown_aware_regex(regex, False) + if self.use_markdown_escape + else regex + ) + + def _escape_segment(self, segment: str) -> str: + if self.use_markdown_escape: + return r"".join([self._markdown_aware_regex(char) for char in segment]) + else: + return re.escape(segment) + + def _markdown_aware_regex(self, segment: str, escape=True) -> str: + return r"".join( + [ + markdown_escape + + (re.escape(segment) if escape else segment) + + markdown_escape + ] + ) + + +def make_query_params_from_string( + query_string: 
str, data_type: DataType = "dictionary" +) -> Iterable[CollatedFieldQuery]: + parsed_query = parse_qsl(query_string) + query_dict = dict(parsed_query) if parsed_query else {} + return make_query_params(query_dict, data_type) + + +def make_query_params( + query_dict: dict, data_type: DataType = "dictionary" +) -> Iterable[CollatedFieldQuery]: + if "vowelClass" in query_dict: + query_dict["vowel_class"] = query_dict.pop("vowelClass") + return ( + CollatedFieldQuery(string, field, data_type) + for field, string in query_dict.items() + ) diff --git a/ebl/context.py b/ebl/context.py index cb55ea115..f5297a95a 100644 --- a/ebl/context.py +++ b/ebl/context.py @@ -25,6 +25,7 @@ from ebl.transliteration.application.transliteration_query_factory import ( TransliterationQueryFactory, ) +from ebl.afo_register.application.afo_register_repository import AfoRegisterRepository from ebl.fragmentarium.infrastructure.mongo_findspot_repository import ( MongoFindspotRepository, ) @@ -50,6 +51,7 @@ class Context: custom_cache: ChapterCache cache: Cache parallel_line_injector: ParallelLineInjector + afo_register_repository: AfoRegisterRepository def get_bibliography(self): return Bibliography(self.bibliography_repository, self.changelog) diff --git a/ebl/dictionary/application/dictionary_service.py b/ebl/dictionary/application/dictionary_service.py index c7a14ea1b..ad7be2555 100644 --- a/ebl/dictionary/application/dictionary_service.py +++ b/ebl/dictionary/application/dictionary_service.py @@ -4,7 +4,7 @@ from ebl.dictionary.application.word_repository import WordRepository from ebl.dictionary.domain.word import WordId from ebl.users.domain.user import User -from ebl.dictionary.domain.dictionary_query import make_query_params_from_string +from ebl.common.query.query_collation import make_query_params_from_string COLLECTION = "words" diff --git a/ebl/dictionary/application/word_repository.py b/ebl/dictionary/application/word_repository.py index 42c523eee..e92b937fe 100644 --- 
a/ebl/dictionary/application/word_repository.py +++ b/ebl/dictionary/application/word_repository.py @@ -1,6 +1,6 @@ from abc import ABC, abstractmethod from typing import Sequence, Optional -from ebl.dictionary.domain.dictionary_query import DictionaryFieldQuery +from ebl.common.query.query_collation import CollatedFieldQuery from ebl.dictionary.domain.word import WordId @@ -21,10 +21,10 @@ def query_by_ids(self, ids: Sequence[str]) -> Sequence: @abstractmethod def query_by_lemma_meaning_root_vowels( self, - word: Optional[DictionaryFieldQuery], - meaning: Optional[DictionaryFieldQuery], - root: Optional[DictionaryFieldQuery], - vowel_class: Optional[DictionaryFieldQuery], + word: Optional[CollatedFieldQuery], + meaning: Optional[CollatedFieldQuery], + root: Optional[CollatedFieldQuery], + vowel_class: Optional[CollatedFieldQuery], ) -> Sequence: ... diff --git a/ebl/dictionary/domain/dictionary_query.py b/ebl/dictionary/domain/dictionary_query.py deleted file mode 100644 index f3df1b0a2..000000000 --- a/ebl/dictionary/domain/dictionary_query.py +++ /dev/null @@ -1,85 +0,0 @@ -import re -import attr -from typing import Dict, Iterable -from urllib.parse import parse_qsl - -WILDCARD_FIELDS = ["word", "root"] -COLLATED_FIELDS = ["word", "meaning", "root"] - -WILDCARD_MATCHERS: Dict[str, Dict[str, str]] = { - "any sign": {"wildcard": r"\?", "regex": r"[^\s]"}, - "any sign+": {"wildcard": r"\*", "regex": r"[^\s]*"}, - "collation S": {"wildcard": r"[s|š|ṣ]", "regex": r"[s|š|ṣ]"}, - "collation T": {"wildcard": r"[t|ṭ]", "regex": r"[t|ṭ]"}, - "collation A": {"wildcard": r"[a|ā|â]", "regex": r"[a|ā|â]"}, - "collation E": {"wildcard": r"[e|ē|ê]", "regex": r"[e|ē|ê]"}, - "collation I": {"wildcard": r"[i|ī|î]", "regex": r"[i|ī|î]"}, - "collation U": {"wildcard": r"[u|ū|û]", "regex": r"[u|ū|û]"}, -} - - -@attr.s(auto_attribs=True) -class DictionaryFieldQuery: - string: str - field: str - use_wildcards: bool - use_collations: bool = attr.ib(init=False) - regexp: str = 
attr.ib(init=False) - - def __attrs_post_init__(self) -> None: - self.string = self.string.strip(" ") - self.use_collations = ( - not re.match(r'^".+"$', self.string) and self.field in COLLATED_FIELDS - ) - self.string = self.string.strip('"') - self.regexp = self._make_regexp() - - @property - def value(self) -> str: - return self.regexp or re.escape(self.string) - - @property - def all_wildcards(self) -> str: - return r"|".join( - expression["wildcard"] for expression in WILDCARD_MATCHERS.values() - ) - - def _make_regexp(self) -> str: - regexp = "".join( - self._wildcards_to_regexp(segment) for segment in self._segmentize() - ) - return regexp if regexp != re.escape(self.string) else "" - - def _segmentize(self) -> Iterable[str]: - return ( - segment - for segment in re.split(rf"({self.all_wildcards})", self.string) - if segment - ) - - def _is_regex(self, segment: str, type: str, expression: Dict) -> bool: - return ( - bool( - ("collation" in type and self.use_collations) - or ("collation" not in type and self.use_wildcards) - ) - if re.match(expression["wildcard"], segment) - else False - ) - - def _wildcards_to_regexp(self, segment: str) -> str: - for type, expression in WILDCARD_MATCHERS.items(): - if self._is_regex(segment, type, expression): - return expression["regex"] - return re.escape(segment) - - -def make_query_params_from_string(query_string: str) -> Iterable[DictionaryFieldQuery]: - parsed_query = parse_qsl(query_string) - query_dict = dict(parsed_query) if parsed_query else {} - if "vowelClass" in query_dict: - query_dict["vowel_class"] = query_dict.pop("vowelClass") - return ( - DictionaryFieldQuery(string, field, field in WILDCARD_FIELDS) - for field, string in query_dict.items() - ) diff --git a/ebl/dictionary/infrastructure/word_repository.py b/ebl/dictionary/infrastructure/word_repository.py index 4f2c3e2bf..554d18d6c 100644 --- a/ebl/dictionary/infrastructure/word_repository.py +++ b/ebl/dictionary/infrastructure/word_repository.py @@ -4,7 
+4,7 @@ from ebl.dictionary.application.word_repository import WordRepository from ebl.dictionary.domain.word import WordId from ebl.mongo_collection import MongoCollection -from ebl.dictionary.domain.dictionary_query import DictionaryFieldQuery +from ebl.common.query.query_collation import CollatedFieldQuery COLLECTION = "words" LEMMA_SEARCH_LIMIT = 15 @@ -140,10 +140,10 @@ def query_by_ids(self, ids: Sequence[str]) -> Sequence: def query_by_lemma_meaning_root_vowels( self, - word: Optional[DictionaryFieldQuery] = None, - meaning: Optional[DictionaryFieldQuery] = None, - root: Optional[DictionaryFieldQuery] = None, - vowel_class: Optional[DictionaryFieldQuery] = None, + word: Optional[CollatedFieldQuery] = None, + meaning: Optional[CollatedFieldQuery] = None, + root: Optional[CollatedFieldQuery] = None, + vowel_class: Optional[CollatedFieldQuery] = None, ) -> Sequence: cursor = self._collection.aggregate( [ diff --git a/ebl/fragmentarium/infrastructure/fragment_search_aggregations.py b/ebl/fragmentarium/infrastructure/fragment_search_aggregations.py index f39e08454..546bf802e 100644 --- a/ebl/fragmentarium/infrastructure/fragment_search_aggregations.py +++ b/ebl/fragmentarium/infrastructure/fragment_search_aggregations.py @@ -135,6 +135,13 @@ def _filter_by_reference(self) -> Dict: } return {"references": {"$elemMatch": parameters}} + def _filter_by_traditional_references(self) -> Dict: + if traditional_references := self._query.get("traditionalReferences"): + return { + "traditionalReferences": {"$elemMatch": {"$eq": traditional_references}} + } + return {} + def _prefilter(self) -> List[Dict]: constraints = { "$and": compact( @@ -144,6 +151,7 @@ def _prefilter(self) -> List[Dict]: self._filter_by_project(), self._filter_by_script(), self._filter_by_reference(), + self._filter_by_traditional_references(), match_user_scopes(self._scopes), ] ), diff --git a/ebl/tests/afo_register/test_afo_register.py b/ebl/tests/afo_register/test_afo_register.py new file mode 
100644 index 000000000..9ad836bd1 --- /dev/null +++ b/ebl/tests/afo_register/test_afo_register.py @@ -0,0 +1,94 @@ +import pytest +from ebl.afo_register.domain.afo_register_record import ( + AfoRegisterRecord, + AfoRegisterRecordSuggestion, +) +from ebl.afo_register.infrastructure.mongo_afo_register_repository import ( + AfoRegisterRecordSchema, + AfoRegisterRecordSuggestionSchema, +) +from ebl.tests.factories.afo_register import ( + AfoRegisterRecordFactory, + AfoRegisterRecordSuggestionFactory, +) + + +@pytest.fixture +def afo_register_record(): + return AfoRegisterRecordFactory.build() + + +@pytest.fixture +def afo_register_record_suggestion(): + return AfoRegisterRecordSuggestionFactory.build() + + +def test_afo_register_record_creation(afo_register_record: AfoRegisterRecord) -> None: + assert afo_register_record.afo_number is not None + assert afo_register_record.page is not None + assert afo_register_record.text is not None + assert afo_register_record.text_number is not None + assert afo_register_record.lines_discussed is not None + assert afo_register_record.discussed_by is not None + assert afo_register_record.discussed_by_notes is not None + + +def test_afo_register_record_defaults() -> None: + afo_register_record = AfoRegisterRecord() + + assert afo_register_record.page == "" + assert afo_register_record.text == "" + assert afo_register_record.text_number == "" + assert afo_register_record.lines_discussed == "" + assert afo_register_record.discussed_by == "" + assert afo_register_record.discussed_by_notes == "" + + +def test_afo_register_record_to_dict(afo_register_record: AfoRegisterRecord) -> None: + assert AfoRegisterRecordSchema().dump(afo_register_record) == { + "afoNumber": afo_register_record.afo_number, + "page": afo_register_record.page, + "text": afo_register_record.text, + "textNumber": afo_register_record.text_number, + "linesDiscussed": afo_register_record.lines_discussed, + "discussedBy": afo_register_record.discussed_by, + 
"discussedByNotes": afo_register_record.discussed_by_notes, + } + + +def test_afo_register_record_suggestion_to_dict( + afo_register_record_suggestion: AfoRegisterRecordSuggestion, +) -> None: + assert AfoRegisterRecordSuggestionSchema().dump(afo_register_record_suggestion) == { + "text": afo_register_record_suggestion.text, + "textNumbers": afo_register_record_suggestion.text_numbers, + } + + +def test_afo_register_record_from_dict(afo_register_record: AfoRegisterRecord) -> None: + serialized_data = AfoRegisterRecordSchema().dump(afo_register_record) + deserialized_object = AfoRegisterRecordSchema().load(serialized_data) + + assert deserialized_object.afo_number == afo_register_record.afo_number + assert deserialized_object.page == afo_register_record.page + assert deserialized_object.text == afo_register_record.text + assert deserialized_object.text_number == afo_register_record.text_number + assert deserialized_object.lines_discussed == afo_register_record.lines_discussed + assert deserialized_object.discussed_by == afo_register_record.discussed_by + assert ( + deserialized_object.discussed_by_notes == afo_register_record.discussed_by_notes + ) + + +def test_afo_register_record_suggestion_from_dict( + afo_register_record_suggestion: AfoRegisterRecordSuggestion, +) -> None: + serialized_data = AfoRegisterRecordSuggestionSchema().dump( + afo_register_record_suggestion + ) + deserialized_object = AfoRegisterRecordSuggestionSchema().load(serialized_data) + + assert deserialized_object.text == afo_register_record_suggestion.text + assert ( + deserialized_object.text_numbers == afo_register_record_suggestion.text_numbers + ) diff --git a/ebl/tests/afo_register/test_afo_register_repository.py b/ebl/tests/afo_register/test_afo_register_repository.py new file mode 100644 index 000000000..4539a3cf7 --- /dev/null +++ b/ebl/tests/afo_register/test_afo_register_repository.py @@ -0,0 +1,82 @@ +from ebl.tests.factories.afo_register import ( + AfoRegisterRecordFactory, + 
# --- ebl/tests/afo_register/test_afo_register_repository.py ---
from ebl.tests.factories.afo_register import (
    AfoRegisterRecordFactory,
    AfoRegisterRecordSuggestionFactory,
)
from ebl.afo_register.application.afo_register_repository import AfoRegisterRepository
from natsort import natsorted


def test_find_by_id(afo_register_repository: AfoRegisterRepository) -> None:
    """Search by the id returned from ``create`` and expect only that record.

    A second, unrelated record is stored as well before searching.
    """
    afo_register_record = AfoRegisterRecordFactory.build()
    # Named ``record_id`` so the builtin ``id`` is not shadowed.
    record_id = afo_register_repository.create(afo_register_record)
    afo_register_repository.create(AfoRegisterRecordFactory.build())

    assert afo_register_repository.search({"_id": record_id}) == [afo_register_record]


def test_find_by_afo_number_and_page(
    afo_register_repository: AfoRegisterRepository,
) -> None:
    """Search by ``afoNumber`` and ``page`` and expect the matching record."""
    afo_register_record = AfoRegisterRecordFactory.build()
    afo_register_repository.create(afo_register_record)
    afo_register_repository.create(AfoRegisterRecordFactory.build())

    assert afo_register_repository.search(
        {
            "afoNumber": afo_register_record.afo_number,
            "page": afo_register_record.page,
        }
    ) == [afo_register_record]


def test_find_by_all_record_parameters(
    afo_register_repository: AfoRegisterRepository,
) -> None:
    """Search with every record field in the query and expect the matching record."""
    afo_register_record = AfoRegisterRecordFactory.build()
    afo_register_repository.create(afo_register_record)
    afo_register_repository.create(AfoRegisterRecordFactory.build())

    assert afo_register_repository.search(
        {
            "afoNumber": afo_register_record.afo_number,
            "page": afo_register_record.page,
            "text": afo_register_record.text,
            "textNumber": afo_register_record.text_number,
            "linesDiscussed": afo_register_record.lines_discussed,
            "discussedBy": afo_register_record.discussed_by,
            "discussedByNotes": afo_register_record.discussed_by_notes,
        }
    ) == [afo_register_record]


def test_search_by_texts_and_numbers(
    afo_register_repository: AfoRegisterRepository,
) -> None:
    """Query two of three stored records by ``"<text> <text number>"`` strings.

    Only the two queried records are expected back; their order is not checked.
    """
    record1 = AfoRegisterRecordFactory.build(text="Text1", text_number="1")
    record2 = AfoRegisterRecordFactory.build(text="Text2", text_number="2")
    record3 = AfoRegisterRecordFactory.build(text="Text3", text_number="3")
    afo_register_repository.create(record1)
    afo_register_repository.create(record2)
    afo_register_repository.create(record3)
    query = ["Text1 1", "Text3 3"]
    results = afo_register_repository.search_by_texts_and_numbers(query)

    assert len(results) == 2
    assert record1 in results
    assert record3 in results


def test_find_record_suggestions(
    afo_register_repository: AfoRegisterRepository,
) -> None:
    """Expect one suggestion that merges the text numbers of same-text records.

    Two records sharing one ``text`` are stored; the query is that text with
    its last two characters removed, and the expected suggestion carries both
    text numbers in natural sort order.
    """
    afo_register_record = AfoRegisterRecordFactory.build()
    another_afo_register_record = AfoRegisterRecordFactory.build(
        text=afo_register_record.text
    )
    afo_register_repository.create(afo_register_record)
    afo_register_repository.create(another_afo_register_record)
    text_numbers = natsorted(
        [
            afo_register_record.text_number,
            another_afo_register_record.text_number,
        ]
    )
    afo_register_record_suggestion = AfoRegisterRecordSuggestionFactory.build(
        text=afo_register_record.text, text_numbers=text_numbers
    )

    assert afo_register_repository.search_suggestions(
        afo_register_record.text[:-2],
    ) == [afo_register_record_suggestion]


# --- ebl/tests/afo_register/test_afo_register_route.py ---
import falcon
import pytest
import json

from ebl.afo_register.domain.afo_register_record import AfoRegisterRecord
from ebl.tests.factories.afo_register import (
    AfoRegisterRecordFactory,
    AfoRegisterRecordSuggestionFactory,
)
from ebl.afo_register.application.afo_register_repository import (
    AfoRegisterRepository,
)
from ebl.afo_register.infrastructure.mongo_afo_register_repository import (
    AfoRegisterRecordSchema,
    AfoRegisterRecordSuggestionSchema,
)


@pytest.fixture
def afo_register_record() -> AfoRegisterRecord:
    """Provide a freshly built record from ``AfoRegisterRecordFactory``."""
    return AfoRegisterRecordFactory.build()


def test_search_afo_register_record_route(
    afo_register_record, afo_register_repository: AfoRegisterRepository, client
) -> None:
    """GET ``/afo-register`` with every record field as a query parameter.

    Expects HTTP 200 and a JSON list holding the schema dump of the stored record.
    """
    params = {
        "afoNumber": afo_register_record.afo_number,
        "page": afo_register_record.page,
        "text": afo_register_record.text,
        "textNumber": afo_register_record.text_number,
        "linesDiscussed": afo_register_record.lines_discussed,
        "discussedBy": afo_register_record.discussed_by,
        "discussedByNotes": afo_register_record.discussed_by_notes,
    }
    afo_register_repository.create(afo_register_record)
    get_result = client.simulate_get("/afo-register", params=params)

    assert get_result.status == falcon.HTTP_OK
    assert get_result.json == [AfoRegisterRecordSchema().dump(afo_register_record)]


def test_search_by_texts_and_numbers_route(
    afo_register_repository: AfoRegisterRepository, client
) -> None:
    """POST a JSON list of queries to ``/afo-register/texts-numbers``.

    Expects HTTP 200 and the schema dumps of ``record1`` and ``record3``, in
    that order.
    """
    record1 = AfoRegisterRecordFactory.build(text="Text1", text_number="1")
    record2 = AfoRegisterRecordFactory.build(text="Text2", text_number="2")
    record3 = AfoRegisterRecordFactory.build(text="Text3", text_number="3")
    afo_register_repository.create(record1)
    afo_register_repository.create(record2)
    afo_register_repository.create(record3)
    # This endpoint is exercised with POST, so the response is named accordingly.
    post_result = client.simulate_post(
        "/afo-register/texts-numbers", body=json.dumps(["Text1 1", "Text3 3"])
    )
    expected_results = [
        AfoRegisterRecordSchema().dump(record) for record in [record1, record3]
    ]

    assert post_result.status == falcon.HTTP_OK
    assert post_result.json == expected_results


def test_search_afo_register_suggestions_route(
    afo_register_record, afo_register_repository: AfoRegisterRepository, client
) -> None:
    """GET ``/afo-register/suggestions`` with a truncated text as ``text_query``.

    Expects HTTP 200 and a single suggestion built from the stored record's
    text and its one text number.
    """
    afo_register_repository.create(afo_register_record)
    get_result = client.simulate_get(
        "/afo-register/suggestions",
        params={"text_query": afo_register_record.text[:-2]},
    )
    afo_register_record_suggestion = AfoRegisterRecordSuggestionFactory.build(
        text=afo_register_record.text, text_numbers=[afo_register_record.text_number]
    )

    assert get_result.status == falcon.HTTP_OK
    assert get_result.json == [
        AfoRegisterRecordSuggestionSchema().dump(afo_register_record_suggestion)
    ]
AfoRegisterRecordSuggestionSchema().dump(afo_register_record_suggestion) + ] diff --git a/ebl/tests/conftest.py b/ebl/tests/conftest.py index a9c84b07d..c108a67f1 100644 --- a/ebl/tests/conftest.py +++ b/ebl/tests/conftest.py @@ -75,6 +75,9 @@ from ebl.transliteration.infrastructure.mongo_parallel_repository import ( MongoParallelRepository, ) +from ebl.afo_register.infrastructure.mongo_afo_register_repository import ( + MongoAfoRegisterRepository, +) from ebl.users.domain.user import Guest, User from ebl.users.infrastructure.auth0 import Auth0User from ebl.fragmentarium.web.annotations import AnnotationResource @@ -259,6 +262,11 @@ def fragment_updater( ) +@pytest.fixture +def afo_register_repository(database): + return MongoAfoRegisterRepository(database) + + class FakeFile(File): def __init__(self, filename: str, data: bytes, metadata: dict): self.filename = filename @@ -420,6 +428,7 @@ def context( bibliography_repository, annotations_repository, lemma_repository, + afo_register_repository, findspot_repository, user, parallel_line_injector, @@ -440,6 +449,7 @@ def context( text_repository=text_repository, annotations_repository=annotations_repository, lemma_repository=lemma_repository, + afo_register_repository=afo_register_repository, findspot_repository=findspot_repository, cache=Cache({"CACHE_TYPE": "null"}), custom_cache=ChapterCache(mongo_cache_repository), diff --git a/ebl/tests/dictionary/test_word_repository.py b/ebl/tests/dictionary/test_word_repository.py index fa521b1a4..38a8862db 100644 --- a/ebl/tests/dictionary/test_word_repository.py +++ b/ebl/tests/dictionary/test_word_repository.py @@ -4,14 +4,14 @@ from typing import Dict from urllib.parse import urlencode -from ebl.dictionary.domain.dictionary_query import DictionaryFieldQuery +from ebl.common.query.query_collation import CollatedFieldQuery from ebl.errors import NotFoundError -from ebl.dictionary.domain.dictionary_query import make_query_params_from_string +from 
# --- ebl/tests/factories/afo_register.py ---
import factory
from ebl.afo_register.domain.afo_register_record import (
    AfoRegisterRecord,
    AfoRegisterRecordSuggestion,
)
from faker import Faker
from typing import Sequence
from natsort import natsorted

# Abbreviations the generators below pick from at random.
PUBLICATIONS = ["StOr", "Al.T.", "OECT", "VS", "STT", "CCT", "CM", "CST", "SAAB"]

# One shared Faker instance used by every generator in this module.
fake = Faker()


def get_afo_number() -> str:
    """Return ``"AfO <n>"`` with ``n`` drawn via ``random_int(min=10, max=52)``."""
    number = fake.random_int(min=10, max=52)
    return f"AfO {number}"


def get_page() -> str:
    """Return a random integer (``min=1, max=700``) rendered as a string."""
    page_number = fake.random_int(min=1, max=700)
    return str(page_number)


def get_text() -> str:
    """Return ``"<publication>, <n>"`` using a random ``PUBLICATIONS`` entry."""
    # The element is drawn before the integer, so the order of random draws is fixed.
    publication = fake.random_element(elements=PUBLICATIONS)
    number = fake.random_int(min=1, max=40)
    return f"{publication}, {number}"


def get_text_number() -> str:
    """Return ``"Nr. <n>"`` with ``n`` drawn via ``random_int(min=1, max=300)``."""
    number = fake.random_int(min=1, max=300)
    return f"Nr. {number}"


def get_text_numbers() -> Sequence[str]:
    """Return a list of text numbers whose length is drawn with ``min=1, max=15``."""
    # The length is drawn once, up front, before any text number is generated.
    count = fake.random_int(min=1, max=15)
    return [get_text_number() for _ in range(count)]


def get_lines_discussed() -> str:
    """Return ``"<n>f."`` with ``n`` drawn via ``random_int(min=1, max=40)``."""
    line_number = fake.random_int(min=1, max=40)
    return f"{line_number}f."
def get_discussed_by() -> str:
    """Return ``"<last name>, <publication>, <n>, <m>"`` built from random parts.

    The publication is a random ``PUBLICATIONS`` entry; ``n`` is drawn with
    ``min=1, max=40`` and ``m`` with ``min=1, max=50``.
    """
    return (
        f"{fake.last_name()}, "
        f"{fake.random_element(elements=PUBLICATIONS)}, "
        f"{fake.random_int(min=1, max=40)}, "
        f"{fake.random_int(min=1, max=50)}"
    )


class AfoRegisterRecordFactory(factory.Factory):
    """Factory building ``AfoRegisterRecord`` objects from the generators above.

    The generator functions take no arguments, so they are declared with
    ``factory.LazyFunction`` instead of being wrapped in lambdas that ignore
    the object passed by ``factory.LazyAttribute``.
    """

    class Meta:
        model = AfoRegisterRecord

    afo_number = factory.LazyFunction(get_afo_number)
    page = factory.LazyFunction(get_page)
    text = factory.LazyFunction(get_text)
    text_number = factory.LazyFunction(get_text_number)
    lines_discussed = factory.LazyFunction(get_lines_discussed)
    discussed_by = factory.LazyFunction(get_discussed_by)
    discussed_by_notes = factory.Faker("sentence")


class AfoRegisterRecordSuggestionFactory(factory.Factory):
    """Factory building ``AfoRegisterRecordSuggestion`` objects.

    ``text_numbers`` is generated already passed through ``natsorted``.
    """

    class Meta:
        model = AfoRegisterRecordSuggestion

    text = factory.LazyFunction(get_text)
    text_numbers = factory.LazyFunction(lambda: natsorted(get_text_numbers()))
+category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "natsort-8.4.0-py3-none-any.whl", hash = "sha256:4732914fb471f56b5cce04d7bae6f164a592c7712e1c85f9ef585e197299521c"}, + {file = "natsort-8.4.0.tar.gz", hash = "sha256:45312c4a0e5507593da193dedd04abb1469253b601ecaf63445ad80f0a1ea581"}, +] + +[package.extras] +fast = ["fastnumbers (>=2.0.0)"] +icu = ["PyICU (>=1.0.0)"] + [[package]] name = "packaging" version = "23.1" @@ -2026,5 +2042,6 @@ files = [ [metadata] lock-version = "2.0" + python-versions = "3.9.*" content-hash = "93ce1c17d3f17df5b730911f7c14aa6547a578505bd37c7c69623c8a46be5f2d" diff --git a/pyproject.toml b/pyproject.toml index b21d86d7e..8ba6301cb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ althaia = "^3.14.1-alpha.2" cairosvg = "^2.7.0" pyyaml = "6.0.1" python-dateutil = "^2.8.2" +natsort = "^8.4.0" [tool.poetry.dev-dependencies]