Skip to content

Commit

Permalink
refactor aggregate_latest
Browse files Browse the repository at this point in the history
fsimonjetz committed Dec 6, 2023
1 parent 6f2d893 commit ad74b4e
Showing 5 changed files with 86 additions and 44 deletions.
10 changes: 4 additions & 6 deletions ebl/fragmentarium/application/fragment_repository.py
Original file line number Diff line number Diff line change
@@ -53,12 +53,6 @@ def query_path_of_the_pioneers(
) -> List[Fragment]:
...

@abstractmethod
def query_by_transliterated_sorted_by_date(
self, user_scopes: Sequence[Scope]
) -> List[Fragment]:
...

@abstractmethod
def query_by_transliterated_not_revised_by_other(
self, user_scopes: Sequence[Scope]
@@ -95,6 +89,10 @@ def update_field(self, field: str, fragment: Fragment) -> None:
def query(self, query: dict, user_scopes: Sequence[Scope] = tuple()) -> QueryResult:
...

# Abstract repository hook: concrete repositories return a QueryResult for the
# given user scopes (user_scopes defaults to an empty tuple).
# NOTE(review): the name suggests "most recently transliterated fragments" -
# confirm against the concrete implementation.
@abstractmethod
def query_latest(self, user_scopes: Sequence[Scope] = tuple()) -> QueryResult:
...

# Abstract repository hook: concrete repositories return a list of scopes for
# the given museum number.
# NOTE(review): presumably these are the scopes stored on the fragment
# identified by `number` - confirm against the concrete implementation.
@abstractmethod
def fetch_scopes(self, number: MuseumNumber) -> List[Scope]:
...
45 changes: 24 additions & 21 deletions ebl/fragmentarium/infrastructure/mongo_fragment_repository.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List, Optional, Sequence
from typing import List, Optional, Sequence, Iterator

import pymongo
from marshmallow import EXCLUDE
@@ -49,6 +49,11 @@ def load_museum_number(data: dict) -> MuseumNumber:
return MuseumNumberSchema().load(data.get("museumNumber", data))


def load_query_result(cursor: Iterator) -> QueryResult:
    """Build a QueryResult from the first item yielded by ``cursor``.

    Only one item is consumed from the iterator. When the iterator is
    already exhausted, or the first item is falsy, an empty QueryResult
    is returned instead of loading anything through the schema.
    """
    first_document = next(cursor, None)
    if not first_document:
        return QueryResult.create_empty()
    return QueryResultSchema().load(first_document)


class MongoFragmentRepository(FragmentRepository):
def __init__(self, database):
self._fragments = MongoCollection(database, FRAGMENTS_COLLECTION)
@@ -255,14 +260,6 @@ def query_transliterated_line_to_vec(self) -> List[LineToVecEntry]:
for fragment in cursor
]

def query_by_transliterated_sorted_by_date(
self, user_scopes: Sequence[Scope] = tuple()
):
cursor = self._fragments.aggregate(
[*aggregate_latest(user_scopes), {"$project": {"joins": False}}]
)
return self._map_fragments(cursor)

def query_by_transliterated_not_revised_by_other(
self, user_scopes: Sequence[Scope] = tuple()
):
@@ -407,21 +404,27 @@ def _map_fragments(self, cursor) -> Sequence[Fragment]:
return FragmentSchema(unknown=EXCLUDE, many=True).load(cursor)

def query(self, query: dict, user_scopes: Sequence[Scope] = tuple()) -> QueryResult:
if set(query) - {"lemmaOperator"}:
matcher = PatternMatcher(query, user_scopes)
data = next(
self._fragments.aggregate(
matcher.build_pipeline(),
collation=Collation(
locale="en", numericOrdering=True, alternate="shifted"
),
cursor = (
self._fragments.aggregate(
PatternMatcher(query, user_scopes).build_pipeline(),
collation=Collation(
locale="en", numericOrdering=True, alternate="shifted"
),
None,
)
else:
data = None
if set(query) - {"lemmaOperator"}
else iter([])
)
return load_query_result(cursor)

return QueryResultSchema().load(data) if data else QueryResult.create_empty()
def query_latest(self, user_scopes: Sequence[Scope] = tuple()) -> QueryResult:
    """Run the ``aggregate_latest`` pipeline and return its QueryResult.

    The pipeline built for ``user_scopes`` is executed on the fragments
    collection with an English collation (numeric ordering, shifted
    alternates); the resulting cursor is handed to ``load_query_result``.
    """
    latest_pipeline = aggregate_latest(user_scopes)
    collation = Collation(locale="en", numericOrdering=True, alternate="shifted")
    cursor = self._fragments.aggregate(latest_pipeline, collation=collation)
    return load_query_result(cursor)

def list_all_fragments(
self, user_scopes: Sequence[Scope] = tuple()
66 changes: 55 additions & 11 deletions ebl/fragmentarium/infrastructure/queries.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List, Sequence
from typing import List, Sequence, Dict
from ebl.common.domain.accession import Accession
from ebl.common.domain.scopes import Scope
from ebl.fragmentarium.domain.archaeology import ExcavationNumber
@@ -14,9 +14,10 @@
from ebl.transliteration.infrastructure.queries import query_number_is

HAS_TRANSLITERATION: dict = {"text.lines.type": {"$exists": True}}
NUMBER_OF_LATEST_TRANSLITERATIONS: int = 50
NUMBER_OF_NEEDS_REVISION: int = 20
PATH_OF_THE_PIONEERS_MAX_UNCURATED_REFERENCES: int = 10
LATEST_TRANSLITERATION_LIMIT: int = 50
LATEST_TRANSLITERATION_LINE_LIMIT: int = 3


def fragment_is(fragment: Fragment) -> dict:
@@ -59,8 +60,10 @@ def aggregate_random(user_scopes: Sequence[Scope] = tuple()) -> List[dict]:
]


def aggregate_latest(user_scopes: Sequence[Scope] = tuple()) -> List[dict]:
temp_field_name = "_temp"
def aggregate_latest(
user_scopes: Sequence[Scope] = tuple(),
) -> List[Dict]:
tmp_record = "_tmpRecord"
return [
{
"$match": {
@@ -69,21 +72,62 @@ def aggregate_latest(user_scopes: Sequence[Scope] = tuple()) -> List[dict]:
}
},
{
"$addFields": {
temp_field_name: {
"$project": {
tmp_record: {
"$filter": {
"input": "$record",
"as": "entry",
"cond": {
"$eq": ["$$entry.type", RecordType.TRANSLITERATION.value]
"$eq": [
"$$entry.type",
RecordType.TRANSLITERATION.value,
]
},
}
}
},
"lineType": "$text.lines.type",
"museumNumber": 1,
}
},
{"$sort": {f"{tmp_record}.date": -1}},
{"$limit": LATEST_TRANSLITERATION_LIMIT},
{
"$unwind": {
"path": "$lineType",
"includeArrayIndex": "lineIndex",
}
},
{"$match": {"lineType": "TextLine"}},
{
"$group": {
"_id": "$_id",
"museumNumber": {"$first": "$museumNumber"},
"matchingLines": {"$push": "$lineIndex"},
tmp_record: {"$first": f"${tmp_record}"},
}
},
{"$sort": {f"{tmp_record}.date": -1}},
{
"$project": {
"_id": False,
"museumNumber": True,
"matchCount": {"$literal": 0},
"matchingLines": {
"$slice": [
"$matchingLines",
0,
LATEST_TRANSLITERATION_LINE_LIMIT,
]
},
}
},
{
"$group": {
"_id": None,
"items": {"$push": "$$ROOT"},
}
},
{"$sort": {f"{temp_field_name}.date": -1}},
{"$limit": NUMBER_OF_LATEST_TRANSLITERATIONS},
{"$project": {temp_field_name: 0}},
{"$project": {"_id": False, "items": 1, "matchCountTotal": {"$literal": 0}}},
]


3 changes: 1 addition & 2 deletions ebl/fragmentarium/web/fragments.py
Original file line number Diff line number Diff line change
@@ -130,8 +130,7 @@ def __init__(
def on_get(self, req: Request, resp: Response):
resp.text = json.dumps(
QueryResultSchema().dump(
self._repository.query(
{"latest": True},
self._repository.query_latest(
req.context.user.get_scopes(
prefix="read:", suffix="-fragments"
),
6 changes: 2 additions & 4 deletions ebl/tests/fragmentarium/test_fragment_repository.py
Original file line number Diff line number Diff line change
@@ -11,12 +11,10 @@
from ebl.errors import NotFoundError
from ebl.fragmentarium.application.fragment_repository import FragmentRepository
from ebl.fragmentarium.domain.record import RecordType
from ebl.fragmentarium.infrastructure.queries import LATEST_TRANSLITERATION_LINE_LIMIT
from ebl.fragmentarium.infrastructure.mongo_fragment_repository import (
MongoFragmentRepository,
)
from ebl.fragmentarium.infrastructure.fragment_search_aggregations import (
LATEST_TRANSLITERATION_LINE_LIMIT,
)
from ebl.fragmentarium.application.fragment_schema import FragmentSchema
from ebl.fragmentarium.application.joins_schema import JoinSchema
from ebl.fragmentarium.application.line_to_vec import LineToVecEntry
@@ -1130,4 +1128,4 @@ def test_query_latest(fragment_repository):
}
)

assert fragment_repository.query({"latest": True}) == expected_result
assert fragment_repository.query_latest() == expected_result

0 comments on commit ad74b4e

Please sign in to comment.