From e0fc7e9e9a0612b83663871144e7515fac91515e Mon Sep 17 00:00:00 2001
From: jonavellecuerdo
Date: Fri, 9 Aug 2024 09:49:57 -0400
Subject: [PATCH] Add converter to dedupe lists in TimdexRecord

Why these changes are being introduced:
* Reduce duplication of data for TimdexRecord attributes of list type.

How this addresses that need:
* Create an attrs converter function to dedupe list of items
* Add __hash__ methods for custom classes
* Apply dedupe methods to derived fields

Side effects of this change:
* Deduplication is highly likely to result in diffs when comparing transformed records before and after this change. That said, we are improving our transformation pipeline by removing duplicated data.

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/TIMX-332
---
 transmogrifier/models.py | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/transmogrifier/models.py b/transmogrifier/models.py
index f3ccc0b..1dcac04 100644
--- a/transmogrifier/models.py
+++ b/transmogrifier/models.py
@@ -35,6 +35,12 @@ def list_of(item_type: Any) -> Callable: # noqa: ANN401
     )
 
 
+def dedupe(item_list: list | None) -> list:
+    if item_list is None:
+        return item_list
+    return list(set(item_list))
+
+
 def not_empty(
     _instance: "TimdexRecord", attribute: "attrs.Attribute", value: "list"
 ) -> None:
@@ -77,6 +83,9 @@ class Date:
     )
     value: str | None = field(default=None, validator=optional(instance_of(str)))
 
+    def __hash__(self) -> int:
+        return hash((self.kind, self.note, self.range, self.value))
+
 
 @define
 class Funder:
@@ -120,6 +129,9 @@ class Location:
     kind: str | None = field(default=None, validator=optional(instance_of(str)))
     geoshape: str | None = field(default=None, validator=optional(instance_of(str)))
 
+    def __hash__(self) -> int:
+        return hash((self.value, self.kind, self.geoshape))
+
 
 @define
 class Note:
@@ -169,12 +181,16 @@ class TimdexRecord:
     )
     call_numbers: list[str] | None = field(default=None, validator=optional(list_of(str)))
     citation: str | None = field(default=None, validator=optional(instance_of(str)))
-    content_type: list[str] | None = field(default=None, validator=optional(list_of(str)))
+    content_type: list[str] | None = field(
+        default=None, converter=dedupe, validator=optional(list_of(str))
+    )
     contents: list[str] | None = field(default=None, validator=optional(list_of(str)))
     contributors: list[Contributor] | None = field(
         default=None, validator=optional(list_of(Contributor))
     )
-    dates: list[Date] | None = field(default=None, validator=optional(list_of(Date)))
+    dates: list[Date] | None = field(
+        default=None, converter=dedupe, validator=optional(list_of(Date))
+    )
     edition: str | None = field(default=None, validator=optional(instance_of(str)))
     file_formats: list[str] | None = field(default=None, validator=optional(list_of(str)))
     format: str | None = field(default=None, validator=optional(instance_of(str)))
@@ -191,7 +207,7 @@ class TimdexRecord:
     links: list[Link] | None = field(default=None, validator=optional(list_of(Link)))
     literary_form: str | None = field(default=None, validator=optional(instance_of(str)))
     locations: list[Location] | None = field(
-        default=None, validator=optional(list_of(Location))
+        default=None, converter=dedupe, validator=optional(list_of(Location))
     )
     notes: list[Note] | None = field(default=None, validator=optional(list_of(Note)))
     numbering: str | None = field(default=None, validator=optional(instance_of(str)))