Skip to content

Commit

Permalink
Add converter to dedupe lists in TimdexRecord
Browse files Browse the repository at this point in the history
Why these changes are being introduced:
* Reduce duplication of data for TimdexRecord attributes
of list type.

How this addresses that need:
* Create an attrs converter function to dedupe list of items
* Add __hash__ methods for custom classes
* Apply dedupe methods to derived fields

Side effects of this change:
* Deduplication is highly likely to result in diffs
when comparing transformed records before and after this
change. That said, we are improving our transformation
pipeline by removing duplicated data.

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/TIMX-332
  • Loading branch information
jonavellecuerdo committed Aug 9, 2024
1 parent b1ec2e7 commit e0fc7e9
Showing 1 changed file with 19 additions and 3 deletions.
22 changes: 19 additions & 3 deletions transmogrifier/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,12 @@ def list_of(item_type: Any) -> Callable: # noqa: ANN401
)


def dedupe(item_list: list | None) -> list:
if item_list is None:
return item_list
return list(set(item_list))


def not_empty(
_instance: "TimdexRecord", attribute: "attrs.Attribute", value: "list"
) -> None:
Expand Down Expand Up @@ -77,6 +83,9 @@ class Date:
)
value: str | None = field(default=None, validator=optional(instance_of(str)))

def __hash__(self) -> int:
    """Return a hash built from the kind, note, range and value attributes."""
    # NOTE(review): hashing the tuple requires every member to be hashable;
    # the type of `range` is not visible here -- confirm it defines __hash__.
    identity = (self.kind, self.note, self.range, self.value)
    return hash(identity)


@define
class Funder:
Expand Down Expand Up @@ -120,6 +129,9 @@ class Location:
kind: str | None = field(default=None, validator=optional(instance_of(str)))
geoshape: str | None = field(default=None, validator=optional(instance_of(str)))

def __hash__(self) -> int:
    """Return a hash built from the value, kind and geoshape attributes."""
    identity = (self.value, self.kind, self.geoshape)
    return hash(identity)


@define
class Note:
Expand Down Expand Up @@ -169,12 +181,16 @@ class TimdexRecord:
)
call_numbers: list[str] | None = field(default=None, validator=optional(list_of(str)))
citation: str | None = field(default=None, validator=optional(instance_of(str)))
content_type: list[str] | None = field(default=None, validator=optional(list_of(str)))
content_type: list[str] | None = field(
default=None, converter=dedupe, validator=optional(list_of(str))
)
contents: list[str] | None = field(default=None, validator=optional(list_of(str)))
contributors: list[Contributor] | None = field(
default=None, validator=optional(list_of(Contributor))
)
dates: list[Date] | None = field(default=None, validator=optional(list_of(Date)))
dates: list[Date] | None = field(
default=None, converter=dedupe, validator=optional(list_of(Date))
)
edition: str | None = field(default=None, validator=optional(instance_of(str)))
file_formats: list[str] | None = field(default=None, validator=optional(list_of(str)))
format: str | None = field(default=None, validator=optional(instance_of(str)))
Expand All @@ -191,7 +207,7 @@ class TimdexRecord:
links: list[Link] | None = field(default=None, validator=optional(list_of(Link)))
literary_form: str | None = field(default=None, validator=optional(instance_of(str)))
locations: list[Location] | None = field(
default=None, validator=optional(list_of(Location))
default=None, converter=dedupe, validator=optional(list_of(Location))
)
notes: list[Note] | None = field(default=None, validator=optional(list_of(Note)))
numbering: str | None = field(default=None, validator=optional(instance_of(str)))
Expand Down

0 comments on commit e0fc7e9

Please sign in to comment.