From 395e612f8887b279ccf35363694bafb06ef9a25a Mon Sep 17 00:00:00 2001 From: Jonavelle Cuerdo Date: Thu, 15 Aug 2024 11:52:59 -0400 Subject: [PATCH] Deduplicate list fields in TIMDEX record (#206) Why these changes are being introduced: * Improve data quality of TIMDEX records by reducing duplication of data in list fields. How this addresses that need: * Create an attrs converter function to dedupe list of items * Create hash method for TIMDEX objects * Set hash methods in custom classes * Set 'converter=dedupe' for every list field in TimdexRecord * Add unit tests verifying functionality of hash and dedupe methods Side effects of this change: * Deduplication is highly likely to result in diffs when comparing transformed records before and after this change. However (and more importantly), reducing duplicates improves the data quality of TIMDEX records. Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/TIMX-332 --- tests/test_models.py | 316 ++++++++++++++++++++++++++++++++++++--- transmogrifier/models.py | 119 ++++++++++++--- 2 files changed, 394 insertions(+), 41 deletions(-) diff --git a/tests/test_models.py b/tests/test_models.py index ec2600c..4172d2c 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -1,15 +1,6 @@ import pytest -from transmogrifier.models import ( - AlternateTitle, - Contributor, - Date, - DateRange, - Identifier, - Link, - Note, - Subject, -) +import transmogrifier.models as timdex def test_timdex_record_required_fields_only(timdex_record_required_fields): @@ -45,14 +36,16 @@ def test_timdex_record_required_fields_only(timdex_record_required_fields): def test_timdex_record_required_subfields_only(timdex_record_required_fields): - timdex_record_required_fields.contributors = [Contributor(value="Smith, Jane")] - timdex_record_required_fields.identifiers = [Identifier(value="123")] + timdex_record_required_fields.contributors = [timdex.Contributor(value="Smith, Jane")] + timdex_record_required_fields.identifiers = [timdex.Identifier(value="123")] timdex_record_required_fields.links = [ - Link(url="http://dx.doi.org/10.1007/978-94-017-0726-8") + timdex.Link(url="http://dx.doi.org/10.1007/978-94-017-0726-8") ] - timdex_record_required_fields.notes = [Note(value=["This book is awesome"])] - timdex_record_required_fields.alternate_titles = [AlternateTitle(value="Alt Title")] - timdex_record_required_fields.subjects = [Subject(value=["Stuff"])] + timdex_record_required_fields.notes = [timdex.Note(value=["This book is awesome"])] + timdex_record_required_fields.alternate_titles = [ + timdex.AlternateTitle(value="Alt Title") + ] + timdex_record_required_fields.subjects = [timdex.Subject(value=["Stuff"])] assert timdex_record_required_fields.source == "A Cool Repository" assert timdex_record_required_fields.source_link == "https://example.com/123" assert timdex_record_required_fields.timdex_record_id == "cool-repo:123" @@ -337,8 +330,10 @@ def test_timdex_record_date_range_both_gt_and_gte_raises_error( match="range may have a 'gt' or 'gte' value, but not both;", ): timdex_record_required_fields.dates = [ - Date( - range=DateRange(gt="2019-01-01", gte="2019-01-01", lt="2019-06-30"), + timdex.Date( + range=timdex.DateRange( + gt="2019-01-01", gte="2019-01-01", lt="2019-06-30" + ), ) ] @@ -350,8 +345,10 @@ def test_timdex_record_date_range_both_lt_and_lte_raises_error( ValueError, match="range may have a 'lt' or 'lte' value, but not both;" ): timdex_record_required_fields.dates = [ - Date( - range=DateRange(gt="2019-01-01", lt="2019-06-30", lte="2019-06-30"), + timdex.Date( + range=timdex.DateRange( + gt="2019-01-01", lt="2019-06-30", lte="2019-06-30" + ), ) ] @@ -374,3 +371,282 @@ def test_timdex_record_not_a_list_raises_error(timdex_record_required_fields): match="'dates' must be ", ): timdex_record_required_fields.dates = "test" + + +def test_timdex_object_hash_diff_if_diff_class(): + """ + Asserts that TIMDEX objects of different class types + with similar attributes and attribute values will + be assigned different hashes and declared not equal. + """ + identifier = timdex.Identifier(value="x", kind="y") + alternate_title = timdex.AlternateTitle(value="x", kind="y") + assert identifier != alternate_title + assert identifier.__hash__() != alternate_title.__hash__() + + +def test_timdex_object_hash_same_if_same_class(): + """ + Asserts that TIMDEX objects of different class types + with similar attributes and attribute values will + be assigned the same hash and declared equal. + """ + identifier_0 = timdex.Identifier(value="x", kind="y") + identifier_1 = timdex.Identifier(value="x", kind="y") + assert identifier_0 == identifier_1 + assert identifier_0.__hash__() == identifier_1.__hash__() + + +def test_timdex_record_dedupe_alternate_titles(timdex_record_required_fields): + timdex_record_required_fields.alternate_titles = [ + timdex.AlternateTitle(value="My Octopus Teacher"), + timdex.AlternateTitle(value="My Octopus Teacher"), + ] + assert timdex_record_required_fields.alternate_titles == [ + timdex.AlternateTitle(value="My Octopus Teacher") + ] + + +def test_timdex_record_dedupe_call_numbers(timdex_record_required_fields): + timdex_record_required_fields.call_numbers = ["a", "a"] + assert timdex_record_required_fields.call_numbers == ["a"] + + +def test_timdex_record_dedupe_content_type(timdex_record_required_fields): + timdex_record_required_fields.content_type = ["thesis", "thesis"] + assert timdex_record_required_fields.content_type == ["thesis"] + + +def test_timdex_record_dedupe_contents(timdex_record_required_fields): + timdex_record_required_fields.contents = ["Chapter 1", "Chapter 1"] + assert timdex_record_required_fields.contents == ["Chapter 1"] + + +def test_timdex_record_dedupe_contributors(timdex_record_required_fields): + timdex_record_required_fields.contributors = [ + timdex.Contributor( + value="Joe Hisaishi", + affiliation=["Kunitachi College of Music"], + kind="Composer", + ), + timdex.Contributor( + value="Joe Hisaishi", + affiliation=["Kunitachi College of Music"], + kind="Composer", + ), + ] + assert timdex_record_required_fields.contributors == [ + timdex.Contributor( + value="Joe Hisaishi", + affiliation=["Kunitachi College of Music"], + kind="Composer", + ) + ] + + +def test_timdex_record_dedupe_dates(timdex_record_required_fields): + timdex_record_required_fields.dates = [ + timdex.Date(value="2022-01-01", kind="Publication date"), + timdex.Date(value="2022-01-01", kind="Publication date"), + timdex.Date( + range=timdex.DateRange(gt="2019-01-01", lt="2019-06-30"), + ), + timdex.Date( + range=timdex.DateRange(gt="2019-01-01", lt="2019-06-30"), + ), + ] + assert timdex_record_required_fields.dates == [ + timdex.Date(value="2022-01-01", kind="Publication date"), + timdex.Date( + range=timdex.DateRange(gt="2019-01-01", lt="2019-06-30"), + ), + ] + + +def test_timdex_record_dedupe_file_formats(timdex_record_required_fields): + timdex_record_required_fields.file_formats = [ + "application/pdf", + "application/pdf", + ] + assert timdex_record_required_fields.file_formats == ["application/pdf"] + + +def test_timdex_record_dedupe_funding_information(timdex_record_required_fields): + timdex_record_required_fields.funding_information = [ + timdex.Funder(funder_name="NPR Foundation"), + timdex.Funder(funder_name="NPR Foundation"), + ] + assert timdex_record_required_fields.funding_information == [ + timdex.Funder(funder_name="NPR Foundation") + ] + + +def test_timdex_record_dedupe_holdings(timdex_record_required_fields): + timdex_record_required_fields.holdings = [ + timdex.Holding( + call_number="PL2687.L8.A28 1994", + collection="Stacks", + format="Print volume", + location="Hayden Library", + ), + timdex.Holding( + call_number="PL2687.L8.A28 1994", + collection="Stacks", + format="Print volume", + location="Hayden Library", + ), + ] + assert timdex_record_required_fields.holdings == [ + timdex.Holding( + call_number="PL2687.L8.A28 1994", + collection="Stacks", + format="Print volume", + location="Hayden Library", + ) + ] + + +def test_timdex_record_dedupe_identifiers(timdex_record_required_fields): + timdex_record_required_fields.identifiers = [ + timdex.Identifier(value="9781250185969. hardcover", kind="ISBN"), + timdex.Identifier(value="9781250185969. hardcover", kind="ISBN"), + ] + assert timdex_record_required_fields.identifiers == [ + timdex.Identifier(value="9781250185969. hardcover", kind="ISBN") + ] + + +def test_timdex_record_dedupe_languages(timdex_record_required_fields): + timdex_record_required_fields.languages = ["Spanish", "Spanish"] + assert timdex_record_required_fields.languages == ["Spanish"] + + +def test_timdex_record_dedupe_links(timdex_record_required_fields): + timdex_record_required_fields.links = [ + timdex.Link( + url="https://geodata.libraries.mit.edu/record/gismit" + ":GISPORTAL_GISOWNER01_BOSTONWATER95", + kind="Website", + text="Website", + ), + timdex.Link( + url="https://geodata.libraries.mit.edu/record/gismit" + ":GISPORTAL_GISOWNER01_BOSTONWATER95", + kind="Website", + text="Website", + ), + ] + assert timdex_record_required_fields.links == [ + timdex.Link( + url="https://geodata.libraries.mit.edu/record/gismit" + ":GISPORTAL_GISOWNER01_BOSTONWATER95", + kind="Website", + text="Website", + ) + ] + + +def test_timdex_record_dedupe_locations(timdex_record_required_fields): + timdex_record_required_fields.locations = [ + timdex.Location(value="One Place", kind="Place of Publication"), + timdex.Location(value="One Place", kind="Place of Publication"), + ] + assert timdex_record_required_fields.locations == [ + timdex.Location(value="One Place", kind="Place of Publication") + ] + + +def test_timdex_record_dedupe_notes(timdex_record_required_fields): + timdex_record_required_fields.notes = [ + timdex.Note(value=["Survey Data"], kind="Datacite resource type"), + timdex.Note(value=["Survey Data"], kind="Datacite resource type"), + ] + assert timdex_record_required_fields.notes == [ + timdex.Note(value=["Survey Data"], kind="Datacite resource type"), + ] + + +def test_timdex_record_dedupe_publication_frequency(timdex_record_required_fields): + timdex_record_required_fields.publication_frequency = [ + "Three times a year", + "Three times a year", + ] + assert timdex_record_required_fields.publication_frequency == ["Three times a year"] + + +def test_timdex_record_dedupe_publishers(timdex_record_required_fields): + timdex_record_required_fields.publishers = [ + timdex.Publisher(name="Harvard Dataverse"), + timdex.Publisher(name="Harvard Dataverse"), + ] + assert timdex_record_required_fields.publishers == [ + timdex.Publisher(name="Harvard Dataverse") + ] + + +def test_timdex_record_dedupe_related_items(timdex_record_required_fields): + timdex_record_required_fields.related_items = [ + timdex.RelatedItem(description="Nature Communications", relationship="host"), + timdex.RelatedItem(description="Nature Communications", relationship="host"), + ] + assert timdex_record_required_fields.related_items == [ + timdex.RelatedItem(description="Nature Communications", relationship="host") + ] + + +def test_timdex_record_dedupe_rights(timdex_record_required_fields): + timdex_record_required_fields.rights = [ + timdex.Rights(description="MIT authentication required", kind="Access to files"), + timdex.Rights(description="MIT authentication required", kind="Access to files"), + ] + assert timdex_record_required_fields.rights == [ + timdex.Rights(description="MIT authentication required", kind="Access to files") + ] + + +def test_timdex_record_dedupe_subjects(timdex_record_required_fields): + timdex_record_required_fields.subjects = [ + timdex.Subject( + value=["Social Sciences", "Educational materials"], + kind="Subject scheme not provided", + ), + timdex.Subject( + value=["Social Sciences", "Educational materials"], + kind="Subject scheme not provided", + ), + ] + assert timdex_record_required_fields.subjects == [ + timdex.Subject( + value=["Social Sciences", "Educational materials"], + kind="Subject scheme not provided", + ) + ] + + +def test_timdex_record_dedupe_summary(timdex_record_required_fields): + timdex_record_required_fields.summary = [ + "Mitochondria is the powerhouse of the cell.", + "Mitochondria is the powerhouse of the cell.", + ] + assert timdex_record_required_fields.summary == [ + "Mitochondria is the powerhouse of the cell." + ] + + +def test_timdex_dedupes_correctly_if_diff_class(): + items = [ + timdex.Identifier(value="x", kind="y"), + timdex.AlternateTitle(value="x", kind="y"), + ] + assert timdex.dedupe(items) == [ + timdex.Identifier(value="x", kind="y"), + timdex.AlternateTitle(value="x", kind="y"), + ] + + +def test_timdex_dedupes_correctly_if_same_class(): + items = [ + timdex.Identifier(value="x", kind="y"), + timdex.Identifier(value="x", kind="y"), + ] + assert timdex.dedupe(items) == [timdex.Identifier(value="x", kind="y")] diff --git a/transmogrifier/models.py b/transmogrifier/models.py index f3ccc0b..949035d 100644 --- a/transmogrifier/models.py +++ b/transmogrifier/models.py @@ -43,11 +43,43 @@ def not_empty( raise ValueError(message) +def timdex_object_hash(timdex_object: Any) -> int: # noqa: ANN401 + """Hash method for TIMDEX objects. + + This method is set as the hash method for TIMDEX objects. + The method generates a unique hash using a tuple + comprised of the class name and attribute values. + By making TIMDEX objects hashable, dedupe methods + can be applied to a list of TIMDEX objects. + """ + values = (timdex_object.__class__.__name__,) + values += tuple( + [ + tuple(attrib) if isinstance(attrib, list) else attrib + for attrib in attrs.astuple(timdex_object) + ] + ) + return hash(values) + + +def dedupe(item_list: list | Any) -> list | None: # noqa: ANN401 + """Deduplication method for list of items. + + This method is used as a converter function for list fields + in the TimdexRecord model. + """ + if not isinstance(item_list, list): + return item_list + return list(dict.fromkeys(item_list)) + + @define class AlternateTitle: value: str = field(validator=instance_of(str)) # Required subfield kind: str | None = field(default=None, validator=optional(instance_of(str))) + __hash__ = timdex_object_hash + @define class Contributor: @@ -59,6 +91,8 @@ class Contributor: default=None, validator=optional(instance_of(bool)) ) + __hash__ = timdex_object_hash + @define class DateRange: @@ -73,10 +107,13 @@ class Date: kind: str | None = field(default=None, validator=optional(instance_of(str))) note: str | None = field(default=None, validator=optional(instance_of(str))) range: DateRange | None = field( # type: ignore[misc] - default=None, validator=[optional(instance_of(DateRange)), check_range] + default=None, + validator=[optional(instance_of(DateRange)), check_range], ) value: str | None = field(default=None, validator=optional(instance_of(str))) + __hash__ = timdex_object_hash + @define class Funder: @@ -90,6 +127,8 @@ class Funder: award_number: str | None = field(default=None, validator=optional(instance_of(str))) award_uri: str | None = field(default=None, validator=optional(instance_of(str))) + __hash__ = timdex_object_hash + @define class Holding: @@ -99,12 +138,16 @@ class Holding: location: str | None = field(default=None, validator=optional(instance_of(str))) note: str | None = field(default=None, validator=optional(instance_of(str))) + __hash__ = timdex_object_hash + @define class Identifier: value: str = field(validator=instance_of(str)) # Required subfield kind: str | None = field(default=None, validator=optional(instance_of(str))) + __hash__ = timdex_object_hash + @define class Link: @@ -113,6 +156,8 @@ class Link: restrictions: str | None = field(default=None, validator=optional(instance_of(str))) text: str | None = field(default=None, validator=optional(instance_of(str))) + __hash__ = timdex_object_hash + @define class Location: @@ -120,12 +165,16 @@ class Location: kind: str | None = field(default=None, validator=optional(instance_of(str))) geoshape: str | None = field(default=None, validator=optional(instance_of(str))) + __hash__ = timdex_object_hash + @define class Note: value: list[str] = field(validator=list_of(str)) # Required subfield kind: str | None = field(default=None, validator=optional(instance_of(str))) + __hash__ = timdex_object_hash + @define class Publisher: @@ -133,6 +182,8 @@ class Publisher: date: str | None = field(default=None, validator=optional(instance_of(str))) location: str | None = field(default=None, validator=optional(instance_of(str))) + __hash__ = timdex_object_hash + @define class RelatedItem: @@ -141,6 +192,8 @@ class RelatedItem: relationship: str | None = field(default=None, validator=optional(instance_of(str))) uri: str | None = field(default=None, validator=optional(instance_of(str))) + __hash__ = timdex_object_hash + @define class Rights: @@ -148,12 +201,16 @@ class Rights: kind: str | None = field(default=None, validator=optional(instance_of(str))) uri: str | None = field(default=None, validator=optional(instance_of(str))) + __hash__ = timdex_object_hash + @define class Subject: value: list[str] = field(validator=list_of(str)) # Required subfield kind: str | None = field(default=None, validator=optional(instance_of(str))) + __hash__ = timdex_object_hash + @define class TimdexRecord: @@ -165,54 +222,74 @@ class TimdexRecord: # Optional fields alternate_titles: list[AlternateTitle] | None = field( - default=None, validator=optional(list_of(AlternateTitle)) + default=None, converter=dedupe, validator=optional(list_of(AlternateTitle)) + ) + call_numbers: list[str] | None = field( + default=None, converter=dedupe, validator=optional(list_of(str)) ) - call_numbers: list[str] | None = field(default=None, validator=optional(list_of(str))) citation: str | None = field(default=None, validator=optional(instance_of(str))) - content_type: list[str] | None = field(default=None, validator=optional(list_of(str))) - contents: list[str] | None = field(default=None, validator=optional(list_of(str))) + content_type: list[str] | None = field( + default=None, converter=dedupe, validator=optional(list_of(str)) + ) + contents: list[str] | None = field( + default=None, converter=dedupe, validator=optional(list_of(str)) + ) contributors: list[Contributor] | None = field( - default=None, validator=optional(list_of(Contributor)) + default=None, converter=dedupe, validator=optional(list_of(Contributor)) + ) + dates: list[Date] | None = field( + default=None, converter=dedupe, validator=optional(list_of(Date)) ) - dates: list[Date] | None = field(default=None, validator=optional(list_of(Date))) edition: str | None = field(default=None, validator=optional(instance_of(str))) - file_formats: list[str] | None = field(default=None, validator=optional(list_of(str))) + file_formats: list[str] | None = field( + default=None, converter=dedupe, validator=optional(list_of(str)) + ) format: str | None = field(default=None, validator=optional(instance_of(str))) funding_information: list[Funder] | None = field( - default=None, validator=optional(list_of(Funder)) + default=None, converter=dedupe, validator=optional(list_of(Funder)) ) holdings: list[Holding] | None = field( - default=None, validator=optional(list_of(Holding)) + default=None, converter=dedupe, validator=optional(list_of(Holding)) ) identifiers: list[Identifier] | None = field( - default=None, validator=optional(list_of(Identifier)) + default=None, converter=dedupe, validator=optional(list_of(Identifier)) + ) + languages: list[str] | None = field( + default=None, converter=dedupe, validator=optional(list_of(str)) + ) + links: list[Link] | None = field( + default=None, converter=dedupe, validator=optional(list_of(Link)) ) - languages: list[str] | None = field(default=None, validator=optional(list_of(str))) - links: list[Link] | None = field(default=None, validator=optional(list_of(Link))) literary_form: str | None = field(default=None, validator=optional(instance_of(str))) locations: list[Location] | None = field( - default=None, validator=optional(list_of(Location)) + default=None, converter=dedupe, validator=optional(list_of(Location)) + ) + notes: list[Note] | None = field( + default=None, converter=dedupe, validator=optional(list_of(Note)) ) - notes: list[Note] | None = field(default=None, validator=optional(list_of(Note))) numbering: str | None = field(default=None, validator=optional(instance_of(str))) physical_description: str | None = field( default=None, validator=optional(instance_of(str)) ) provider: str | None = field(default=None, validator=optional(instance_of(str))) publication_frequency: list[str] | None = field( - default=None, validator=optional(list_of(str)) + default=None, converter=dedupe, validator=optional(list_of(str)) ) publishers: list[Publisher] | None = field( - default=None, validator=optional(list_of(Publisher)) + default=None, converter=dedupe, validator=optional(list_of(Publisher)) ) related_items: list[RelatedItem] | None = field( - default=None, validator=optional(list_of(RelatedItem)) + default=None, converter=dedupe, validator=optional(list_of(RelatedItem)) + ) + rights: list[Rights] | None = field( + default=None, converter=dedupe, validator=optional(list_of(Rights)) ) - rights: list[Rights] | None = field(default=None, validator=optional(list_of(Rights))) subjects: list[Subject] | None = field( - default=None, validator=optional(list_of(Subject)) + default=None, converter=dedupe, validator=optional(list_of(Subject)) + ) + summary: list[str] | None = field( + default=None, converter=dedupe, validator=optional(list_of(str)) ) - summary: list[str] | None = field(default=None, validator=optional(list_of(str))) def asdict(self) -> dict[str, Any]: return asdict(self, filter=lambda _, value: value is not None)