Skip to content

Commit

Permalink
Deduplicate list fields in TIMDEX record
Browse files Browse the repository at this point in the history
Why these changes are being introduced:
* Improve data quality of TIMDEX records
by reducing duplication of data in list fields.

How this addresses that need:
* Create an attrs converter function to dedupe list of items
* Create ListFields abstract class with hash method
* Set hash methods in custom classes to ListFields.__hash__
* Set 'converter=dedupe' for every list field in TimdexRecord
* Add unit tests verifying deduplication of list fields

Side effects of this change:
* Deduplication is highly likely to result in diffs
when comparing transformed records before and after this
change. However (and more importantly), reducing duplicates
improves the data quality of TIMDEX records.

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/TIMX-332
  • Loading branch information
jonavellecuerdo committed Aug 12, 2024
1 parent 1910426 commit 2871769
Show file tree
Hide file tree
Showing 2 changed files with 340 additions and 41 deletions.
273 changes: 253 additions & 20 deletions tests/test_models.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,6 @@
import pytest

from transmogrifier.models import (
AlternateTitle,
Contributor,
Date,
DateRange,
Identifier,
Link,
Note,
Subject,
)
import transmogrifier.models as timdex


def test_timdex_record_required_fields_only(timdex_record_required_fields):
Expand Down Expand Up @@ -45,14 +36,16 @@ def test_timdex_record_required_fields_only(timdex_record_required_fields):


def test_timdex_record_required_subfields_only(timdex_record_required_fields):
timdex_record_required_fields.contributors = [Contributor(value="Smith, Jane")]
timdex_record_required_fields.identifiers = [Identifier(value="123")]
timdex_record_required_fields.contributors = [timdex.Contributor(value="Smith, Jane")]
timdex_record_required_fields.identifiers = [timdex.Identifier(value="123")]
timdex_record_required_fields.links = [
Link(url="http://dx.doi.org/10.1007/978-94-017-0726-8")
timdex.Link(url="http://dx.doi.org/10.1007/978-94-017-0726-8")
]
timdex_record_required_fields.notes = [Note(value=["This book is awesome"])]
timdex_record_required_fields.alternate_titles = [AlternateTitle(value="Alt Title")]
timdex_record_required_fields.subjects = [Subject(value=["Stuff"])]
timdex_record_required_fields.notes = [timdex.Note(value=["This book is awesome"])]
timdex_record_required_fields.alternate_titles = [
timdex.AlternateTitle(value="Alt Title")
]
timdex_record_required_fields.subjects = [timdex.Subject(value=["Stuff"])]
assert timdex_record_required_fields.source == "A Cool Repository"
assert timdex_record_required_fields.source_link == "https://example.com/123"
assert timdex_record_required_fields.timdex_record_id == "cool-repo:123"
Expand Down Expand Up @@ -337,8 +330,10 @@ def test_timdex_record_date_range_both_gt_and_gte_raises_error(
match="range may have a 'gt' or 'gte' value, but not both;",
):
timdex_record_required_fields.dates = [
Date(
range=DateRange(gt="2019-01-01", gte="2019-01-01", lt="2019-06-30"),
timdex.Date(
range=timdex.DateRange(
gt="2019-01-01", gte="2019-01-01", lt="2019-06-30"
),
)
]

Expand All @@ -350,8 +345,10 @@ def test_timdex_record_date_range_both_lt_and_lte_raises_error(
ValueError, match="range may have a 'lt' or 'lte' value, but not both;"
):
timdex_record_required_fields.dates = [
Date(
range=DateRange(gt="2019-01-01", lt="2019-06-30", lte="2019-06-30"),
timdex.Date(
range=timdex.DateRange(
gt="2019-01-01", lt="2019-06-30", lte="2019-06-30"
),
)
]

Expand All @@ -374,3 +371,239 @@ def test_timdex_record_not_a_list_raises_error(timdex_record_required_fields):
match="'dates' must be <class 'list'>",
):
timdex_record_required_fields.dates = "test"


def test_timdex_record_dedupe_alternate_titles(timdex_record_required_fields):
    """Repeated alternate titles assigned to the record collapse to one entry."""

    def make_title():
        # Fresh instance per call: the duplicates are equal but distinct objects.
        return timdex.AlternateTitle(value="My Octopus Teacher")

    record = timdex_record_required_fields
    record.alternate_titles = [make_title(), make_title()]
    assert record.alternate_titles == [make_title()]


def test_timdex_record_dedupe_call_numbers(timdex_record_required_fields):
    """A call number listed twice is stored only once."""
    record = timdex_record_required_fields
    duplicated = ["a", "a"]
    record.call_numbers = duplicated
    assert record.call_numbers == ["a"]


def test_timdex_record_dedupe_content_type(timdex_record_required_fields):
    """A content type listed twice is stored only once."""
    record = timdex_record_required_fields
    duplicated = ["thesis", "thesis"]
    record.content_type = duplicated
    assert record.content_type == ["thesis"]


def test_timdex_record_dedupe_contents(timdex_record_required_fields):
    """A contents entry listed twice is stored only once."""
    record = timdex_record_required_fields
    duplicated = ["Chapter 1", "Chapter 1"]
    record.contents = duplicated
    assert record.contents == ["Chapter 1"]


def test_timdex_record_dedupe_contributors(timdex_record_required_fields):
    """Two identical contributors assigned to the record reduce to one."""

    def make_contributor():
        # Built anew on each call so every instance owns its own affiliation list.
        return timdex.Contributor(
            value="Joe Hisaishi",
            affiliation=["Kunitachi College of Music"],
            kind="Composer",
        )

    record = timdex_record_required_fields
    record.contributors = [make_contributor(), make_contributor()]
    assert record.contributors == [make_contributor()]


def test_timdex_record_dedupe_dates(timdex_record_required_fields):
    """Duplicate value-dates and range-dates each reduce to one, in given order."""

    def publication_date():
        return timdex.Date(value="2022-01-01", kind="Publication date")

    def ranged_date():
        return timdex.Date(
            range=timdex.DateRange(gt="2019-01-01", lt="2019-06-30"),
        )

    record = timdex_record_required_fields
    record.dates = [
        publication_date(),
        publication_date(),
        ranged_date(),
        ranged_date(),
    ]
    assert record.dates == [publication_date(), ranged_date()]


def test_timdex_record_dedupe_file_formats(timdex_record_required_fields):
    """A file format listed twice is stored only once."""
    record = timdex_record_required_fields
    duplicated = ["application/pdf", "application/pdf"]
    record.file_formats = duplicated
    assert record.file_formats == ["application/pdf"]


def test_timdex_record_dedupe_funding_information(timdex_record_required_fields):
    """Two identical funders assigned to the record reduce to one."""

    def make_funder():
        # Fresh instance per call: the duplicates are equal but distinct objects.
        return timdex.Funder(funder_name="NPR Foundation")

    record = timdex_record_required_fields
    record.funding_information = [make_funder(), make_funder()]
    assert record.funding_information == [make_funder()]


def test_timdex_record_dedupe_holdings(timdex_record_required_fields):
    """Two identical holdings assigned to the record reduce to one."""

    def make_holding():
        # Fresh instance per call: the duplicates are equal but distinct objects.
        return timdex.Holding(
            call_number="PL2687.L8.A28 1994",
            collection="Stacks",
            format="Print volume",
            location="Hayden Library",
        )

    record = timdex_record_required_fields
    record.holdings = [make_holding(), make_holding()]
    assert record.holdings == [make_holding()]


def test_timdex_record_dedupe_identifiers(timdex_record_required_fields):
    """Two identical identifiers assigned to the record reduce to one."""

    def make_identifier():
        # Fresh instance per call: the duplicates are equal but distinct objects.
        return timdex.Identifier(value="9781250185969. hardcover", kind="ISBN")

    record = timdex_record_required_fields
    record.identifiers = [make_identifier(), make_identifier()]
    assert record.identifiers == [make_identifier()]


def test_timdex_record_dedupe_languages(timdex_record_required_fields):
    """A language listed twice is stored only once."""
    record = timdex_record_required_fields
    duplicated = ["Spanish", "Spanish"]
    record.languages = duplicated
    assert record.languages == ["Spanish"]


def test_timdex_record_dedupe_links(timdex_record_required_fields):
    """Two identical links assigned to the record reduce to one."""

    def make_link():
        # Fresh instance per call: the duplicates are equal but distinct objects.
        return timdex.Link(
            url="https://geodata.libraries.mit.edu/record/gismit"
            ":GISPORTAL_GISOWNER01_BOSTONWATER95",
            kind="Website",
            text="Website",
        )

    record = timdex_record_required_fields
    record.links = [make_link(), make_link()]
    assert record.links == [make_link()]


def test_timdex_record_dedupe_locations(timdex_record_required_fields):
    """Two identical locations assigned to the record reduce to one."""

    def make_location():
        # Fresh instance per call: the duplicates are equal but distinct objects.
        return timdex.Location(value="One Place", kind="Place of Publication")

    record = timdex_record_required_fields
    record.locations = [make_location(), make_location()]
    assert record.locations == [make_location()]


def test_timdex_record_dedupe_notes(timdex_record_required_fields):
    """Two identical notes assigned to the record reduce to one."""

    def make_note():
        # Built anew on each call so every instance owns its own value list.
        return timdex.Note(value=["Survey Data"], kind="Datacite resource type")

    record = timdex_record_required_fields
    record.notes = [make_note(), make_note()]
    assert record.notes == [make_note()]


def test_timdex_record_dedupe_publication_frequency(timdex_record_required_fields):
    """A publication frequency listed twice is stored only once."""
    record = timdex_record_required_fields
    duplicated = ["Three times a year", "Three times a year"]
    record.publication_frequency = duplicated
    assert record.publication_frequency == ["Three times a year"]


def test_timdex_record_dedupe_publishers(timdex_record_required_fields):
    """Two identical publishers assigned to the record reduce to one."""

    def make_publisher():
        # Fresh instance per call: the duplicates are equal but distinct objects.
        return timdex.Publisher(name="Harvard Dataverse")

    record = timdex_record_required_fields
    record.publishers = [make_publisher(), make_publisher()]
    assert record.publishers == [make_publisher()]


def test_timdex_record_dedupe_related_items(timdex_record_required_fields):
    """Two identical related items assigned to the record reduce to one."""

    def make_related_item():
        # Fresh instance per call: the duplicates are equal but distinct objects.
        return timdex.RelatedItem(
            description="Nature Communications", relationship="host"
        )

    record = timdex_record_required_fields
    record.related_items = [make_related_item(), make_related_item()]
    assert record.related_items == [make_related_item()]


def test_timdex_record_dedupe_rights(timdex_record_required_fields):
    """Two identical rights statements assigned to the record reduce to one."""

    def make_rights():
        # Fresh instance per call: the duplicates are equal but distinct objects.
        return timdex.Rights(
            description="MIT authentication required", kind="Access to files"
        )

    record = timdex_record_required_fields
    record.rights = [make_rights(), make_rights()]
    assert record.rights == [make_rights()]


def test_timdex_record_dedupe_subjects(timdex_record_required_fields):
    """Two identical subjects assigned to the record reduce to one."""

    def make_subject():
        # Built anew on each call so every instance owns its own value list.
        return timdex.Subject(
            value=["Social Sciences", "Educational materials"],
            kind="Subject scheme not provided",
        )

    record = timdex_record_required_fields
    record.subjects = [make_subject(), make_subject()]
    assert record.subjects == [make_subject()]


def test_timdex_record_dedupe_summary(timdex_record_required_fields):
    """A summary sentence listed twice is stored only once."""
    record = timdex_record_required_fields
    duplicated = [
        "Mitochondria is the powerhouse of the cell.",
        "Mitochondria is the powerhouse of the cell.",
    ]
    record.summary = duplicated
    assert record.summary == ["Mitochondria is the powerhouse of the cell."]
Loading

0 comments on commit 2871769

Please sign in to comment.