diff --git a/Pipfile b/Pipfile index 9f1e248..0c31ed8 100644 --- a/Pipfile +++ b/Pipfile @@ -10,6 +10,8 @@ click = "*" lxml = "*" sentry-sdk = "*" smart-open = {version = "*", extras = ["s3"]} +python-dateutil = "*" +types-python-dateutil = "*" [dev-packages] bandit = "*" diff --git a/Pipfile.lock b/Pipfile.lock index 30be99e..892a93c 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "b75c85f05210d23436b4ec39362f78429ab453490ca2973c7aa3e65c10801cea" + "sha256": "73cbd531ce329d9ebd99e42ac910bee90935e0d6f9f0d1096b4b2492790a1cb7" }, "pipfile-spec": 6, "requires": { @@ -34,34 +34,34 @@ }, "boto3": { "hashes": [ - "sha256:1f4b9c23dfcad910b6f8e74aac9fe507c1e75fcdd832e25ed2ff1e6d7a99cddf", - "sha256:92c0631ab91b4c5aa0e18a90b4d12df361723c6df1ef7e346db71f2ad0803ab3" + "sha256:0fe7a35cf0041145c8eefebd3ae2ddf41baed62d7c963e5042b8ed8c297f648f", + "sha256:e24460d50001b517c6734dcf1c879feb43aa2062d88d9bdbb8703c986cb05941" ], - "version": "==1.28.4" + "version": "==1.28.11" }, "botocore": { "hashes": [ - "sha256:1c14ac4521af707a7a407cee0e22695ce3e95c0f1a0c974e21cb25a3ce78a538", - "sha256:f9738a23b03c55c2958ebdee65273afeda80deaeefebe595887fc3251e48293a" + "sha256:b17ff973bb70b02b227928c2abe4992f1cfc46d13aee0228516c8f32572b88c6", + "sha256:d3cbffe554c9a1ba2ac6973734c43c21b8e7985a2ac4a4c31a09811b8029445c" ], "markers": "python_version >= '3.7'", - "version": "==1.31.4" + "version": "==1.31.11" }, "certifi": { "hashes": [ - "sha256:0f0d56dc5a6ad56fd4ba36484d6cc34451e1c6548c61daad8c320169f91eddc7", - "sha256:c6c2e98f5c7869efca1f8916fed228dd91539f9f1b444c314c06eef02980c716" + "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082", + "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9" ], "markers": "python_version >= '3.6'", - "version": "==2023.5.7" + "version": "==2023.7.22" }, "click": { "hashes": [ - "sha256:4be4b1af8d665c6d942909916d31a213a106800c47d0eeba73d34da3cbc11367", - 
"sha256:e576aa487d679441d7d30abb87e1b43d24fc53bffb8758443b1a9e1cee504548" + "sha256:48ee849951919527a045bfe3bf7baa8a959c423134e1a5b98c05c20ba75a1cbd", + "sha256:fa244bb30b3b5ee2cae3da8f55c9e5e0c0e86093306301fb418eb9dc40fbded5" ], "index": "pypi", - "version": "==8.1.5" + "version": "==8.1.6" }, "jmespath": { "hashes": [ @@ -174,7 +174,7 @@ "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86", "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "index": "pypi", "version": "==2.8.2" }, "s3transfer": { @@ -220,6 +220,14 @@ "markers": "python_version >= '3.7'", "version": "==2.4.1" }, + "types-python-dateutil": { + "hashes": [ + "sha256:1f4f10ac98bb8b16ade9dbee3518d9ace017821d94b057a425b069f834737f4b", + "sha256:f977b8de27787639986b4e28963263fd0e5158942b3ecef91b9335c130cb1ce9" + ], + "index": "pypi", + "version": "==2.8.19.14" + }, "urllib3": { "hashes": [ "sha256:8d36afa7616d8ab714608411b4a3b13e58f463aee519024578e062e141dce20f", @@ -268,11 +276,11 @@ }, "certifi": { "hashes": [ - "sha256:0f0d56dc5a6ad56fd4ba36484d6cc34451e1c6548c61daad8c320169f91eddc7", - "sha256:c6c2e98f5c7869efca1f8916fed228dd91539f9f1b444c314c06eef02980c716" + "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082", + "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9" ], "markers": "python_version >= '3.6'", - "version": "==2023.5.7" + "version": "==2023.7.22" }, "charset-normalizer": { "hashes": [ @@ -357,11 +365,11 @@ }, "click": { "hashes": [ - "sha256:4be4b1af8d665c6d942909916d31a213a106800c47d0eeba73d34da3cbc11367", - "sha256:e576aa487d679441d7d30abb87e1b43d24fc53bffb8758443b1a9e1cee504548" + "sha256:48ee849951919527a045bfe3bf7baa8a959c423134e1a5b98c05c20ba75a1cbd", + "sha256:fa244bb30b3b5ee2cae3da8f55c9e5e0c0e86093306301fb418eb9dc40fbded5" ], "index": "pypi", - "version": "==8.1.5" + "version": "==8.1.6" }, "coverage": 
{ "hashes": [ diff --git a/docs/adrs/0001-springshare-source-naming.md b/docs/adrs/0001-springshare-source-naming.md new file mode 100644 index 0000000..ba76f8a --- /dev/null +++ b/docs/adrs/0001-springshare-source-naming.md @@ -0,0 +1,68 @@ +# 1. Springshare Source Naming + +Date: 2023-07-26 + +## Status + +Proposed + +## Context + +While working on adding two new sources to the TIMDEX pipeline, there was some discussion about, and constraints on, which source names should be used. + +Both data sources come from Springshare (Libguides and the AZ list of databases) and are retrieved via OAI-PMH. + +At this time, a source name is a string that accompanies the records throughout the TIMDEX pipeline: + * `transmogrifier`: defined in `transmogrifier.config.SOURCES` + * drives what transformer class to use + * saved to the TIMDEX record as a field + * used for S3 key (folder structure + filename) + * `timdex-pipeline-lambdas`: defined in `lambdas.config.INDEX_ALIASES` + * promotes a newly created index to specific aliases if configured + * used for S3 key (folder structure + filename) + * `timdex-index-manager`: defined in `tim.config.VALID_SOURCES` + * prevents indexing of sources if not present in this list + * used for index name created in OpenSearch + +Two distinct areas of consideration emerged when deciding on a source name: + * **meaningful** + * does it suggest what the original data source is? + * does it have value or meaning to end users of the API? + * **technically viable** + * does it have special characters? are they allowed? + * does it result in predictable S3 key naming conventions throughout? + * is it an allowed OpenSearch index name? + +## Decision + +The following source names were decided on: + * `libguides`: the Libguides data source + * oai set: `guides` + * `researchdatabases`: the AZ list databases + * oai set: `az` + +### `libguides` + +Pretty self-explanatory, satisfies both "meaningful" and "technically viable" requirements. 
+ +### `researchdatabases` + +This one was a bit thornier. + +It was suggested that `az` was not terribly helpful for understanding where the data came from, and was very unhelpful for end users. + +The first agreed-upon alternative was `research_databases`. `databases` was also floated, but could be ambiguous from the POV of an end user. + +For a variety of reasons, every attempt to keep these words distinct in the name failed: `research_databases`, `research-databases`, and `researchDatabases`. The reasons are outlined in [this Jira ticket comment](https://mitlibraries.atlassian.net/browse/TIMX-19?focusedCommentId=107019): + * `research_databases`: index name not correctly parsed in `timdex-index-manager` + * `research-databases`: files not saved correctly to S3 in `timdex-pipeline-lambdas` + * `researchDatabases`: not a valid OpenSearch index name + +And so, the final decided-upon source name was `researchdatabases`; no hyphens, underscores, or camelCasing. + +## Consequences + +The source name `researchdatabases` reflects some compromises that must be made for sources: + * if the source name is meaningful to end users, it may lose fidelity about the source origin + * if the source name is technically viable, it may lose some human readability + diff --git a/tests/fixtures/oai_dc/oaidc_record_all_fields.xml b/tests/fixtures/oai_dc/oaidc_record_all_fields.xml new file mode 100644 index 0000000..0b5ebf6 --- /dev/null +++ b/tests/fixtures/oai_dc/oaidc_record_all_fields.xml @@ -0,0 +1,26 @@ +<?xml version="1.0" encoding="UTF-8"?> +<records> + <record xmlns="http://www.openarchives.org/OAI/2.0/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> + <header> + <identifier>oai:libguides.com:guides/175846</identifier> + <datestamp>2023-05-31T19:49:21Z</datestamp> + <setSpec>guides</setSpec> + </header> + <metadata> + <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" + xmlns:dc="http://purl.org/dc/elements/1.1/" + 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd"> + <dc:title>Materials Science & Engineering</dc:title> + <dc:creator>Ye Li</dc:creator> + <dc:subject>Engineering</dc:subject> + <dc:subject>Science</dc:subject> + <dc:description>Useful databases and other research tips for materials science.</dc:description> + <dc:publisher>MIT Libraries</dc:publisher> + <dc:date>2008-06-19 17:55:27</dc:date> + <dc:identifier>https://libguides.mit.edu/materials</dc:identifier> + </oai_dc:dc> + </metadata> + </record> +</records> \ No newline at end of file diff --git a/tests/fixtures/oai_dc/oaidc_record_missing_required_fields.xml b/tests/fixtures/oai_dc/oaidc_record_missing_required_fields.xml new file mode 100644 index 0000000..31c4d9b --- /dev/null +++ b/tests/fixtures/oai_dc/oaidc_record_missing_required_fields.xml @@ -0,0 +1,24 @@ +<?xml version="1.0" encoding="UTF-8"?> +<records> + <record xmlns="http://www.openarchives.org/OAI/2.0/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> + <header> + <identifier>oai:libguides.com:guides/175846</identifier> + <datestamp>2023-05-31T19:49:21Z</datestamp> + <setSpec>guides</setSpec> + </header> + <metadata> + <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd"> + <dc:creator>Ye Li</dc:creator> + <dc:subject>Engineering</dc:subject> + <dc:subject>Science</dc:subject> + <dc:description>Useful databases and other research tips for materials science.</dc:description> + <dc:publisher>MIT Libraries</dc:publisher> + <dc:date>2008-06-19T17:55:27</dc:date> + </oai_dc:dc> + </metadata> + </record> +</records> \ No newline at end of file diff --git 
a/tests/fixtures/oai_dc/oaidc_record_optional_fields_blank.xml b/tests/fixtures/oai_dc/oaidc_record_optional_fields_blank.xml new file mode 100644 index 0000000..a516bff --- /dev/null +++ b/tests/fixtures/oai_dc/oaidc_record_optional_fields_blank.xml @@ -0,0 +1,26 @@ +<?xml version="1.0" encoding="UTF-8"?> +<records> + <record xmlns="http://www.openarchives.org/OAI/2.0/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> + <header> + <identifier>oai:libguides.com:guides/175846</identifier> + <datestamp>2023-05-31T19:49:21Z</datestamp> + <setSpec>guides</setSpec> + </header> + <metadata> + <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd"> + <dc:title>Materials Science & Engineering</dc:title> + <dc:creator></dc:creator> + <dc:subject></dc:subject> + <dc:subject></dc:subject> + <dc:description></dc:description> + <dc:publisher></dc:publisher> + <dc:date></dc:date> + <dc:identifier>https://libguides.mit.edu/materials</dc:identifier> + </oai_dc:dc> + </metadata> + </record> +</records> \ No newline at end of file diff --git a/tests/fixtures/oai_dc/oaidc_record_optional_fields_missing.xml b/tests/fixtures/oai_dc/oaidc_record_optional_fields_missing.xml new file mode 100644 index 0000000..d7acf57 --- /dev/null +++ b/tests/fixtures/oai_dc/oaidc_record_optional_fields_missing.xml @@ -0,0 +1,20 @@ +<?xml version="1.0" encoding="UTF-8"?> +<records> + <record xmlns="http://www.openarchives.org/OAI/2.0/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> + <header> + <identifier>oai:libguides.com:guides/175846</identifier> + <datestamp>2023-05-31T19:49:21Z</datestamp> + <setSpec>guides</setSpec> + </header> + <metadata> + <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" + 
xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd"> + <dc:title>Materials Science & Engineering</dc:title> + <dc:identifier>https://libguides.mit.edu/materials</dc:identifier> + </oai_dc:dc> + </metadata> + </record> +</records> \ No newline at end of file diff --git a/tests/fixtures/oai_dc/oaidc_record_valid_generic_date.xml b/tests/fixtures/oai_dc/oaidc_record_valid_generic_date.xml new file mode 100644 index 0000000..a378095 --- /dev/null +++ b/tests/fixtures/oai_dc/oaidc_record_valid_generic_date.xml @@ -0,0 +1,26 @@ +<?xml version="1.0" encoding="UTF-8"?> +<records> + <record xmlns="http://www.openarchives.org/OAI/2.0/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> + <header> + <identifier>oai:libguides.com:guides/175846</identifier> + <datestamp>2023-05-31T19:49:21Z</datestamp> + <setSpec>guides</setSpec> + </header> + <metadata> + <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd"> + <dc:title>Materials Science & Engineering</dc:title> + <dc:creator>Ye Li</dc:creator> + <dc:subject>Engineering</dc:subject> + <dc:subject>Science</dc:subject> + <dc:description>Useful databases and other research tips for materials science.</dc:description> + <dc:publisher>MIT Libraries</dc:publisher> + <dc:date>2008-06-19T17:55:27</dc:date> + <dc:identifier>https://libguides.mit.edu/materials</dc:identifier> + </oai_dc:dc> + </metadata> + </record> +</records> \ No newline at end of file diff --git a/tests/fixtures/oai_dc/springshare/libguides/libguides_record_all_fields.xml b/tests/fixtures/oai_dc/springshare/libguides/libguides_record_all_fields.xml new file mode 100644 
index 0000000..0b5ebf6 --- /dev/null +++ b/tests/fixtures/oai_dc/springshare/libguides/libguides_record_all_fields.xml @@ -0,0 +1,26 @@ +<?xml version="1.0" encoding="UTF-8"?> +<records> + <record xmlns="http://www.openarchives.org/OAI/2.0/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> + <header> + <identifier>oai:libguides.com:guides/175846</identifier> + <datestamp>2023-05-31T19:49:21Z</datestamp> + <setSpec>guides</setSpec> + </header> + <metadata> + <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd"> + <dc:title>Materials Science & Engineering</dc:title> + <dc:creator>Ye Li</dc:creator> + <dc:subject>Engineering</dc:subject> + <dc:subject>Science</dc:subject> + <dc:description>Useful databases and other research tips for materials science.</dc:description> + <dc:publisher>MIT Libraries</dc:publisher> + <dc:date>2008-06-19 17:55:27</dc:date> + <dc:identifier>https://libguides.mit.edu/materials</dc:identifier> + </oai_dc:dc> + </metadata> + </record> +</records> \ No newline at end of file diff --git a/tests/fixtures/oai_dc/springshare/libguides/libguides_record_optional_fields_blank.xml b/tests/fixtures/oai_dc/springshare/libguides/libguides_record_optional_fields_blank.xml new file mode 100644 index 0000000..a516bff --- /dev/null +++ b/tests/fixtures/oai_dc/springshare/libguides/libguides_record_optional_fields_blank.xml @@ -0,0 +1,26 @@ +<?xml version="1.0" encoding="UTF-8"?> +<records> + <record xmlns="http://www.openarchives.org/OAI/2.0/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> + <header> + <identifier>oai:libguides.com:guides/175846</identifier> + <datestamp>2023-05-31T19:49:21Z</datestamp> + <setSpec>guides</setSpec> + </header> + <metadata> + <oai_dc:dc 
xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd"> + <dc:title>Materials Science & Engineering</dc:title> + <dc:creator></dc:creator> + <dc:subject></dc:subject> + <dc:subject></dc:subject> + <dc:description></dc:description> + <dc:publisher></dc:publisher> + <dc:date></dc:date> + <dc:identifier>https://libguides.mit.edu/materials</dc:identifier> + </oai_dc:dc> + </metadata> + </record> +</records> \ No newline at end of file diff --git a/tests/fixtures/oai_dc/springshare/libguides/libguides_record_optional_fields_missing.xml b/tests/fixtures/oai_dc/springshare/libguides/libguides_record_optional_fields_missing.xml new file mode 100644 index 0000000..d7acf57 --- /dev/null +++ b/tests/fixtures/oai_dc/springshare/libguides/libguides_record_optional_fields_missing.xml @@ -0,0 +1,20 @@ +<?xml version="1.0" encoding="UTF-8"?> +<records> + <record xmlns="http://www.openarchives.org/OAI/2.0/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> + <header> + <identifier>oai:libguides.com:guides/175846</identifier> + <datestamp>2023-05-31T19:49:21Z</datestamp> + <setSpec>guides</setSpec> + </header> + <metadata> + <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd"> + <dc:title>Materials Science & Engineering</dc:title> + <dc:identifier>https://libguides.mit.edu/materials</dc:identifier> + </oai_dc:dc> + </metadata> + </record> +</records> \ No newline at end of file diff --git a/tests/fixtures/oai_dc/springshare/libguides/libguides_records.xml b/tests/fixtures/oai_dc/springshare/libguides/libguides_records.xml new file 
mode 100644 index 0000000..9fb574b --- /dev/null +++ b/tests/fixtures/oai_dc/springshare/libguides/libguides_records.xml @@ -0,0 +1,113 @@ +<?xml version="1.0" encoding="UTF-8"?> +<records> + <record xmlns="http://www.openarchives.org/OAI/2.0/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> + <header> + <identifier>oai:libguides.com:guides/175846</identifier> + <datestamp>2023-05-31T19:49:21Z</datestamp> + <setSpec>guides</setSpec> + </header> + <metadata> + <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd"> + <dc:title>Materials Science & Engineering</dc:title> + <dc:creator>Ye Li</dc:creator> + <dc:subject>Engineering</dc:subject> + <dc:subject>Science</dc:subject> + <dc:description>Useful databases and other research tips for materials science.</dc:description> + <dc:publisher>MIT Libraries</dc:publisher> + <dc:date>2008-06-19 17:55:27</dc:date> + <dc:identifier>https://libguides.mit.edu/materials</dc:identifier> + </oai_dc:dc> + </metadata> + </record> + <record xmlns="http://www.openarchives.org/OAI/2.0/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> + <header> + <identifier>oai:libguides.com:guides/175847</identifier> + <datestamp>2018-08-17T17:47:32Z</datestamp> + <setSpec>guides</setSpec> + </header> + <metadata> + <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd"> + <dc:title>NTSB Reports</dc:title> + <dc:creator>Barbara Williams</dc:creator> + <dc:subject>Engineering</dc:subject> + <dc:description>A guide to library research tools for aeronautics & 
astronautics.</dc:description> + <dc:publisher>MIT Libraries</dc:publisher> + <dc:date>2008-06-24 04:29:10</dc:date> + <dc:identifier>https://libguides.mit.edu/c.php?g=175847</dc:identifier> + </oai_dc:dc> + </metadata> + </record> + <record xmlns="http://www.openarchives.org/OAI/2.0/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> + <header> + <identifier>oai:libguides.com:guides/175849</identifier> + <datestamp>2023-06-27T18:37:44Z</datestamp> + <setSpec>guides</setSpec> + </header> + <metadata> + <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd"> + <dc:title>Country Data & Analysis</dc:title> + <dc:creator>Nicholas Albaugh</dc:creator> + <dc:subject>Business & management</dc:subject> + <dc:description>This is the subject guide for Country Data & Analysis</dc:description> + <dc:publisher>MIT Libraries</dc:publisher> + <dc:date>2008-06-26 00:51:04</dc:date> + <dc:identifier>https://libguides.mit.edu/country</dc:identifier> + </oai_dc:dc> + </metadata> + </record> + <record xmlns="http://www.openarchives.org/OAI/2.0/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> + <header> + <identifier>oai:libguides.com:guides/175853</identifier> + <datestamp>2023-06-12T15:30:12Z</datestamp> + <setSpec>guides</setSpec> + </header> + <metadata> + <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd"> + <dc:title>News, Newspapers, and Current Events</dc:title> + <dc:creator>Jennifer Greenleaf</dc:creator> + <dc:subject>Interdisciplinary</dc:subject> + <dc:description>This is the subject guide for 
News</dc:description> + <dc:publisher>MIT Libraries</dc:publisher> + <dc:date>2008-06-26 21:29:54</dc:date> + <dc:identifier>https://libguides.mit.edu/news</dc:identifier> + </oai_dc:dc> + </metadata> + </record> + <record xmlns="http://www.openarchives.org/OAI/2.0/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> + <header> + <identifier>oai:libguides.com:guides/175855</identifier> + <datestamp>2021-07-19T09:31:31Z</datestamp> + <setSpec>guides</setSpec> + </header> + <metadata> + <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd"> + <dc:title>Biography</dc:title> + <dc:creator>Tina Chan</dc:creator> + <dc:subject>Interdisciplinary</dc:subject> + <dc:publisher>MIT Libraries</dc:publisher> + <dc:date>2008-06-26 22:05:13</dc:date> + <dc:identifier>https://libguides.mit.edu/biography</dc:identifier> + </oai_dc:dc> + </metadata> + </record> +</records> \ No newline at end of file diff --git a/tests/fixtures/oai_dc/springshare/research_databases/research_databases_record_all_fields.xml b/tests/fixtures/oai_dc/springshare/research_databases/research_databases_record_all_fields.xml new file mode 100644 index 0000000..1ea1751 --- /dev/null +++ b/tests/fixtures/oai_dc/springshare/research_databases/research_databases_record_all_fields.xml @@ -0,0 +1,24 @@ +<?xml version="1.0" encoding="UTF-8"?> +<records> + <record xmlns="http://www.openarchives.org/OAI/2.0/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> + <header> + <identifier>oai:libguides.com:az/65257807</identifier> + <datestamp>2022-07-20T19:01:40Z</datestamp> + <setSpec>az</setSpec> + </header> + <metadata> + <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" + xmlns:dc="http://purl.org/dc/elements/1.1/" + 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd"> + <dc:title>Linguistics and Language Behavior Abstracts (LLBA)</dc:title> + <dc:subject>Humanities</dc:subject> + <dc:description>The most comprehensive index to articles in Linguistics and Language + Development and use.</dc:description> + <dc:date>2022-01-28 22:15:37</dc:date> + <dc:identifier>https://libguides.mit.edu/llba</dc:identifier> + </oai_dc:dc> + </metadata> + </record> +</records> \ No newline at end of file diff --git a/tests/fixtures/oai_dc/springshare/research_databases/research_databases_record_optional_fields_blank.xml b/tests/fixtures/oai_dc/springshare/research_databases/research_databases_record_optional_fields_blank.xml new file mode 100644 index 0000000..e193c0e --- /dev/null +++ b/tests/fixtures/oai_dc/springshare/research_databases/research_databases_record_optional_fields_blank.xml @@ -0,0 +1,23 @@ +<?xml version="1.0" encoding="UTF-8"?> +<records> + <record xmlns="http://www.openarchives.org/OAI/2.0/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> + <header> + <identifier>oai:libguides.com:az/65257807</identifier> + <datestamp>2022-07-20T19:01:40Z</datestamp> + <setSpec>az</setSpec> + </header> + <metadata> + <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd"> + <dc:title>Linguistics and Language Behavior Abstracts (LLBA)</dc:title> + <dc:subject></dc:subject> + <dc:description></dc:description> + <dc:date></dc:date> + <dc:identifier>https://libguides.mit.edu/llba</dc:identifier> + </oai_dc:dc> + </metadata> + </record> +</records> \ No newline at end of file diff --git 
a/tests/fixtures/oai_dc/springshare/research_databases/research_databases_record_optional_fields_missing.xml b/tests/fixtures/oai_dc/springshare/research_databases/research_databases_record_optional_fields_missing.xml new file mode 100644 index 0000000..b8e3a1d --- /dev/null +++ b/tests/fixtures/oai_dc/springshare/research_databases/research_databases_record_optional_fields_missing.xml @@ -0,0 +1,20 @@ +<?xml version="1.0" encoding="UTF-8"?> +<records> + <record xmlns="http://www.openarchives.org/OAI/2.0/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> + <header> + <identifier>oai:libguides.com:az/65257807</identifier> + <datestamp>2022-07-20T19:01:40Z</datestamp> + <setSpec>az</setSpec> + </header> + <metadata> + <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd"> + <dc:title>Linguistics and Language Behavior Abstracts (LLBA)</dc:title> + <dc:identifier>https://libguides.mit.edu/llba</dc:identifier> + </oai_dc:dc> + </metadata> + </record> +</records> \ No newline at end of file diff --git a/tests/fixtures/oai_dc/springshare/research_databases/research_databases_records.xml b/tests/fixtures/oai_dc/springshare/research_databases/research_databases_records.xml new file mode 100644 index 0000000..9fb574b --- /dev/null +++ b/tests/fixtures/oai_dc/springshare/research_databases/research_databases_records.xml @@ -0,0 +1,113 @@ +<?xml version="1.0" encoding="UTF-8"?> +<records> + <record xmlns="http://www.openarchives.org/OAI/2.0/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> + <header> + <identifier>oai:libguides.com:guides/175846</identifier> + <datestamp>2023-05-31T19:49:21Z</datestamp> + <setSpec>guides</setSpec> + </header> + <metadata> + <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" + 
xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd"> + <dc:title>Materials Science & Engineering</dc:title> + <dc:creator>Ye Li</dc:creator> + <dc:subject>Engineering</dc:subject> + <dc:subject>Science</dc:subject> + <dc:description>Useful databases and other research tips for materials science.</dc:description> + <dc:publisher>MIT Libraries</dc:publisher> + <dc:date>2008-06-19 17:55:27</dc:date> + <dc:identifier>https://libguides.mit.edu/materials</dc:identifier> + </oai_dc:dc> + </metadata> + </record> + <record xmlns="http://www.openarchives.org/OAI/2.0/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> + <header> + <identifier>oai:libguides.com:guides/175847</identifier> + <datestamp>2018-08-17T17:47:32Z</datestamp> + <setSpec>guides</setSpec> + </header> + <metadata> + <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd"> + <dc:title>NTSB Reports</dc:title> + <dc:creator>Barbara Williams</dc:creator> + <dc:subject>Engineering</dc:subject> + <dc:description>A guide to library research tools for aeronautics & astronautics.</dc:description> + <dc:publisher>MIT Libraries</dc:publisher> + <dc:date>2008-06-24 04:29:10</dc:date> + <dc:identifier>https://libguides.mit.edu/c.php?g=175847</dc:identifier> + </oai_dc:dc> + </metadata> + </record> + <record xmlns="http://www.openarchives.org/OAI/2.0/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> + <header> + <identifier>oai:libguides.com:guides/175849</identifier> + <datestamp>2023-06-27T18:37:44Z</datestamp> + <setSpec>guides</setSpec> + </header> + <metadata> + <oai_dc:dc 
xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd"> + <dc:title>Country Data & Analysis</dc:title> + <dc:creator>Nicholas Albaugh</dc:creator> + <dc:subject>Business & management</dc:subject> + <dc:description>This is the subject guide for Country Data & Analysis</dc:description> + <dc:publisher>MIT Libraries</dc:publisher> + <dc:date>2008-06-26 00:51:04</dc:date> + <dc:identifier>https://libguides.mit.edu/country</dc:identifier> + </oai_dc:dc> + </metadata> + </record> + <record xmlns="http://www.openarchives.org/OAI/2.0/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> + <header> + <identifier>oai:libguides.com:guides/175853</identifier> + <datestamp>2023-06-12T15:30:12Z</datestamp> + <setSpec>guides</setSpec> + </header> + <metadata> + <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd"> + <dc:title>News, Newspapers, and Current Events</dc:title> + <dc:creator>Jennifer Greenleaf</dc:creator> + <dc:subject>Interdisciplinary</dc:subject> + <dc:description>This is the subject guide for News</dc:description> + <dc:publisher>MIT Libraries</dc:publisher> + <dc:date>2008-06-26 21:29:54</dc:date> + <dc:identifier>https://libguides.mit.edu/news</dc:identifier> + </oai_dc:dc> + </metadata> + </record> + <record xmlns="http://www.openarchives.org/OAI/2.0/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> + <header> + <identifier>oai:libguides.com:guides/175855</identifier> + <datestamp>2021-07-19T09:31:31Z</datestamp> + <setSpec>guides</setSpec> + </header> + <metadata> + <oai_dc:dc 
xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd"> + <dc:title>Biography</dc:title> + <dc:creator>Tina Chan</dc:creator> + <dc:subject>Interdisciplinary</dc:subject> + <dc:publisher>MIT Libraries</dc:publisher> + <dc:date>2008-06-26 22:05:13</dc:date> + <dc:identifier>https://libguides.mit.edu/biography</dc:identifier> + </oai_dc:dc> + </metadata> + </record> +</records> \ No newline at end of file diff --git a/tests/fixtures/oai_dc/springshare/springshare_invalid_dates.xml b/tests/fixtures/oai_dc/springshare/springshare_invalid_dates.xml new file mode 100644 index 0000000..4bc2d8a --- /dev/null +++ b/tests/fixtures/oai_dc/springshare/springshare_invalid_dates.xml @@ -0,0 +1,12 @@ +<records> + <record xmlns:dc="http://purl.org/dc/elements/1.1/"> + <dc:date>200000000-01-01</dc:date> + </record> + <record xmlns:dc="http://purl.org/dc/elements/1.1/"> + <dc:date>Not found</dc:date> + </record> + <record xmlns:dc="http://purl.org/dc/elements/1.1/"> + <dc:date></dc:date> + </record> + <record xmlns:dc="http://purl.org/dc/elements/1.1/"></record> +</records> \ No newline at end of file diff --git a/tests/fixtures/oai_dc/springshare/springshare_record_missing_required_fields.xml b/tests/fixtures/oai_dc/springshare/springshare_record_missing_required_fields.xml new file mode 100644 index 0000000..31c4d9b --- /dev/null +++ b/tests/fixtures/oai_dc/springshare/springshare_record_missing_required_fields.xml @@ -0,0 +1,24 @@ +<?xml version="1.0" encoding="UTF-8"?> +<records> + <record xmlns="http://www.openarchives.org/OAI/2.0/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> + <header> + <identifier>oai:libguides.com:guides/175846</identifier> + <datestamp>2023-05-31T19:49:21Z</datestamp> + <setSpec>guides</setSpec> + </header> + <metadata> + 
<oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd"> + <dc:creator>Ye Li</dc:creator> + <dc:subject>Engineering</dc:subject> + <dc:subject>Science</dc:subject> + <dc:description>Useful databases and other research tips for materials science.</dc:description> + <dc:publisher>MIT Libraries</dc:publisher> + <dc:date>2008-06-19T17:55:27</dc:date> + </oai_dc:dc> + </metadata> + </record> +</records> \ No newline at end of file diff --git a/tests/fixtures/oai_dc/springshare/springshare_valid_dates.xml b/tests/fixtures/oai_dc/springshare/springshare_valid_dates.xml new file mode 100644 index 0000000..6f2362a --- /dev/null +++ b/tests/fixtures/oai_dc/springshare/springshare_valid_dates.xml @@ -0,0 +1,8 @@ +<records> + <record xmlns:dc="http://purl.org/dc/elements/1.1/"> + <dc:date>2000-01-01</dc:date> + </record> + <record xmlns:dc="http://purl.org/dc/elements/1.1/"> + <dc:date>January 1st, 2000</dc:date> + </record> +</records> \ No newline at end of file diff --git a/tests/test_oai_dc.py b/tests/test_oai_dc.py new file mode 100644 index 0000000..dbf8bdc --- /dev/null +++ b/tests/test_oai_dc.py @@ -0,0 +1,78 @@ +import transmogrifier.models as timdex +from transmogrifier.helpers import parse_xml_records +from transmogrifier.sources.oaidc import OaiDc + +FIXTURES_PREFIX = "tests/fixtures/oai_dc" + +BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX = timdex.TimdexRecord( + source="LibGuides", + source_link="https://libguides.mit.edu/guides/175846", + timdex_record_id="libguides:guides-175846", + title="Materials Science & Engineering", + citation="Materials Science & Engineering. libguides. 
" + "https://libguides.mit.edu/guides/175846", + content_type=["libguides"], + format="electronic resource", + identifiers=[ + timdex.Identifier(value="oai:libguides.com:guides/175846", kind="OAI-PMH") + ], +) + + +def test_oaidctransform_with_all_fields_transforms_correctly(): + input_records = parse_xml_records(f"{FIXTURES_PREFIX}/oaidc_record_all_fields.xml") + output_records = OaiDc("libguides", input_records) + assert next(output_records) == timdex.TimdexRecord( + source="LibGuides", + source_link="https://libguides.mit.edu/guides/175846", + timdex_record_id="libguides:guides-175846", + title="Materials Science & Engineering", + citation="Ye Li. Materials Science & Engineering. MIT Libraries. libguides. " + "https://libguides.mit.edu/guides/175846", + content_type=["libguides"], + contributors=[ + timdex.Contributor( + value="Ye Li", + kind="Creator", + ) + ], + format="electronic resource", + identifiers=[ + timdex.Identifier(value="oai:libguides.com:guides/175846", kind="OAI-PMH") + ], + publication_information=["MIT Libraries"], + subjects=[ + timdex.Subject( + value=["Engineering", "Science"], + kind="Subject scheme not provided", + ), + ], + summary=["Useful databases and other research tips for materials science."], + ) + + +def test_oaidc_transform_with_optional_fields_blank_transforms_correctly(): + input_records = parse_xml_records( + f"{FIXTURES_PREFIX}/oaidc_record_optional_fields_blank.xml" + ) + output_records = OaiDc("libguides", input_records) + assert next(output_records) == BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX + + +def test_oaidc_transform_with_optional_fields_missing_transforms_correctly(): + input_records = parse_xml_records( + f"{FIXTURES_PREFIX}/oaidc_record_optional_fields_missing.xml" + ) + output_records = OaiDc("libguides", input_records) + assert next(output_records) == BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX + + +def test_oaidc_generic_date(): + input_records = parse_xml_records( + 
f"{FIXTURES_PREFIX}/oaidc_record_valid_generic_date.xml" + ) + transformer_instance = OaiDc("libguides", input_records) + xml = next(transformer_instance.input_records) + assert transformer_instance.get_dates("test_source_record_id", xml) == [ + timdex.Date(kind=None, note=None, range=None, value="2008-06-19T17:55:27") + ] diff --git a/tests/test_springshare.py b/tests/test_springshare.py new file mode 100644 index 0000000..f6bdfd0 --- /dev/null +++ b/tests/test_springshare.py @@ -0,0 +1,210 @@ +import transmogrifier.models as timdex +from transmogrifier.helpers import parse_xml_records +from transmogrifier.sources.springshare import SpringshareOaiDc + +SPRINGSHARE_FIXTURES_PREFIX = "tests/fixtures/oai_dc/springshare" + +LIBGUIDES_FIXTURES_PREFIX = f"{SPRINGSHARE_FIXTURES_PREFIX}/libguides" + +RESEARCHDATABASES_FIXTURES_PREFIX = f"{SPRINGSHARE_FIXTURES_PREFIX}/research_databases" + + +LIBGUIDES_BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX = timdex.TimdexRecord( + source="LibGuides", + source_link="https://libguides.mit.edu/materials", + timdex_record_id="libguides:materials", + title="Materials Science & Engineering", + citation="Materials Science & Engineering. libguides. " + "https://libguides.mit.edu/materials", + content_type=["libguides"], + format="electronic resource", + identifiers=[ + timdex.Identifier(value="oai:libguides.com:guides/175846", kind="OAI-PMH") + ], + links=[ + timdex.Link( + url="https://libguides.mit.edu/materials", + kind="LibGuide URL", + text="LibGuide URL", + ) + ], +) + +RESEARCHDATABASES_BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX = timdex.TimdexRecord( + source="Research Databases", + source_link="https://libguides.mit.edu/llba", + timdex_record_id="researchdatabases:llba", + title="Linguistics and Language Behavior Abstracts (LLBA)", + citation="Linguistics and Language Behavior Abstracts (LLBA). researchdatabases. 
" + "https://libguides.mit.edu/llba", + content_type=["researchdatabases"], + format="electronic resource", + identifiers=[ + timdex.Identifier(value="oai:libguides.com:az/65257807", kind="OAI-PMH") + ], + links=[ + timdex.Link( + url="https://libguides.mit.edu/llba", + kind="Research Database URL", + text="Research Database URL", + ) + ], +) + + +def test_springshare_get_dates_valid(): + input_records = parse_xml_records( + f"{SPRINGSHARE_FIXTURES_PREFIX}/springshare_valid_dates.xml" + ) + transformer_instance = SpringshareOaiDc("libguides", input_records) + for xml in transformer_instance.input_records: + date_field_value = transformer_instance.get_dates("test_get_dates", xml) + assert date_field_value == [ + timdex.Date(kind=None, note=None, range=None, value="2000-01-01T00:00:00") + ] + + +def test_springshare_get_dates_invalid_logged_and_skipped(caplog): + input_records = parse_xml_records( + f"{SPRINGSHARE_FIXTURES_PREFIX}/springshare_invalid_dates.xml" + ) + transformer_instance = SpringshareOaiDc("libguides", input_records) + for xml in transformer_instance.input_records: + date_field_value = transformer_instance.get_dates("test_get_dates", xml) + assert date_field_value is None + assert "has a date that cannot be parsed" in caplog.text + + +def test_springshare_get_links_missing_identifier_logged_and_skipped(caplog): + input_records = parse_xml_records( + f"{SPRINGSHARE_FIXTURES_PREFIX}/springshare_record_missing_required_fields.xml" + ) + transformer_instance = SpringshareOaiDc("libguides", input_records) + for xml in transformer_instance.input_records: + links_field_value = transformer_instance.get_links("test_get_links", xml) + assert links_field_value is None + assert "has links that cannot be generated" in caplog.text + + +def test_libguide_transform_with_all_fields_transforms_correctly(): + input_records = parse_xml_records( + f"{LIBGUIDES_FIXTURES_PREFIX}/libguides_record_all_fields.xml" + ) + output_records = SpringshareOaiDc("libguides", 
input_records) + assert next(output_records) == timdex.TimdexRecord( + source="LibGuides", + source_link="https://libguides.mit.edu/materials", + timdex_record_id="libguides:materials", + title="Materials Science & Engineering", + citation="Ye Li. Materials Science & Engineering. MIT Libraries. libguides. " + "https://libguides.mit.edu/materials", + content_type=["libguides"], + contributors=[ + timdex.Contributor( + value="Ye Li", + kind="Creator", + ) + ], + dates=[ + timdex.Date(value="2008-06-19T17:55:27"), + ], + format="electronic resource", + identifiers=[ + timdex.Identifier(value="oai:libguides.com:guides/175846", kind="OAI-PMH") + ], + links=[ + timdex.Link( + url="https://libguides.mit.edu/materials", + kind="LibGuide URL", + text="LibGuide URL", + ) + ], + publication_information=["MIT Libraries"], + subjects=[ + timdex.Subject( + value=["Engineering", "Science"], + kind="Subject scheme not provided", + ), + ], + summary=["Useful databases and other research tips for materials science."], + ) + + +def test_libguides_transform_with_optional_fields_blank_transforms_correctly(): + input_records = parse_xml_records( + f"{LIBGUIDES_FIXTURES_PREFIX}/libguides_record_optional_fields_blank.xml" + ) + output_records = SpringshareOaiDc("libguides", input_records) + assert next(output_records) == LIBGUIDES_BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX + + +def test_libguides_transform_with_optional_fields_missing_transforms_correctly(): + input_records = parse_xml_records( + f"{LIBGUIDES_FIXTURES_PREFIX}/libguides_record_optional_fields_missing.xml" + ) + output_records = SpringshareOaiDc("libguides", input_records) + assert next(output_records) == LIBGUIDES_BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX + + +def test_research_databases_transform_with_all_fields_transforms_correctly(): + input_records = parse_xml_records( + f"{RESEARCHDATABASES_FIXTURES_PREFIX}/research_databases_record_all_fields.xml" + ) + output_records = SpringshareOaiDc("researchdatabases", input_records) 
+ assert next(output_records) == timdex.TimdexRecord( + source="Research Databases", + source_link="https://libguides.mit.edu/llba", + timdex_record_id="researchdatabases:llba", + title="Linguistics and Language Behavior Abstracts (LLBA)", + citation="Linguistics and Language Behavior Abstracts (LLBA). " + "researchdatabases. https://libguides.mit.edu/llba", + content_type=["researchdatabases"], + dates=[ + timdex.Date(value="2022-01-28T22:15:37"), + ], + format="electronic resource", + identifiers=[ + timdex.Identifier(value="oai:libguides.com:az/65257807", kind="OAI-PMH") + ], + links=[ + timdex.Link( + url="https://libguides.mit.edu/llba", + kind="Research Database URL", + text="Research Database URL", + ) + ], + subjects=[ + timdex.Subject( + value=["Humanities"], + kind="Subject scheme not provided", + ), + ], + summary=[ + "The most comprehensive index to articles in Linguistics and Language\n " + " Development and use." + ], + ) + + +def test_research_databases_transform_with_optional_fields_blank_transforms_correctly(): + input_records = parse_xml_records( + RESEARCHDATABASES_FIXTURES_PREFIX + + "/research_databases_record_optional_fields_blank.xml" + ) + output_records = SpringshareOaiDc("researchdatabases", input_records) + assert ( + next(output_records) + == RESEARCHDATABASES_BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX + ) + + +def test_research_databases_transform_with_optional_fields_missing_transforms_correctly(): + input_records = parse_xml_records( + RESEARCHDATABASES_FIXTURES_PREFIX + + "/research_databases_record_optional_fields_missing.xml" + ) + output_records = SpringshareOaiDc("researchdatabases", input_records) + assert ( + next(output_records) + == RESEARCHDATABASES_BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX + ) diff --git a/transmogrifier/config.py b/transmogrifier/config.py index c38317e..94fc58c 100644 --- a/transmogrifier/config.py +++ b/transmogrifier/config.py @@ -99,6 +99,16 @@ "base-url": 
"https://dataverse.harvard.edu/dataset.xhtml?persistentId=", "transform-class": "transmogrifier.sources.datacite.Datacite", }, + "libguides": { + "name": "LibGuides", + "base-url": "https://libguides.mit.edu/", + "transform-class": "transmogrifier.sources.springshare.SpringshareOaiDc", + }, + "researchdatabases": { + "name": "Research Databases", + "base-url": "https://libguides.mit.edu/", + "transform-class": "transmogrifier.sources.springshare.SpringshareOaiDc", + }, "whoas": { "name": "Woods Hole Open Access Server", "base-url": "https://darchive.mblwhoilibrary.org/handle/", diff --git a/transmogrifier/sources/oaidc.py b/transmogrifier/sources/oaidc.py new file mode 100644 index 0000000..df606c6 --- /dev/null +++ b/transmogrifier/sources/oaidc.py @@ -0,0 +1,179 @@ +import logging +from typing import Dict, List, Optional + +from bs4 import Tag + +import transmogrifier.models as timdex +from transmogrifier.helpers import validate_date +from transmogrifier.sources.transformer import Transformer + +logger = logging.getLogger(__name__) + + +class OaiDc(Transformer): + """ + Generic OAI DC transformer. + + While technically this transformer COULD return a valid TIMDEX model, it is + anticipated this will most likely get extended by a source-specific transformer. + """ + + def get_optional_fields(self, xml: Tag) -> Optional[dict]: + """ + Retrieve optional TIMDEX fields from a generic OAI DC XML record. 
+ + Args: + xml: A BeautifulSoup Tag representing a single OAI DC XML record + """ + + fields: dict = {} + + # extract source_record_id early for use and logging + source_record_id = self.get_source_record_id(xml) + + # alternate_titles: not set in this transformation + + # call_numbers: not set in this transformation + + # citation: uses fallback get_citation() method + + # content_type + fields["content_type"] = [self.source] + + # contents: not set in this transformation + + # contributors + for creator in [c for c in xml.find_all("dc:creator") if c.string]: + fields.setdefault("contributors", []).append( + timdex.Contributor( + value=str(creator.string), + kind="Creator", + ) + ) + + # dates + fields["dates"] = self.get_dates(source_record_id, xml) + + # edition: not set in this transformation + + # file_formats: not set in this transformation + + # format + fields["format"] = "electronic resource" + + # funding_information: not set in this transformation + + # holdings: not set in this transformation + + # identifiers + fields.setdefault("identifiers", []).append( + timdex.Identifier( + value=str(xml.header.identifier.string), + kind="OAI-PMH", + ) + ) + + # languages: not set in this transformation + + # links + fields["links"] = self.get_links(source_record_id, xml) + + # literary_form: not set in this transformation + + # locations: not set in this transformation + + # notes: not set in this transformation + + # numbering: not set in this transformation + + # physical_description: not set in this transformation + + # publication_frequency: not set in this transformation + + # publication_information + fields["publication_information"] = [ + str(p.string) for p in xml.find_all("dc:publisher") if p.string + ] or None + + # related_items: not set in this transformation + + # rights: not set in this transformation + + # subjects + subjects_dict: Dict[str, List[str]] = {} + for subject in xml.metadata.find_all("dc:subject", string=True): + 
+ subjects_dict.setdefault("Subject scheme not provided", []).append( + str(subject.string) + ) + fields["subjects"] = [ + timdex.Subject(value=value, kind=key) + for key, value in subjects_dict.items() + ] or None + + # summary + # built from the text of each non-empty dc:description element + for description in [d for d in xml.find_all("dc:description") if d.string]: + fields.setdefault("summary", []).append(str(description.string)) + + return fields + + def get_dates(self, source_record_id: str, xml: Tag) -> Optional[List[timdex.Date]]: + """ + Method to get TIMDEX "dates" field. This method broken out to allow subclasses + to override. + + Return list of timdex.Date's if valid and present. + + Args: + source_record_id: Source record id + xml: A BeautifulSoup Tag representing a single OAI DC XML record. + """ + + dates = [] + if date_elements := xml.find_all("dc:date", string=True): + for date in date_elements: + date_str = str(date.string.strip()) + if validate_date( + date_str, + source_record_id, + ): + dates.append(timdex.Date(value=date_str)) + return dates or None + + def get_links(self, source_record_id: str, xml: Tag) -> Optional[List[timdex.Link]]: + """ + Method to get TIMDEX "links" field. This method broken out to allow subclasses + to override. + + Args: + source_record_id: Source record id + xml: A BeautifulSoup Tag representing a single OAI DC XML record. + """ + + return None + + @classmethod + def get_main_titles(cls, xml: Tag) -> list[Tag]: + """ + Retrieve main title(s) from a generic OAI DC XML record. + + Overrides metaclass get_main_titles() method. + + Args: + xml: A BeautifulSoup Tag representing a single OAI DC XML record. + """ + return [t for t in xml.find_all("dc:title")] + + @classmethod + def get_source_record_id(cls, xml: Tag) -> str: + """ + Use OAI-PMH header identifier. It is anticipated this will likely need to get + overridden by subclasses with a meaningful identifier. + + Overrides metaclass get_source_record_id() method. 
+ + Args: + xml: A BeautifulSoup Tag representing a single OAI DC XML record. + """ + + return xml.header.identifier.string.split(":")[-1] diff --git a/transmogrifier/sources/springshare.py b/transmogrifier/sources/springshare.py new file mode 100644 index 0000000..df95299 --- /dev/null +++ b/transmogrifier/sources/springshare.py @@ -0,0 +1,104 @@ +import logging +from typing import List, Optional + +from bs4 import Tag +from dateutil.parser import ParserError +from dateutil.parser import parse as date_parser + +import transmogrifier.models as timdex +from transmogrifier.helpers import validate_date +from transmogrifier.sources.oaidc import OaiDc + +logger = logging.getLogger(__name__) + + +class SpringshareOaiDc(OaiDc): + """ + Springshare transformer that extends generic OAI DC transformer. + + This transformer is used for: + - libguides + - researchdatabases + """ + + def get_dates(self, source_record_id: str, xml: Tag) -> Optional[List[timdex.Date]]: + """ + Overrides OaiDc's default get_dates() logic for Springshare records. + + In Springshare records the dc:date format is "YYYY-MM-DD HH:MM:SS", which is not + readily acceptable by OpenSearch, because of space instead of "T". This method + parses the date and serializes to ISO format. + + Additionally, only a single date is expected. + + Args: + source_record_id: Source record id + xml: A BeautifulSoup Tag representing a single OAI DC XML record. 
+ """ + + dates = [] + if date := xml.find("dc:date", string=True): + try: + date_iso_str = date_parser(str(date.string).strip()).isoformat() + except ParserError as e: + logger.debug( + "Record ID %s has a date that cannot be parsed: %s", + source_record_id, + str(e), + ) + return None + if validate_date( + date_iso_str, + source_record_id, + ): + dates.append(timdex.Date(value=date_iso_str, kind=None)) + return dates or None + + def get_links(self, source_record_id: str, xml: Tag) -> Optional[List[timdex.Link]]: + """ + Overrides OaiDc's default get_links() logic for Springshare records. + + Args: + source_record_id: Source record id + xml: A BeautifulSoup Tag representing a single OAI DC XML record. + """ + + identifier = xml.find("dc:identifier") + if identifier is None or identifier.string is None: + logger.debug( + "Record ID %s has links that cannot be generated: missing dc:identifier", + source_record_id, + ) + return None + singular_source_name = self.source_name.rstrip("s") + return [ + timdex.Link( + kind=f"{singular_source_name} URL", + text=f"{singular_source_name} URL", + url=str(identifier.string), + ) + ] + + @classmethod + def get_source_record_id(cls, xml: Tag) -> str: + """ + Get the source record ID from a Springshare OAI DC XML record. + + Overrides metaclass get_source_record_id() method. + + The URL path of the Springshare resource is used as the source record id, which + results in a timdex record id like "libguides:materials" or + "researchdatabases:llba". This is preferred over the OAI-PMH identifier, a + numeric value, which cannot be used to construct an accessible source link. + + Libguides example: + "https://libguides.mit.edu/materials" -> "materials" + + AZ (Research Database) example: + "https://libguides.mit.edu/llba" -> "llba" + + Args: + xml: A BeautifulSoup Tag representing a single Springshare OAI DC XML record. + """ + + return str(xml.find("dc:identifier").string).split("/")[-1]