diff --git a/tests/sources/xml/test_marc.py b/tests/sources/xml/test_marc.py
index ca80504..cc5686c 100644
--- a/tests/sources/xml/test_marc.py
+++ b/tests/sources/xml/test_marc.py
@@ -1236,6 +1236,116 @@ def test_get_holdings_transforms_correctly_if_fields_missing():
assert Marc.get_holdings(source_record) is None
+def test_get_identifiers_success():
+ source_record = create_marc_source_record_stub(
+ datafield_insert=(
+ """
+
+ 2005022317
+
+
+ 9781250185969
+ hardcover
+
+
+ 0033-0736
+
+
+ 10.1596/978-0-8213-7468-9
+ doi
+
+
+ (OCoLC)1312285564
+
+ """
+ )
+ )
+ assert Marc.get_identifiers(source_record) == [
+ timdex.Identifier(value="2005022317", kind="LCCN"),
+ timdex.Identifier(value="9781250185969. hardcover", kind="ISBN"),
+ timdex.Identifier(value="0033-0736", kind="ISSN"),
+ timdex.Identifier(
+ value="10.1596/978-0-8213-7468-9. doi", kind="Other Identifier"
+ ),
+ timdex.Identifier(value="1312285564", kind="OCLC Number"),
+ ]
+
+
+def test_get_identifiers_transforms_correctly_if_fields_blank():
+ source_record = create_marc_source_record_stub(
+ datafield_insert=(
+ """
+
+
+
+ """
+ )
+ )
+ assert Marc.get_identifiers(source_record) is None
+
+
+def test_get_identifiers_transforms_correctly_if_fields_missing():
+ source_record = create_marc_source_record_stub()
+ assert Marc.get_identifiers(source_record) is None
+
+
+def test_get_languages_success():
+ source_record = create_marc_source_record_stub(
+ datafield_insert=(
+ """
+
+ eng
+ fre
+
+
+ Sung in French.
+
+ """
+ )
+ )
+ assert Marc.get_languages(source_record) == [
+ "No linguistic content",
+ "English",
+ "French",
+ "Sung in French",
+ ]
+
+
+def test_get_languages_transforms_correctly_if_char_positions_blank():
+ source_record = create_marc_source_record_stub(
+ control_field_insert=(
+ '170906s2016 fr mun| o e d'
+ )
+ )
+ assert Marc.get_languages(source_record) is None
+
+
+def test_get_languages_transforms_correctly_if_fields_blank():
+ source_record = create_marc_source_record_stub(
+ '170906s2016 fr mun| o e d',
+ datafield_insert=(
+ """
+
+
+
+ """
+ ),
+ )
+ assert Marc.get_languages(source_record) is None
+
+
+def test_get_literary_form_success():
+ source_record = create_marc_source_record_stub()
+ assert Marc.get_literary_form(source_record) == "Nonfiction"
+
+
+def test_get_literary_form_transforms_correctly_if_char_positions_blank():
+ source_record = create_marc_source_record_stub(
+ leader_field_insert="03282n 2200721Ki 4500"
+ )
+ assert Marc.get_literary_form(source_record) is None
+
+
def test_get_links_success():
source_record = create_marc_source_record_stub(
datafield_insert=(
@@ -1298,6 +1408,148 @@ def test_get_links_transforms_correctly_if_fields_missing():
assert Marc.get_links(source_record) is None
+def test_get_locations_success():
+ source_record = create_marc_source_record_stub(
+ datafield_insert=(
+ """
+
+ Germany
+
+
+ Africa
+ Nile River
+ Sixth Cataract.
+
+ """
+ )
+ )
+ assert Marc.get_locations(source_record) == [
+ timdex.Location(value="France", kind="Place of Publication"),
+ timdex.Location(value="Germany", kind="Geographic Name"),
+ timdex.Location(
+ value="Africa - Nile River - Sixth Cataract", kind="Hierarchical Place Name"
+ ),
+ ]
+
+
+def test_marc_get_locations_transforms_correctly_if_char_positions_blank():
+ source_record = create_marc_source_record_stub(
+ control_field_insert=(
+ """
+ 170906s2016 mun| o e zxx d
+ """
+ )
+ )
+ assert Marc.get_locations(source_record) is None
+
+
+def test_marc_get_locations_transforms_correctly_if_fields_blank():
+ source_record = create_marc_source_record_stub(
+ control_field_insert=(
+ """
+ 170906s2016 mun| o e zxx d
+ """
+ ),
+ datafield_insert=(
+ """
+
+
+
+ """
+ ),
+ )
+ assert Marc.get_locations(source_record) is None
+
+
+def test_get_notes_success():
+ source_record = create_marc_source_record_stub(
+ datafield_insert=(
+ """
+
+ arranged by the Arts Council of Great Britain.
+
+
+ Opera in 5 acts.
+
+
+ Thesis (D.SC.)--University of London.
+
+
+ Includes bibliographical references and index.
+
+
+ Producer, Toygun Kirali.
+
+
+ Lamoureux Concerts Orchestra ; Igor Markevitch, conductor.
+
+
+ Suspended publication 1944-52.
+
+
+ Canada.
+
+
+ Electronic reproduction.
+ New York :
+ Springer,
+ 2008.
+
+
+ Originally published
+ New York : Garland, 1987.
+
+
+ Hard copy version record.
+
+
+ Rare Book copy: Advance copy notice inserted.
+
+ """
+ )
+ )
+ assert Marc.get_notes(source_record) == [
+ timdex.Note(
+ value=["arranged by the Arts Council of Great Britain"],
+ kind="Title Statement of Responsibility",
+ ),
+ timdex.Note(value=["Opera in 5 acts"], kind="General Note"),
+ timdex.Note(
+ value=["Thesis (D.SC.)--University of London"], kind="Dissertation Note"
+ ),
+ timdex.Note(
+ value=["Includes bibliographical references and index"],
+ kind="Bibliography Note",
+ ),
+ timdex.Note(
+ value=["Producer, Toygun Kirali"], kind="Creation/Production Credits Note"
+ ),
+ timdex.Note(
+ value=["Lamoureux Concerts Orchestra ; Igor Markevitch, conductor"],
+ kind="Participant or Performer Note",
+ ),
+ timdex.Note(
+ value=["Suspended publication 1944-52"], kind="Numbering Peculiarities Note"
+ ),
+ timdex.Note(value=["Canada"], kind="Geographic Coverage Note"),
+ timdex.Note(
+ value=["Electronic reproduction. New York : Springer, 2008"],
+ kind="Reproduction Note",
+ ),
+ timdex.Note(
+ value=["Originally published New York : Garland, 1987"],
+ kind="Original Version Note",
+ ),
+ timdex.Note(
+ value=["Hard copy version record"],
+ kind="Source of Description Note",
+ ),
+ timdex.Note(
+ value=["Rare Book copy: Advance copy notice inserted"], kind="Local Note"
+ ),
+ ]
+
+
def test_marc_record_missing_leader_skips_record(caplog):
marc_xml_records = Marc.parse_source_file(
"tests/fixtures/marc/marc_record_missing_leader.xml"
diff --git a/transmogrifier/sources/xml/marc.py b/transmogrifier/sources/xml/marc.py
index 9f85ad4..dfb72e7 100644
--- a/transmogrifier/sources/xml/marc.py
+++ b/transmogrifier/sources/xml/marc.py
@@ -42,8 +42,6 @@ def get_optional_fields(self, source_record: Tag) -> dict | None:
"""
fields: dict = {}
- source_record_id = self.get_source_record_id(source_record)
-
# alternate titles
fields["alternate_titles"] = self.get_alternate_titles(source_record)
@@ -77,220 +75,22 @@ def get_optional_fields(self, source_record: Tag) -> dict | None:
fields["holdings"] = self.get_holdings(source_record)
# identifiers
- identifier_marc_fields = [
- {
- "tag": "010",
- "subfields": "a",
- "kind": "LCCN",
- },
- {
- "tag": "020",
- "subfields": "aq",
- "kind": "ISBN",
- },
- {
- "tag": "022",
- "subfields": "a",
- "kind": "ISSN",
- },
- {
- "tag": "024",
- "subfields": "aq2",
- "kind": "Other Identifier",
- },
- {
- "tag": "035",
- "subfields": "a",
- "kind": "OCLC Number",
- },
- ]
- for identifier_marc_field in identifier_marc_fields:
- for datafield in source_record.find_all(
- "datafield", tag=identifier_marc_field["tag"]
- ):
- if identifier_value := (
- self.create_subfield_value_string_from_datafield(
- datafield,
- identifier_marc_field["subfields"],
- ". ",
- )
- ):
- fields.setdefault("identifiers", []).append(
- timdex.Identifier(
- value=identifier_value.strip().replace("(OCoLC)", ""),
- kind=identifier_marc_field["kind"],
- )
- )
+ fields["identifiers"] = self.get_identifiers(source_record)
# languages
- languages = []
-
- # Get language codes
- language_codes = []
- if fixed_language_value := self._get_control_field(source_record)[35:38]:
- language_codes.append(fixed_language_value)
- for field_041 in source_record.find_all("datafield", tag="041"):
- language_codes.extend(
- self.create_subfield_value_list_from_datafield(field_041, "abdefghjkmn")
- )
-
- # Crosswalk codes to names
- for language_code in list(dict.fromkeys(language_codes)):
- if language_name := Marc.loc_crosswalk_code_to_name(
- language_code, self.language_code_crosswalk, source_record_id, "language"
- ):
- languages.append(language_name) # noqa: PERF401
-
- # Add language notes
- for field_546 in source_record.find_all("datafield", tag="546"):
- if language_note := field_546.find("subfield", code="a", string=True):
- languages.append(str(language_note.string).rstrip(" .")) # noqa: PERF401
-
- fields["languages"] = list(dict.fromkeys(languages)) or None
+ fields["languages"] = self.get_languages(source_record)
# links - see also: holdings field for electronic portfolio items
fields["links"] = self.get_links(source_record)
# literary_form
- # Literary form is applicable to Book (BK) material configurations and indicated
- # by leader "Type of Record" position = "Language Material" or "Manuscript
- # language material" and "Bibliographic level" position =
- # "Monographic component part," "Collection," "Subunit," or "Monograph/Item."
- if (
- self._get_leader_field(source_record)[6:7] in "at"
- and self._get_leader_field(source_record)[7:8] in "acdm"
- ):
- if self._get_control_field(source_record)[33:34] in "0se":
- fields["literary_form"] = "Nonfiction"
- elif self._get_control_field(source_record)[33:34]:
- fields["literary_form"] = "Fiction"
+ fields["literary_form"] = self.get_literary_form(source_record)
# locations
-
- # Get place of publication from 008 field code
- if (fixed_location_code := self._get_control_field(source_record)[15:17]) and (
- location_name := Marc.loc_crosswalk_code_to_name(
- fixed_location_code,
- self.country_code_crosswalk,
- source_record_id,
- "country",
- )
- ):
- fields.setdefault("locations", []).append(
- timdex.Location(value=location_name, kind="Place of Publication")
- )
-
- # Get other locations
- location_marc_fields = [
- {
- "tag": "751",
- "subfields": "a",
- "kind": "Geographic Name",
- },
- {
- "tag": "752",
- "subfields": "abcdefgh",
- "kind": "Hierarchical Place Name",
- },
- ]
- for location_marc_field in location_marc_fields:
- for datafield in source_record.find_all(
- "datafield", tag=location_marc_field["tag"]
- ):
- if location_value := (
- self.create_subfield_value_string_from_datafield(
- datafield,
- location_marc_field["subfields"],
- " - ",
- )
- ):
- fields.setdefault("locations", []).append(
- timdex.Location(
- value=location_value.rstrip(" .,/)"),
- kind=location_marc_field["kind"],
- )
- )
+ fields["locations"] = self.get_locations(source_record)
# notes
- note_marc_fields = [
- {
- "tag": "245",
- "subfields": "c",
- "kind": "Title Statement of Responsibility",
- },
- {
- "tag": "500",
- "subfields": "a",
- "kind": "General Note",
- },
- {
- "tag": "502",
- "subfields": "abcdg",
- "kind": "Dissertation Note",
- },
- {
- "tag": "504",
- "subfields": "a",
- "kind": "Bibliography Note",
- },
- {
- "tag": "508",
- "subfields": "a",
- "kind": "Creation/Production Credits Note",
- },
- {
- "tag": "511",
- "subfields": "a",
- "kind": "Participant or Performer Note",
- },
- {
- "tag": "515",
- "subfields": "a",
- "kind": "Numbering Peculiarities Note",
- },
- {
- "tag": "522",
- "subfields": "a",
- "kind": "Geographic Coverage Note",
- },
- {
- "tag": "533",
- "subfields": "abcdefmn",
- "kind": "Reproduction Note",
- },
- {
- "tag": "534",
- "subfields": "abcefklmnoptxz",
- "kind": "Original Version Note",
- },
- {
- "tag": "588",
- "subfields": "a",
- "kind": "Source of Description Note",
- },
- {
- "tag": "590",
- "subfields": "a",
- "kind": "Local Note",
- },
- ]
- for note_marc_field in note_marc_fields:
- for datafield in source_record.find_all(
- "datafield", tag=note_marc_field["tag"]
- ):
- if note_value := (
- self.create_subfield_value_string_from_datafield(
- datafield,
- note_marc_field["subfields"],
- " ",
- )
- ):
- fields.setdefault("notes", []).append(
- timdex.Note(
- value=[note_value.rstrip(" .")],
- kind=note_marc_field["kind"],
- )
- )
+ fields["notes"] = self.get_notes(source_record)
# numbering
@@ -850,6 +650,104 @@ def _get_holdings_electronic_items(
note=holding_note,
)
+ @classmethod
+ def get_identifiers(cls, source_record: Tag) -> list[timdex.Identifier] | None:
+ identifiers = []
+ identifier_marc_fields = [
+ {
+ "tag": "010",
+ "subfields": "a",
+ "kind": "LCCN",
+ },
+ {
+ "tag": "020",
+ "subfields": "aq",
+ "kind": "ISBN",
+ },
+ {
+ "tag": "022",
+ "subfields": "a",
+ "kind": "ISSN",
+ },
+ {
+ "tag": "024",
+ "subfields": "aq2",
+ "kind": "Other Identifier",
+ },
+ {
+ "tag": "035",
+ "subfields": "a",
+ "kind": "OCLC Number",
+ },
+ ]
+ for identifier_marc_field in identifier_marc_fields:
+ identifiers.extend(
+ [
+ timdex.Identifier(
+ value=identifier.strip().replace("(OCoLC)", ""),
+ kind=identifier_marc_field["kind"],
+ )
+ for datafield in source_record.find_all(
+ "datafield", tag=identifier_marc_field["tag"]
+ )
+ if (
+ identifier := (
+ cls.create_subfield_value_string_from_datafield(
+ datafield,
+ identifier_marc_field["subfields"],
+ ". ",
+ )
+ )
+ )
+ ]
+ )
+ return identifiers or None
+
+ @classmethod
+ def get_languages(cls, source_record: Tag) -> list[str] | None:
+
+ languages = []
+ language_codes: list[str] = []
+
+ # get language codes from control field 008/35-37
+ if fixed_language_value := cls._get_control_field(source_record)[35:38].strip():
+ language_codes.append(fixed_language_value)
+
+ # get language codes from data field 041
+ for datafield in source_record.find_all("datafield", tag="041"):
+ language_codes.extend(
+ cls.create_subfield_value_list_from_datafield(datafield, "abdefghjkmn")
+ )
+
+ languages.extend(cls._get_language_names(source_record, language_codes))
+ languages.extend(cls._get_language_notes(source_record))
+ return languages or None
+
+ @classmethod
+ def _get_language_names(
+ cls, source_record: Tag, language_codes: list[str]
+ ) -> list[str]:
+ return [
+ language_name
+ for language_code in list(dict.fromkeys(language_codes))
+ if (
+ language_name := cls.loc_crosswalk_code_to_name(
+ language_code,
+ cls.language_code_crosswalk,
+ cls.get_source_record_id(source_record),
+ "language",
+ )
+ )
+ ]
+
+ @classmethod
+ def _get_language_notes(cls, source_record: Tag) -> list[str]:
+ return [
+ str(language_note.string).rstrip(" .")
+ for datafield in source_record.find_all("datafield", tag="546")
+ if (language_note := datafield.find("subfield", code="a", string=True))
+ ]
+
@classmethod
def get_links(cls, source_record: Tag) -> list[timdex.Link] | None:
links: list[timdex.Link] = []
@@ -895,6 +793,170 @@ def _get_links_holdings_electronic_items(
text=holding_collection,
)
+ @classmethod
+ def get_literary_form(cls, source_record: Tag) -> str | None:
+ """Retrieve literary form for book materials.
+
+ Book materials configurations are used when Leader/06 (Type of record)
+ contains code a (Language material) or t (Manuscript language material)
+ and Leader/07 (Bibliographic level) contains code
+ a (Monographic component part), c (Collection), d (Subunit),
+ or m (Monograph).
+ """
+ leader_field = cls._get_leader_field(source_record)
+ control_field = cls._get_control_field(source_record)
+ if leader_field[6] in "at" and leader_field[7] in "acdm":
+ if control_field[33] in "0se":
+ return "Nonfiction"
+ return "Fiction"
+ return None
+
+ @classmethod
+ def get_locations(cls, source_record: Tag) -> list[timdex.Location] | None:
+ locations = []
+ location_marc_fields = [
+ {
+ "tag": "751",
+ "subfields": "a",
+ "kind": "Geographic Name",
+ },
+ {
+ "tag": "752",
+ "subfields": "abcdefgh",
+ "kind": "Hierarchical Place Name",
+ },
+ ]
+ # get locations (place of publication) from control field 008/15-17
+ if place_of_publication := cls._get_location_publication(source_record):
+ locations.append(place_of_publication)
+
+ # get locations from data fields
+ for location_marc_field in location_marc_fields:
+ locations.extend(
+ [
+ timdex.Location(
+ value=location_value.rstrip(" .,/)"),
+ kind=location_marc_field["kind"],
+ )
+ for datafield in source_record.find_all(
+ "datafield", tag=location_marc_field["tag"]
+ )
+ if (
+ location_value := (
+ cls.create_subfield_value_string_from_datafield(
+ datafield,
+ location_marc_field["subfields"],
+ " - ",
+ )
+ )
+ )
+ ]
+ )
+ return locations or None
+
+ @classmethod
+ def _get_location_publication(cls, source_record: Tag) -> timdex.Location | None:
+ if (
+ fixed_location_code := cls._get_control_field(source_record)[15:18].strip()
+ ) and (
+ location_name := cls.loc_crosswalk_code_to_name(
+ code=fixed_location_code,
+ crosswalk=cls.country_code_crosswalk,
+ record_id=cls.get_source_record_id(source_record),
+ code_type="country",
+ )
+ ):
+ return timdex.Location(value=location_name, kind="Place of Publication")
+ return None
+
+ @classmethod
+ def get_notes(cls, source_record: Tag) -> list[timdex.Note] | None:
+ notes = []
+ note_marc_fields = [
+ {
+ "tag": "245",
+ "subfields": "c",
+ "kind": "Title Statement of Responsibility",
+ },
+ {
+ "tag": "500",
+ "subfields": "a",
+ "kind": "General Note",
+ },
+ {
+ "tag": "502",
+ "subfields": "abcdg",
+ "kind": "Dissertation Note",
+ },
+ {
+ "tag": "504",
+ "subfields": "a",
+ "kind": "Bibliography Note",
+ },
+ {
+ "tag": "508",
+ "subfields": "a",
+ "kind": "Creation/Production Credits Note",
+ },
+ {
+ "tag": "511",
+ "subfields": "a",
+ "kind": "Participant or Performer Note",
+ },
+ {
+ "tag": "515",
+ "subfields": "a",
+ "kind": "Numbering Peculiarities Note",
+ },
+ {
+ "tag": "522",
+ "subfields": "a",
+ "kind": "Geographic Coverage Note",
+ },
+ {
+ "tag": "533",
+ "subfields": "abcdefmn",
+ "kind": "Reproduction Note",
+ },
+ {
+ "tag": "534",
+ "subfields": "abcefklmnoptxz",
+ "kind": "Original Version Note",
+ },
+ {
+ "tag": "588",
+ "subfields": "a",
+ "kind": "Source of Description Note",
+ },
+ {
+ "tag": "590",
+ "subfields": "a",
+ "kind": "Local Note",
+ },
+ ]
+ for note_marc_field in note_marc_fields:
+ notes.extend(
+ [
+ timdex.Note(
+ value=[note_value.rstrip(" .")],
+ kind=note_marc_field["kind"],
+ )
+ for datafield in source_record.find_all(
+ "datafield", tag=note_marc_field["tag"]
+ )
+ if (
+ note_value := (
+ cls.create_subfield_value_string_from_datafield(
+ datafield,
+ note_marc_field["subfields"],
+ " ",
+ )
+ )
+ )
+ ]
+ )
+ return notes or None
+
@staticmethod
def get_main_titles(xml: Tag) -> list[str]:
"""