Skip to content

Commit

Permalink
Updates based on discussion in PR #205
Browse files Browse the repository at this point in the history
* Remove if not record block from __next__ method as it relates to an outdated method of flow control
*Rename _transform to transform and remove transform methods from JSONTransformer and XMLTransformer
* Refactor generate_derived_fields and get_optional_field_methods
* Remove try/except block from transform method
  • Loading branch information
ehanson8 committed Aug 5, 2024
1 parent c1d9e88 commit b1ec2e7
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 74 deletions.
14 changes: 0 additions & 14 deletions transmogrifier/sources/jsontransformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@
if TYPE_CHECKING:
from collections.abc import Iterator

import transmogrifier.models as timdex


class JSONTransformer(Transformer):
"""JSON transformer class."""
Expand All @@ -33,18 +31,6 @@ def parse_source_file(cls, source_file: str) -> Iterator[dict[str, JSON]]:
with jsonlines.Reader(smart_open.open(source_file, "r")) as records:
yield from records.iter(type=dict)

@final
def transform(self, source_record: dict[str, JSON]) -> timdex.TimdexRecord | None:
"""
Call Transformer._transform method to transform JSON record to TIMDEX record.
May not be overridden.
Args:
source_record: A JSON object representing a source record.
"""
return self._transform(source_record)

@classmethod
@abstractmethod
def get_main_titles(cls, source_record: dict[str, JSON]) -> list[str]:
Expand Down
78 changes: 32 additions & 46 deletions transmogrifier/sources/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,6 @@ def __next__(self) -> timdex.TimdexRecord:
except SkippedRecordEvent:
self.skipped_record_count += 1
continue
if not record:
self.skipped_record_count += 1
continue
self.transformed_record_count += 1
return record

Expand Down Expand Up @@ -225,12 +222,13 @@ def parse_source_file(cls, source_file: str) -> Iterator[dict[str, JSON] | Tag]:
"""

@final
def _transform(
self, source_record: dict[str, JSON] | Tag
) -> timdex.TimdexRecord | None:
def transform(self, source_record: dict[str, JSON] | Tag) -> timdex.TimdexRecord:
"""
Private method called for both XML and JSON transformations, where
all logic is shared except source_record type.
Transform source record into TimdexRecord instance.
Instantiates a TimdexRecord instance with required fields and runs fields methods
for optional fields. The optional field methods return values or exceptions that
prompt the __next__ method to skip the entire record.
May not be overridden.
Expand All @@ -247,30 +245,14 @@ def _transform(
timdex_record_id=self.get_timdex_record_id(source_record),
title=self.get_valid_title(source_record),
)

for field_name, field_method in self.get_optional_field_methods():
try:
setattr(timdex_record, field_name, field_method(source_record))
except: # noqa: E722
logger.exception("Exception occurred while setting '%s'", field_name)
raise SkippedRecordEvent from None
setattr(timdex_record, field_name, field_method(source_record))

self.generate_derived_fields(timdex_record)

return timdex_record

@abstractmethod
def transform(
self, source_record: dict[str, JSON] | Tag
) -> timdex.TimdexRecord | None:
"""
Call Transformer._transform method to transform source record to TIMDEX record.
Must be overridden by format subclasses.
Args:
source_record: A single source record.
"""

@classmethod
@abstractmethod
def get_main_titles(cls, source_record: dict[str, JSON] | Tag) -> list[str]:
Expand Down Expand Up @@ -339,12 +321,10 @@ def get_optional_field_methods(self) -> Iterator[tuple[str, Callable]]:
May not be overridden.
"""
for field_name in [
field_name
for field_name in timdex.TimdexRecord.get_optional_field_names()
if hasattr(self, f"get_{field_name}")
]:
yield field_name, getattr(self, f"get_{field_name}")
for field_name in timdex.TimdexRecord.get_optional_field_names():
field_method = getattr(self, f"get_{field_name}", None)
if field_method:
yield field_name, getattr(self, f"get_{field_name}")

@final
def generate_derived_fields(
Expand All @@ -353,28 +333,34 @@ def generate_derived_fields(
"""
Generate field values based on existing values in TIMDEX record.
This method sets or extends the following fields:
- dates: list[Date]
- locations: list[Location]
- citation: str
- content_type: str
May not be overridden.
"""
derived_dates = list(self.create_dates_from_publishers(timdex_record))
if not timdex_record.dates:
timdex_record.dates = derived_dates if derived_dates else None
else:
timdex_record.dates.extend(derived_dates)
# dates
derived_dates = timdex_record.dates or []
derived_dates.extend(self.create_dates_from_publishers(timdex_record))
timdex_record.dates = derived_dates or None

derived_locations: list[timdex.Location] = []
# locations
derived_locations = timdex_record.locations or []
derived_locations.extend(self.create_locations_from_publishers(timdex_record))
derived_locations.extend(
self.create_locations_from_spatial_subjects(timdex_record)
)
if not timdex_record.locations:
timdex_record.locations = derived_locations if derived_locations else None
else:
timdex_record.locations.extend(derived_locations)
timdex_record.locations = derived_locations or None

# citation
timdex_record.citation = timdex_record.citation or generate_citation(
timdex_record
)

if timdex_record.citation is None:
timdex_record.citation = generate_citation(timdex_record)
if timdex_record.content_type is None:
timdex_record.content_type = ["Not specified"]
# content type
timdex_record.content_type = timdex_record.content_type or ["Not specified"]

return timdex_record

Expand Down
14 changes: 0 additions & 14 deletions transmogrifier/sources/xmltransformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@
if TYPE_CHECKING:
from collections.abc import Iterator

import transmogrifier.models as timdex


class XMLTransformer(Transformer):
"""XML transformer class."""
Expand Down Expand Up @@ -44,18 +42,6 @@ def parse_source_file(cls, source_file: str) -> Iterator[Tag]:
yield record
element.clear()

@final
def transform(self, source_record: Tag) -> timdex.TimdexRecord | None:
"""
Call Transformer._transform method to transform XML record to TIMDEX record.
May not be overridden.
Args:
source_record: A BeautifulSoup Tag representing a single XML record.
"""
return self._transform(source_record)

@classmethod
def get_main_titles(cls, _source_record: Tag) -> list[Tag]:
"""
Expand Down

0 comments on commit b1ec2e7

Please sign in to comment.