Skip to content

Commit

Permalink
Fix premis:originalName parsing, refs #128
Browse files Browse the repository at this point in the history
The previously assumed path of `%transferDirectory%objects` is not valid
for BagIt transfers, which have a base directory of
`%transferDirectory%data`. This assumption created DIP file names with
their first three characters truncated, when the original transfer was a
BagIt bag.
  • Loading branch information
djjuhasz committed Nov 15, 2021
1 parent 2ee73fa commit e5a9f80
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 9 deletions.
39 changes: 31 additions & 8 deletions aips/create_dip.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,10 +239,18 @@ def create_dip(aip_dir, aip_uuid, output_dir, mets_type, dip_type):

premis = techmd.contents.document
update_premis_ns(premis, namespaces, premis_map)
original_name = get_original_name(premis, namespaces)

# Move original file with original name and create parent folders
dip_file_path = os.path.join(to_zip_dir, original_name[27:])
original_name = get_premis_original_name(premis, namespaces)
if not original_name:
LOGGER.warning("Could not get original file name from premis:originalName")
continue

original_relpath = get_original_relpath(original_name)
if not original_relpath:
continue

# Move original file with original file name and create parent folders
dip_file_path = os.path.join(to_zip_dir, original_relpath)
dip_dir_path = os.path.dirname(dip_file_path)
if not os.path.exists(dip_dir_path):
os.makedirs(dip_dir_path)
Expand Down Expand Up @@ -411,17 +419,32 @@ def update_premis_ns(premis, namespaces, premis_map):
)


def get_original_name(premis, namespaces):
"""Get original filename from PREMIS record"""
def get_premis_original_name(premis, namespaces):
"""Get the original file name from a premis:originalName"""

original_name = premis.findtext("premis:originalName", namespaces=namespaces)
if not original_name:
LOGGER.warning("premis:originalName could not be found")
string_start = "%transferDirectory%objects/"
if original_name[:27] != string_start:
LOGGER.warning("premis:originalName not starting with %s", string_start)
return None

return original_name


def get_original_relpath(original_name):
    """Get the relative file path from a premis:originalName

    The leading transfer directory prefix is stripped from ``original_name``.
    Two prefixes are recognised: ``%transferDirectory%objects/`` and
    ``%transferDirectory%data/``.

    :param original_name: text of a premis:originalName element; must be a
        string (callers are expected to filter out missing names first).
    :returns: the portion of ``original_name`` after the matched prefix
        (an empty string when the name is exactly the prefix), or ``None``
        after logging a warning when no recognised prefix matches.
    """
    path_prefixes = ["%transferDirectory%objects/", "%transferDirectory%data/"]
    for prefix in path_prefixes:
        if original_name.startswith(prefix):
            # Slice by the matched prefix's own length rather than a fixed
            # offset, since the accepted prefixes differ in length.
            return original_name[len(prefix) :]

    LOGGER.warning(
        '"%s" has an invalid path prefix, it must be one of ("%s")',
        original_name,
        '", "'.join(path_prefixes),
    )
    # Explicit so every exit of the function returns a value.
    return None


if __name__ == "__main__":

parser = argparse.ArgumentParser(
Expand Down
18 changes: 17 additions & 1 deletion tests/test_create_dip.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
#!/usr/bin/env python
import zipfile

import os
import time
import unittest
import vcr
import zipfile

import amclient

Expand Down Expand Up @@ -134,3 +135,18 @@ def test_create_dip_fail_no_aip_dir(self):
"bad_path", AIP_UUID, OUTPUT_DIR, "atom", "zipped-objects"
)
assert dip_dir is None

def test_get_original_relpath_objects_dir(self):
    """The ``%transferDirectory%objects/`` prefix is stripped from the name."""
    original_name = "%transferDirectory%objects/folder1/file5.txt"
    relpath = create_dip.get_original_relpath(original_name)

    assert relpath == "folder1/file5.txt"

def test_get_original_relpath_data_dir(self):
    """The ``%transferDirectory%data/`` prefix is stripped from the name."""
    original_name = "%transferDirectory%data/folder1/file5.txt"
    relpath = create_dip.get_original_relpath(original_name)

    assert relpath == "folder1/file5.txt"

def test_get_original_relpath_warn_invalid_prefix(self):
    """A name with an unrecognised prefix yields ``None``."""
    original_name = "%transferDirectory%datas/folder1/file5.txt"
    relpath = create_dip.get_original_relpath(original_name)

    assert relpath is None

0 comments on commit e5a9f80

Please sign in to comment.