Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

De-duplicate checksum verification #1012

Merged
merged 4 commits into from
Mar 27, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions src/MCPClient/lib/archivematicaClientModules
Original file line number Diff line number Diff line change
Expand Up @@ -130,11 +130,10 @@ trimVerifyChecksums_v0.0 = %clientScriptsDirectory%trimVerifyChecksums.py
trimVerifyManifest_v0.0 = %clientScriptsDirectory%trimVerifyManifest.py
updateSizeAndChecksum_v0.0 = %clientScriptsDirectory%archivematicaUpdateSizeAndChecksum.py
validateFile_v1.0 = %clientScriptsDirectory%validateFile.py
verifyAIP_v0.0 = %clientScriptsDirectory%verifyAIP.py
verifyAIP_v1.0 = %clientScriptsDirectory%verify_AIP.py
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

https://www.python.org/dev/peps/pep-0008/#package-and-module-names

Modules should have short, all-lowercase names. Underscores can be used in the module name if it improves readability. Python packages should also have short, all-lowercase names, although the use of underscores is discouraged.

verifyAndRestructureTransferBag_v0.0 = %clientScriptsDirectory%verifyAndRestructureTransferBag.py
verifyBAG_v0.0 = %clientScriptsDirectory%verifyBAG.py
verifyChecksumsInFileSecOfDspaceMETSFiles_v0.0 = %clientScriptsDirectory%verifyChecksumsInFileSecOfDspaceMETSFiles.py
verifyMD5_v0.0 = %clientScriptsDirectory%verifyMD5.sh
verifyPREMISChecksums_v0.0 = %clientScriptsDirectory%verifyPREMISChecksums.py
verifySIPCompliance_v0.0 = %clientScriptsDirectory%verifySIPCompliance.py
verifyTransferCompliance_v0.0 = %clientScriptsDirectory%verifyTransferCompliance.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

import django
django.setup()
from django.conf import settings as mcpclient_settings

# archivematicaCommon
from archivematicaFunctions import get_setting
Expand Down Expand Up @@ -112,7 +113,8 @@ def bag_with_empty_directories(operation, destination, sip_directory, payload_en
help='All the files/folders that should go in the bag.')
parser.add_argument('--writer', dest='writer')

algorithm = get_setting('checksum_type', 'sha512')
algorithm = get_setting(
'checksum_type', mcpclient_settings.DEFAULT_CHECKSUM_ALGORITHM)

args = parser.parse_args()
bag_with_empty_directories(args.operation, args.destination, args.sip_directory, args.payload_entries, args.writer, algorithm)
4 changes: 3 additions & 1 deletion src/MCPClient/lib/clientScripts/storeAIP.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

import argparse
import os
from pprint import pformat
import sys
from uuid import uuid4

Expand Down Expand Up @@ -172,7 +173,8 @@ def store_aip(aip_destination_uri, aip_path, sip_uuid, sip_name, sip_type):
)

if new_file is not None and new_file.get('status', '') != "FAIL":
message = "Storage service created {}: {}".format(sip_type, new_file)
message = "Storage service created {}:\n{}".format(
sip_type, pformat(new_file))
LOGGER.info(message)
print(message)
sys.exit(0)
Expand Down
88 changes: 0 additions & 88 deletions src/MCPClient/lib/clientScripts/verifyPREMISChecksums.py

This file was deleted.

251 changes: 251 additions & 0 deletions src/MCPClient/lib/clientScripts/verify_aip.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@
#!/usr/bin/env python2
from __future__ import print_function
import os
from pprint import pformat
import shutil
import sys

# archivematicaCommon
from archivematicaFunctions import get_setting
from custom_handlers import get_script_logger
import databaseFunctions
from executeOrRunSubProcess import executeOrRun

from main.models import File


class VerifyChecksumsError(Exception):
    """Signal that the AIP-level fixity (checksum) check did not pass.

    Raised by the checksum helpers in this module; verify_aip() catches it
    and turns it into a non-zero return code.
    """


def extract_aip(aip_path, extract_path):
    """Extract the packaged AIP at ``aip_path`` into ``extract_path``.

    ``extract_path`` is created first (``os.makedirs`` raises if it already
    exists) and the archive is then unpacked into it with ``atool``.

    Returns the expected path of the extracted bag: ``extract_path`` joined
    with the archive's base name minus its extension. For ``.bz2`` and
    ``.gz`` archives a second extension is stripped as well (e.g.
    ``name.tar.bz2`` -> ``name``).

    Raises ``Exception`` if the extraction command exits with a non-zero
    status.
    """
    os.makedirs(extract_path)
    # Quote both paths, as the bag commands in verify_aip() do, so that a
    # path containing whitespace is still handed to atool as one argument.
    command = 'atool --extract-to="{}" -V0 "{}"'.format(
        extract_path, aip_path)
    print('Running extraction command:', command)
    exit_code, _, _ = executeOrRun("command", command, printing=True)
    if exit_code != 0:
        raise Exception("Error extracting AIP")

    aip_identifier, ext = os.path.splitext(os.path.basename(aip_path))
    if ext in ('.bz2', '.gz'):
        # Compressed tarballs carry two extensions; drop the inner one too.
        aip_identifier, _ = os.path.splitext(aip_identifier)
    return os.path.join(extract_path, aip_identifier)


def write_premis_event(sip_uuid, checksum_type, event_outcome,
                       event_outcome_detail_note):
    """Write the AIP-level "fixity check" PREMIS event.

    The event is recorded against ``sip_uuid`` with the given outcome and
    outcome detail note; ``checksum_type`` is embedded in the event detail.

    Writing the event is best effort: if the database insert fails, the
    error is printed and otherwise ignored.

    Returns ``event_outcome_detail_note`` in every case. Callers use the
    return value as the message of the VerifyChecksumsError they raise, so
    the note must survive a failed database write (previously ``None`` was
    returned in that case and the failure reason was lost).
    """
    try:
        databaseFunctions.insertIntoEvents(
            fileUUID=sip_uuid,
            eventType='fixity check',
            eventDetail='program="python, bag"; module="hashlib.{}()"'.format(
                checksum_type),
            eventOutcome=event_outcome,
            eventOutcomeDetailNote=event_outcome_detail_note
        )
    except Exception as err:
        print('Failed to write PREMIS event to database. Error: {error}'.format(
            error=err))
    return event_outcome_detail_note


def get_manifest_path(bag, sip_uuid, checksum_type):
    """Return the path of the Bag manifest file for ``checksum_type``.

    The manifest is looked for at ``<bag>/manifest-<checksum_type>.txt``. If
    that path is not a file, a failing "fixity check" PREMIS event is
    written and VerifyChecksumsError is raised.
    """
    manifest_name = 'manifest-{algo}.txt'.format(algo=checksum_type)
    manifest_path = os.path.join(bag, manifest_name)
    if os.path.isfile(manifest_path):
        return manifest_path

    note_template = (
        'Unable to perform AIP-level fixity check on AIP {aip_uuid} because'
        ' unable to find a Bag manifest file at expected path'
        ' {manifest_path}.')
    event_outcome_detail_note = note_template.format(
        aip_uuid=sip_uuid, manifest_path=manifest_path)
    raise VerifyChecksumsError(
        write_premis_event(sip_uuid, checksum_type, 'Fail',
                           event_outcome_detail_note))


def parse_manifest(manifest_path, sip_uuid, checksum_type):
    """Parse a Bag manifest into a dict mapping file paths to checksums.

    Each non-blank manifest line is read as ``<checksum> <path>``. The line
    is split on the first run of whitespace only, so paths that themselves
    contain whitespace are kept intact. A leading ``data/`` is removed from
    each path. Blank lines are ignored.

    If the manifest cannot be parsed (e.g. a line holds a checksum but no
    path), a failing "fixity check" PREMIS event is written and
    VerifyChecksumsError is raised. Errors opening the file are not
    converted and propagate unchanged.
    """
    payload_prefix = 'data/'
    with open(manifest_path) as filei:
        try:
            path2checksum = {}
            for line in filei:
                line = line.rstrip('\r\n')
                if not line.strip():
                    continue
                # maxsplit=1: everything after the checksum is the path.
                checksum, path = line.split(None, 1)
                # Strip the prefix only; a later "data/" (for example in
                # "objects/metadata/...") is part of the real path.
                if path.startswith(payload_prefix):
                    path = path[len(payload_prefix):]
                path2checksum[path] = checksum
            return path2checksum
        except Exception as err:
            event_outcome_detail_note = (
                'Unable to perform AIP-level fixity check on AIP {aip_uuid}'
                ' because unable to parse manifest file at path'
                ' {manifest_path}. Error:\n{error}'.format(
                    aip_uuid=sip_uuid,
                    manifest_path=manifest_path,
                    error=err))
            raise VerifyChecksumsError(
                write_premis_event(sip_uuid, checksum_type, 'Fail',
                                   event_outcome_detail_note))


def assert_checksum_types_match(file_, sip_uuid, settings_checksum_type):
    """Check that a file's checksum type matches the configured one.

    ``file_`` must expose ``checksumtype`` and ``uuid`` attributes. Returns
    ``None`` when ``file_.checksumtype`` equals ``settings_checksum_type``
    (an algorithm name, e.g. 'sha256'). Otherwise a failing "fixity check"
    PREMIS event is written and VerifyChecksumsError is raised.
    """
    if file_.checksumtype == settings_checksum_type:
        return
    # The note previously read "we expect it to <type>"; the missing verb
    # is restored so the recorded outcome detail is a complete sentence.
    event_outcome_detail_note = (
        'The checksum type of file {file_uuid} is'
        ' {file_checksum_type}; given the current application settings, we'
        ' expect it to be {settings_checksum_type}'.format(
            file_uuid=file_.uuid,
            file_checksum_type=file_.checksumtype,
            settings_checksum_type=settings_checksum_type))
    raise VerifyChecksumsError(
        write_premis_event(sip_uuid, settings_checksum_type, 'Fail',
                           event_outcome_detail_note))


def get_expected_checksum(file_, sip_uuid, checksum_type, path2checksum,
                          file_path, manifest_path):
    """Return the checksum that the Bag manifest records for ``file_path``.

    ``path2checksum`` is the mapping from file paths to checksums taken
    from the manifest at ``manifest_path``. If ``file_path`` is not a key of
    that mapping, a failing "fixity check" PREMIS event is written and
    VerifyChecksumsError is raised.
    """
    if file_path in path2checksum:
        return path2checksum[file_path]

    note_template = (
        'Unable to find expected path {expected_path} for file'
        ' {file_uuid} in the following mapping from file paths to'
        ' checksums that was extracted from Bag manifest file'
        ' {manifest_file}: {mapping}')
    event_outcome_detail_note = note_template.format(
        expected_path=file_path,
        file_uuid=file_.uuid,
        manifest_file=manifest_path,
        mapping=pformat(path2checksum))
    raise VerifyChecksumsError(
        write_premis_event(sip_uuid, checksum_type, 'Fail',
                           event_outcome_detail_note))


def assert_checksums_match(file_, sip_uuid, checksum_type, expected_checksum):
    """Check a file's database checksum against the manifest checksum.

    Returns ``None`` when ``file_.checksum`` equals ``expected_checksum``.
    Otherwise a failing "fixity check" PREMIS event is written and
    VerifyChecksumsError is raised.
    """
    db_checksum = file_.checksum
    if not (db_checksum != expected_checksum):
        return

    note_template = (
        'The checksum {db_checksum} for file {file_uuid} from the'
        ' database did not match the corresponding checksum from the'
        ' Bag manifest file {manifest_checksum}')
    event_outcome_detail_note = note_template.format(
        file_uuid=file_.uuid,
        db_checksum=file_.checksum,
        manifest_checksum=expected_checksum)
    raise VerifyChecksumsError(
        write_premis_event(sip_uuid, checksum_type, 'Fail',
                           event_outcome_detail_note))


def verify_checksums(bag, sip_uuid):
    """Verify that the checksums generated at the beginning of transfer match
    those generated near the end of ingest by bag, i.e., "Prepare AIP"
    (bagit_v0.0).

    For every File row of the SIP whose current location is under
    ``%SIPDirectory%objects/``, the checksum type and checksum stored in the
    database are compared with the entry for that file in the Bag manifest
    found in ``bag``. Rows outside ``objects/``, and rows with no current
    location, are skipped.

    On success a passing "fixity check" PREMIS event is written and a
    summary is printed. On the first failure the error is printed and
    VerifyChecksumsError is re-raised (the failing helper has already
    written the failing PREMIS event).

    NOTE: reads the module-level name ``mcpclient_settings``, which this
    file only binds inside its ``__main__`` block.
    """
    checksum_type = get_setting(
        'checksum_type', mcpclient_settings.DEFAULT_CHECKSUM_ALGORITHM)
    objects_prefix = '%SIPDirectory%objects/'
    verification_count = 0
    try:
        manifest_path = get_manifest_path(bag, sip_uuid, checksum_type)
        path2checksum = parse_manifest(manifest_path, sip_uuid, checksum_type)
        for file_ in File.objects.filter(sip_id=sip_uuid):
            location = file_.currentlocation
            # Guard against an empty/None location before calling
            # startswith(); presumably such rows are files no longer present
            # in the SIP -- TODO confirm against the File model.
            if not location or not location.startswith(objects_prefix):
                continue
            file_path = location.replace('%SIPDirectory%', '', 1)
            assert_checksum_types_match(file_, sip_uuid, checksum_type)
            expected_checksum = get_expected_checksum(
                file_, sip_uuid, checksum_type, path2checksum, file_path,
                manifest_path)
            assert_checksums_match(file_, sip_uuid, checksum_type,
                                   expected_checksum)
            verification_count += 1
    except VerifyChecksumsError as err:
        print(err)
        raise
    event_outcome_detail_note = (
        'All {verification_count} checksums generated at start of transfer'
        ' match those generated by BagIt (bag).'.format(
            verification_count=verification_count))
    write_premis_event(sip_uuid, checksum_type, 'Pass',
                       event_outcome_detail_note)
    print(event_outcome_detail_note)


def verify_aip():
    """Verify the AIP was bagged correctly by extracting it and running
    verification on its contents. This is also where we verify the checksums
    now that the verifyPREMISChecksums_v0.0 ("Verify checksums generated on
    ingest") micro-service has been removed. It was removed because verifying
    checksums by calculating them in that MS and then having bagit calculate
    them here was redundant.

    sys.argv[1] = UUID
        UUID of the SIP, which will become the UUID of the AIP
    sys.argv[2] = current location
        Full absolute path to the AIP's current location on the local filesystem

    If the AIP path is a directory it is verified in place; otherwise it is
    extracted to ``<TEMP_DIRECTORY>/<sip_uuid>`` first. That extraction
    directory is removed on every exit path, including a failed extraction
    and unexpected errors, so that a leftover directory cannot make a later
    run for the same SIP fail in ``os.makedirs``.

    Returns 0 if extraction, all bag checks and the checksum verification
    succeed, 1 otherwise.
    """

    sip_uuid = sys.argv[1]  # %sip_uuid%
    aip_path = sys.argv[2]  # SIPDirectory%%sip_name%-%sip_uuid%.7z

    temp_dir = mcpclient_settings.TEMP_DIRECTORY

    is_uncompressed_aip = os.path.isdir(aip_path)
    # Only set when this run extracts the AIP; drives the cleanup below.
    extract_dir = None
    if not is_uncompressed_aip:
        extract_dir = os.path.join(temp_dir, sip_uuid)

    try:
        if is_uncompressed_aip:
            bag = aip_path
        else:
            try:
                bag = extract_aip(aip_path, extract_dir)
            except Exception as err:
                print('Error extracting AIP at "{}"'.format(aip_path),
                      file=sys.stderr)
                # Surface the underlying cause instead of discarding it.
                print('Error:\n{err}'.format(err=err), file=sys.stderr)
                return 1

        verification_commands = [
            '/usr/share/bagit/bin/bag verifyvalid "{}"'.format(bag),
            '/usr/share/bagit/bin/bag checkpayloadoxum "{}"'.format(bag),
            '/usr/share/bagit/bin/bag verifycomplete "{}"'.format(bag),
            '/usr/share/bagit/bin/bag verifypayloadmanifests "{}"'.format(bag),
            '/usr/share/bagit/bin/bag verifytagmanifests "{}"'.format(bag),
        ]
        return_code = 0
        # Run every check even after a failure so all problems are reported.
        for command in verification_commands:
            print("Running test: ", command)
            exit_code, _, _ = executeOrRun("command", command, printing=True)
            if exit_code != 0:
                print("Failed test: ", command, file=sys.stderr)
                return_code = 1

        if return_code == 0:
            try:
                verify_checksums(bag, sip_uuid)
            except VerifyChecksumsError:
                return_code = 1
        else:
            print('Not verifying checksums because other tests have already'
                  ' failed.')

        return return_code
    finally:
        # cleanup
        if extract_dir is not None and os.path.isdir(extract_dir):
            try:
                shutil.rmtree(extract_dir)
            except OSError as err:
                print('Failed to remove temporary directory at {extract_dir} which'
                      ' contains the AIP extracted for verification.'
                      ' Error:\n{err}'.format(extract_dir=extract_dir, err=err),
                      file=sys.stderr)


if __name__ == '__main__':
    # Script entry point: set up Django, then bind its settings to the
    # module-level name ``mcpclient_settings`` that verify_checksums() and
    # verify_aip() read.
    import django
    django.setup()
    from django.conf import settings as mcpclient_settings

    logger = get_script_logger("archivematica.mcp.client.verifyAIP")

    # Use verify_aip()'s return code as the process exit status.
    exit_status = verify_aip()
    sys.exit(exit_status)
1 change: 1 addition & 0 deletions src/MCPClient/lib/settings/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,3 +213,4 @@
AGENTARCHIVES_CLIENT_TIMEOUT = config.get('agentarchives_client_timeout')
SEARCH_ENABLED = config.get('search_enabled')
CAPTURE_CLIENT_SCRIPT_OUTPUT = config.get('capture_client_script_output')
DEFAULT_CHECKSUM_ALGORITHM = 'sha256'
Loading