diff --git a/src/MCPClient/lib/archivematicaClientModules b/src/MCPClient/lib/archivematicaClientModules index 1bd13108c0..31fef9506f 100644 --- a/src/MCPClient/lib/archivematicaClientModules +++ b/src/MCPClient/lib/archivematicaClientModules @@ -130,11 +130,10 @@ trimVerifyChecksums_v0.0 = %clientScriptsDirectory%trimVerifyChecksums.py trimVerifyManifest_v0.0 = %clientScriptsDirectory%trimVerifyManifest.py updateSizeAndChecksum_v0.0 = %clientScriptsDirectory%archivematicaUpdateSizeAndChecksum.py validateFile_v1.0 = %clientScriptsDirectory%validateFile.py -verifyAIP_v0.0 = %clientScriptsDirectory%verifyAIP.py +verifyAIP_v1.0 = %clientScriptsDirectory%verify_aip.py verifyAndRestructureTransferBag_v0.0 = %clientScriptsDirectory%verifyAndRestructureTransferBag.py verifyBAG_v0.0 = %clientScriptsDirectory%verifyBAG.py verifyChecksumsInFileSecOfDspaceMETSFiles_v0.0 = %clientScriptsDirectory%verifyChecksumsInFileSecOfDspaceMETSFiles.py verifyMD5_v0.0 = %clientScriptsDirectory%verifyMD5.sh -verifyPREMISChecksums_v0.0 = %clientScriptsDirectory%verifyPREMISChecksums.py verifySIPCompliance_v0.0 = %clientScriptsDirectory%verifySIPCompliance.py verifyTransferCompliance_v0.0 = %clientScriptsDirectory%verifyTransferCompliance.py diff --git a/src/MCPClient/lib/clientScripts/archivematicaBagWithEmptyDirectories.py b/src/MCPClient/lib/clientScripts/archivematicaBagWithEmptyDirectories.py index 373948bea5..3ecfb82b32 100755 --- a/src/MCPClient/lib/clientScripts/archivematicaBagWithEmptyDirectories.py +++ b/src/MCPClient/lib/clientScripts/archivematicaBagWithEmptyDirectories.py @@ -27,6 +27,7 @@ import django django.setup() +from django.conf import settings as mcpclient_settings # archivematicaCommon from archivematicaFunctions import get_setting @@ -112,7 +113,8 @@ def bag_with_empty_directories(operation, destination, sip_directory, payload_en help='All the files/folders that should go in the bag.') parser.add_argument('--writer', dest='writer') - algorithm = 
get_setting('checksum_type', 'sha512') + algorithm = get_setting( + 'checksum_type', mcpclient_settings.DEFAULT_CHECKSUM_ALGORITHM) args = parser.parse_args() bag_with_empty_directories(args.operation, args.destination, args.sip_directory, args.payload_entries, args.writer, algorithm) diff --git a/src/MCPClient/lib/clientScripts/storeAIP.py b/src/MCPClient/lib/clientScripts/storeAIP.py index 116feafa47..ba62341cc9 100755 --- a/src/MCPClient/lib/clientScripts/storeAIP.py +++ b/src/MCPClient/lib/clientScripts/storeAIP.py @@ -24,6 +24,7 @@ import argparse import os +from pprint import pformat import sys from uuid import uuid4 @@ -172,7 +173,8 @@ def store_aip(aip_destination_uri, aip_path, sip_uuid, sip_name, sip_type): ) if new_file is not None and new_file.get('status', '') != "FAIL": - message = "Storage service created {}: {}".format(sip_type, new_file) + message = "Storage service created {}:\n{}".format( + sip_type, pformat(new_file)) LOGGER.info(message) print(message) sys.exit(0) diff --git a/src/MCPClient/lib/clientScripts/verifyPREMISChecksums.py b/src/MCPClient/lib/clientScripts/verifyPREMISChecksums.py deleted file mode 100755 index 8a714763cd..0000000000 --- a/src/MCPClient/lib/clientScripts/verifyPREMISChecksums.py +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/env python2 - -# This file is part of Archivematica. -# -# Copyright 2010-2013 Artefactual Systems Inc. -# -# Archivematica is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# Archivematica is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. 
-# -# You should have received a copy of the GNU General Public License -# along with Archivematica. If not, see . - -# @package Archivematica -# @subpackage archivematicaClientScript -# @author Joseph Perry - -from __future__ import print_function -import sys -from optparse import OptionParser -import uuid - -import django -django.setup() - -# dashboard -from main.models import File - -# archivematicaCommon -from custom_handlers import get_script_logger -import databaseFunctions -from archivematicaFunctions import get_file_checksum - - -def verifyChecksum(fileUUID, filePath, date, eventIdentifierUUID): - f = File.objects.get(uuid=fileUUID) - - if f.checksum in ('', 'None'): - print('No checksum found in database for file:', fileUUID, filePath, file=sys.stderr) - exit(1) - - checksumFile = get_file_checksum(filePath, f.checksumtype) - - eventOutcome = '' - eventOutcomeDetailNote = '' - exitCode = 0 - - if checksumFile != f.checksum: - eventOutcomeDetailNote = str(checksumFile) + ' != ' + f.checksum - eventOutcome = 'Fail' - exitCode = 2 - print('Checksums do not match:', fileUUID, filePath, file=sys.stderr) - print(eventOutcomeDetailNote, file=sys.stderr) - else: - eventOutcomeDetailNote = '%s %s' % (str(checksumFile), 'verified') - eventOutcome = 'Pass' - exitCode = 0 - - databaseFunctions.insertIntoEvents( - fileUUID=fileUUID, - eventIdentifierUUID=str(uuid.uuid4()), - eventType='fixity check', - eventDateTime=date, - eventOutcome=eventOutcome, - eventOutcomeDetailNote=eventOutcomeDetailNote, - eventDetail='program="python"; module="hashlib.{}()"'.format(f.checksumtype) - ) - - exit(exitCode) - - -if __name__ == '__main__': - logger = get_script_logger('archivematica.mcp.client.verifyPREMISChecksums') - - parser = OptionParser() - parser.add_option('-i', '--fileUUID', action='store', dest='fileUUID', default='') - parser.add_option('-p', '--filePath', action='store', dest='filePath', default='') - parser.add_option('-d', '--date', action='store', dest='date', 
default='') - parser.add_option('-u', '--eventIdentifierUUID', action='store', dest='eventIdentifierUUID', default='') - (opts, args) = parser.parse_args() - - verifyChecksum(opts.fileUUID, opts.filePath, opts.date, opts.eventIdentifierUUID) diff --git a/src/MCPClient/lib/clientScripts/verify_aip.py b/src/MCPClient/lib/clientScripts/verify_aip.py new file mode 100755 index 0000000000..e276889548 --- /dev/null +++ b/src/MCPClient/lib/clientScripts/verify_aip.py @@ -0,0 +1,255 @@ +#!/usr/bin/env python2 +from __future__ import print_function +import os +from pprint import pformat +import shutil +import sys + +import django +django.setup() +from django.conf import settings as mcpclient_settings + +# archivematicaCommon +from archivematicaFunctions import get_setting +from custom_handlers import get_script_logger +import databaseFunctions +from executeOrRunSubProcess import executeOrRun + +from main.models import File + + +class VerifyChecksumsError(Exception): + """Checksum verification has failed.""" + + +def extract_aip(aip_path, extract_path): + os.makedirs(extract_path) + command = "atool --extract-to={} -V0 {}".format(extract_path, aip_path) + print('Running extraction command:', command) + exit_code, _, _ = executeOrRun("command", command, printing=True) + if exit_code != 0: + raise Exception("Error extracting AIP") + + aip_identifier, ext = os.path.splitext(os.path.basename(aip_path)) + if ext in ('.bz2', '.gz'): + aip_identifier, _ = os.path.splitext(aip_identifier) + return os.path.join(extract_path, aip_identifier) + + +def write_premis_event(sip_uuid, checksum_type, event_outcome, + event_outcome_detail_note): + """Write the AIP-level "fixity check" PREMIS event.""" + try: + databaseFunctions.insertIntoEvents( + fileUUID=sip_uuid, + eventType='fixity check', + eventDetail='program="python, bag"; module="hashlib.{}()"'.format( + checksum_type), + eventOutcome=event_outcome, + eventOutcomeDetailNote=event_outcome_detail_note + ) + except Exception as err: + print('Failed to write PREMIS event to database. 
Error: {error}'.format( + error=err)) + else: + return event_outcome_detail_note + + +def get_manifest_path(bag, sip_uuid, checksum_type): + """Raise exception if the Bag manifest file is not a file.""" + manifest_path = os.path.join( + bag, 'manifest-{algo}.txt'.format(algo=checksum_type)) + if not os.path.isfile(manifest_path): + event_outcome_detail_note = ( + 'Unable to perform AIP-level fixity check on AIP {aip_uuid} because' + ' unable to find a Bag manifest file at expected path' + ' {manifest_path}.'.format(aip_uuid=sip_uuid, + manifest_path=manifest_path)) + raise VerifyChecksumsError( + write_premis_event(sip_uuid, checksum_type, 'Fail', + event_outcome_detail_note)) + return manifest_path + + +def parse_manifest(manifest_path, sip_uuid, checksum_type): + """Raise exception if the Bag manifest file cannot be parsed.""" + with open(manifest_path) as filei: + try: + return { + k.replace('data/', '', 1): v for k, v in + dict(reversed(line.strip().split(None, 1)) for line in filei).items()} + except Exception as err: + event_outcome_detail_note = ( + 'Unable to perform AIP-level fixity check on AIP {aip_uuid}' + ' because unable to parse manifest file at path' + ' {manifest_path}. Error:\n{error}'.format( + aip_uuid=sip_uuid, + manifest_path=manifest_path, + error=err)) + raise VerifyChecksumsError( + write_premis_event(sip_uuid, checksum_type, 'Fail', + event_outcome_detail_note)) + + +def assert_checksum_types_match(file_, sip_uuid, settings_checksum_type): + """Raise exception if checksum types (i.e., algorithms, e.g., 'sha256') of + the file and the settings do not match. 
+ """ + if file_.checksumtype != settings_checksum_type: + event_outcome_detail_note = ( + 'The checksum type of file {file_uuid} is' + ' {file_checksum_type}; given the current application settings, we' + ' expect it to {settings_checksum_type}'.format( + file_uuid=file_.uuid, + file_checksum_type=file_.checksumtype, + settings_checksum_type=settings_checksum_type)) + raise VerifyChecksumsError( + write_premis_event(sip_uuid, settings_checksum_type, 'Fail', + event_outcome_detail_note)) + + +def get_expected_checksum(file_, sip_uuid, checksum_type, path2checksum, + file_path, manifest_path): + """Raise an exception if an expected checksum cannot be found in the + Bag manifest. + """ + try: + return path2checksum[file_path] + except KeyError: + event_outcome_detail_note = ( + 'Unable to find expected path {expected_path} for file' + ' {file_uuid} in the following mapping from file paths to' + ' checksums that was extracted from Bag manifest file' + ' {manifest_file}: {mapping}'.format( + expected_path=file_path, + file_uuid=file_.uuid, + manifest_file=manifest_path, + mapping=pformat(path2checksum))) + raise VerifyChecksumsError( + write_premis_event(sip_uuid, checksum_type, 'Fail', + event_outcome_detail_note)) + + +def assert_checksums_match(file_, sip_uuid, checksum_type, expected_checksum): + """Raise an exception if checksums do not match.""" + if file_.checksum != expected_checksum: + event_outcome_detail_note = ( + 'The checksum {db_checksum} for file {file_uuid} from the' + ' database did not match the corresponding checksum from the' + ' Bag manifest file {manifest_checksum}'.format( + file_uuid=file_.uuid, + db_checksum=file_.checksum, + manifest_checksum=expected_checksum)) + raise VerifyChecksumsError( + write_premis_event(sip_uuid, checksum_type, 'Fail', + event_outcome_detail_note)) + + +def verify_checksums(bag, sip_uuid): + """Verify that the checksums generated at the beginning of transfer match + those generated near the end of ingest by bag, 
i.e., "Prepare AIP" + (bagit_v0.0). + """ + checksum_type = get_setting( + 'checksum_type', mcpclient_settings.DEFAULT_CHECKSUM_ALGORITHM) + try: + manifest_path = get_manifest_path(bag, sip_uuid, checksum_type) + path2checksum = parse_manifest(manifest_path, sip_uuid, checksum_type) + verification_count = 0 + for file_ in File.objects.filter(sip_id=sip_uuid): + if not file_.currentlocation.startswith('%SIPDirectory%objects/'): + continue + file_path = file_.currentlocation.replace('%SIPDirectory%', '', 1) + assert_checksum_types_match(file_, sip_uuid, checksum_type) + expected_checksum = get_expected_checksum( + file_, sip_uuid, checksum_type, path2checksum, file_path, + manifest_path) + assert_checksums_match(file_, sip_uuid, checksum_type, + expected_checksum) + verification_count += 1 + except VerifyChecksumsError as err: + print(err) + raise + event_outcome_detail_note = ( + 'All {verification_count} checksums generated at start of transfer' + ' match those generated by BagIt (bag).'.format( + verification_count=verification_count)) + write_premis_event(sip_uuid, checksum_type, 'Pass', + event_outcome_detail_note) + print(event_outcome_detail_note) + + +def verify_aip(): + """Verify the AIP was bagged correctly by extracting it and running + verification on its contents. This is also where we verify the checksums + now that the verifyPREMISChecksums_v0.0 ("Verify checksums generated on + ingest") micro-service has been removed. It was removed because verifying + checksums by calculating them in that MS and then having bagit calculate + them here was redundant. 
+ + sys.argv[1] = UUID + UUID of the SIP, which will become the UUID of the AIP + sys.argv[2] = current location + Full absolute path to the AIP's current location on the local filesystem + """ + + sip_uuid = sys.argv[1] # %sip_uuid% + aip_path = sys.argv[2] # SIPDirectory%%sip_name%-%sip_uuid%.7z + + temp_dir = mcpclient_settings.TEMP_DIRECTORY + + is_uncompressed_aip = os.path.isdir(aip_path) + + if is_uncompressed_aip: + bag = aip_path + else: + try: + extract_dir = os.path.join(temp_dir, sip_uuid) + bag = extract_aip(aip_path, extract_dir) + except Exception: + print('Error extracting AIP at "{}"'.format(aip_path), file=sys.stderr) + return 1 + + verification_commands = [ + '/usr/share/bagit/bin/bag verifyvalid "{}"'.format(bag), + '/usr/share/bagit/bin/bag checkpayloadoxum "{}"'.format(bag), + '/usr/share/bagit/bin/bag verifycomplete "{}"'.format(bag), + '/usr/share/bagit/bin/bag verifypayloadmanifests "{}"'.format(bag), + '/usr/share/bagit/bin/bag verifytagmanifests "{}"'.format(bag), + ] + return_code = 0 + for command in verification_commands: + print("Running test: ", command) + exit_code, _, _ = executeOrRun("command", command, printing=True) + if exit_code != 0: + print("Failed test: ", command, file=sys.stderr) + return_code = 1 + + if return_code == 0: + try: + verify_checksums(bag, sip_uuid) + except VerifyChecksumsError: + return_code = 1 + else: + print('Not verifying checksums because other tests have already' + ' failed.') + + # cleanup + if not is_uncompressed_aip: + try: + shutil.rmtree(extract_dir) + except OSError as err: + print('Failed to remove temporary directory at {extract_dir} which' + ' contains the AIP extracted for verification.' 
+ ' Error:\n{err}'.format(extract_dir=extract_dir, err=err), + file=sys.stderr) + + return return_code + + +if __name__ == '__main__': + import django + django.setup() + from django.conf import settings as mcpclient_settings + logger = get_script_logger("archivematica.mcp.client.verifyAIP") + sys.exit(verify_aip()) diff --git a/src/MCPClient/lib/settings/common.py b/src/MCPClient/lib/settings/common.py index f18327d2c0..54a3452d2a 100644 --- a/src/MCPClient/lib/settings/common.py +++ b/src/MCPClient/lib/settings/common.py @@ -213,3 +213,4 @@ AGENTARCHIVES_CLIENT_TIMEOUT = config.get('agentarchives_client_timeout') SEARCH_ENABLED = config.get('search_enabled') CAPTURE_CLIENT_SCRIPT_OUTPUT = config.get('capture_client_script_output') +DEFAULT_CHECKSUM_ALGORITHM = 'sha256' diff --git a/src/archivematicaCommon/lib/databaseFunctions.py b/src/archivematicaCommon/lib/databaseFunctions.py index 9de7a4feb8..a20c0ccd0a 100755 --- a/src/archivematicaCommon/lib/databaseFunctions.py +++ b/src/archivematicaCommon/lib/databaseFunctions.py @@ -143,18 +143,26 @@ def getAMAgentsForFile(fileUUID): return agents -def insertIntoEvents(fileUUID, eventIdentifierUUID="", eventType="", eventDateTime=None, eventDetail="", eventOutcome="", eventOutcomeDetailNote="", agents=None): - """ - Creates a new entry in the Events table using the supplied arguments. - - :param str fileUUID: The UUID of the file with which this event is associated. Must point to a valid File UUID. - :param str eventIdentifierUUID: The UUID for the event being generated. If not provided, a new UUID will be calculated using the version 4 scheme. +def insertIntoEvents(fileUUID, eventIdentifierUUID="", eventType="", + eventDateTime=None, eventDetail="", eventOutcome="", + eventOutcomeDetailNote="", agents=None): + """Creates a new entry in the Events table using the supplied arguments. + + :param str fileUUID: The UUID of the file with which this event is + associated. Must point to a valid File UUID. 
+ :param str eventIdentifierUUID: The UUID for the event being generated. If + not provided, a new UUID will be calculated using the version 4 scheme. :param str eventType: Can be blank. - :param datetime eventDateTime: The time at which the event occurred. If not provided, the current date will be used. - :param str eventDetail: Can be blank. Will be used in the eventDetail element in the AIP METS. - :param str eventOutcome: Can be blank. Will be used in the eventOutcome element in the AIP METS. - :param str eventOutcomeDetailNote: Can be blank. Will be used in the eventOutcomeDetailNote element in the AIP METS. - :param list agents: List of Agent IDs to associate with this. If None provided, automatically fetches Agents representing Archivematica. + :param datetime eventDateTime: The time at which the event occurred. If not + provided, the current date will be used. + :param str eventDetail: Can be blank. Will be used in the eventDetail + element in the AIP METS. + :param str eventOutcome: Can be blank. Will be used in the eventOutcome + element in the AIP METS. + :param str eventOutcomeDetailNote: Can be blank. Will be used in the + eventOutcomeDetailNote element in the AIP METS. + :param list agents: List of Agent IDs to associate with this. If None + provided, automatically fetches Agents representing Archivematica. """ if eventDateTime is None: eventDateTime = getUTCDate() diff --git a/src/dashboard/src/main/migrations/0051_remove_verify_premis_checksums.py b/src/dashboard/src/main/migrations/0051_remove_verify_premis_checksums.py new file mode 100644 index 0000000000..8f51c08c98 --- /dev/null +++ b/src/dashboard/src/main/migrations/0051_remove_verify_premis_checksums.py @@ -0,0 +1,73 @@ +# -*- coding: utf-8 -*- + +"""Migration to remove the "Verify checksums generated on ingest" (Ingest) +micro-service which is redundant. See +https://github.com/artefactual/archivematica/issues/918. 
+""" + +from __future__ import unicode_literals + +from django.db import migrations + + +def data_migration(apps, schema_editor): + """Remove the "Verify checksums generated on ingest" micro-service. This is + verifyPREMISChecksums_v0.0 or verifyPREMISChecksums.py. + """ + + ########################################################################### + # Model classes + ########################################################################### + + MicroServiceChainLink = apps.get_model('main', 'MicroServiceChainLink') + TaskConfig = apps.get_model('main', 'TaskConfig') + StandardTaskConfig = apps.get_model('main', 'StandardTaskConfig') + MicroServiceChainLinkExitCode = apps.get_model( + 'main', 'MicroServiceChainLinkExitCode') + + ########################################################################### + # "Remove empty manual normalization directories" now continues on to + # "Bind PIDs?" + ########################################################################### + + MicroServiceChainLink.objects.filter( + id='75fb5d67-5efa-4232-b00b-d85236de0d3f' + ).update( + defaultnextchainlink_id='05357876-a095-4c11-86b5-a7fff01af668') + MicroServiceChainLinkExitCode.objects.filter( + nextmicroservicechainlink='88807d68-062e-4d1a-a2d5-2d198c88d8ca' + ).update( + nextmicroservicechainlink='05357876-a095-4c11-86b5-a7fff01af668') + + ########################################################################### + # Remove "Verify checksums generated on ingest" + ########################################################################### + + StandardTaskConfig.objects.get( + id='4f400b71-37be-49d0-8da3-125abac2bfd0').delete() + TaskConfig.objects.get( + id='ef024cf9-1737-4161-b48a-13b4a8abddcd').delete() + + # The above will result in the following: + # MicroServiceChainLink.objects.get( + # id='88807d68-062e-4d1a-a2d5-2d198c88d8ca').delete() + + ########################################################################### + # Make "Verify AIP" point to script verifyAIP_v1.0 + 
########################################################################### + + StandardTaskConfig.objects.filter( + id='ae6b87d8-59c8-4ffa-b417-ce93ab472e74' + ).update( + execute='verifyAIP_v1.0') + + +class Migration(migrations.Migration): + + dependencies = [ + ('main', '0050_change_pointer_file_filegrpuse'), + ] + + operations = [ + migrations.RunPython(data_migration), + ]