-
Notifications
You must be signed in to change notification settings - Fork 33
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Compare an accruals location to an aip store
This commit introduces an accruals->aips comparison capability. Digital objects in an accruals folder can now be compared to the contents of an AIP store. Where filepaths and checksums and dates match, the object is considered to be identical (a true duplicate). Where they don't, users can use modulo (%) to identify where the object isn't in fact identical. Much of the benefit of this work is derived from the nature of the AIP structure imposed on a digital transfer. Once the comparison is complete, three reports are output in CSV format: * True-duplicates. * Near-duplicates (checksums match, but other components might not). * Non-duplicates. Additionally, a summary report is output in JSON.
- Loading branch information
1 parent
a0d4eee
commit e3459b2
Showing
9 changed files
with
563 additions
and
148 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,161 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
|
||
from __future__ import print_function, unicode_literals | ||
|
||
import copy | ||
import logging | ||
import os | ||
import sys | ||
|
||
try: | ||
from .appconfig import AppConfig | ||
from .digital_object import DigitalObject | ||
from . import duplicates | ||
from . import loggingconfig | ||
from .serialize_to_csv import CSVOut | ||
from . import utils | ||
except (ValueError, ImportError): | ||
from appconfig import AppConfig | ||
from digital_object import DigitalObject | ||
import duplicates | ||
import loggingconfig | ||
from serialize_to_csv import CSVOut | ||
import utils | ||
|
||
# Directory holding this script; the __main__ entry point writes
# "report.log" here.
logging_dir = os.path.dirname(os.path.abspath(__file__))

logger = logging.getLogger("accruals")
logger.disabled = False

# Location purpose = Transfer Source (TS)
location_purpose = "TS"
# Description of the transfer source to look for when none is supplied.
default_location = "accruals"

# TODO: make this configurable. While True, stat_transfers() maps every
# transfer path through utils.get_docker_path().
DOCKER = True

# Store our appraisal paths.
accrual_paths = []
|
||
|
||
def create_manifest(aip_index, accrual_objs):
    """Sort accrual objects into duplicates, near matches and non-matches.

    :param aip_index: mapping whose ``duplicates.MANIFEST_DATA`` entry maps
        a checksum to the list of AIP objects recorded against it.
    :param accrual_objs: iterable of accrual digital objects. Each must
        expose ``hashes`` and ``flag`` and support ``==`` and ``%``
        against the AIP objects.
    :returns: a ``(dupes, near_matches, non_matches)`` tuple. ``dupes``
        holds copies of accrual objects labelled with the matching AIP's
        ``package_name``; ``near_matches`` holds ``[accrual_copy,
        aip_copy]`` pairs whose checksums match but which are not equal;
        ``non_matches`` holds copies of the accrual objects that were never
        flagged.

    Side effect: ``flag`` is set to True on every accrual object that
    produced a duplicate or a near match.
    """
    dupes = []
    near_matches = []
    non_matches = []
    # An index without manifest data simply means nothing can match; treat
    # it as empty rather than failing on None.
    aip_obj_hashes = aip_index.get(duplicates.MANIFEST_DATA) or {}
    for accrual_obj in accrual_objs:
        # Only one of the accrual's hashes needs to be known to the index
        # to justify a single scan. May also be redundant as we only have
        # one hash from the bag manifests...
        if not any(
            accrual_hash in aip_obj_hashes
            for accrual_hash in accrual_obj.hashes
        ):
            continue
        for aip_items in aip_obj_hashes.values():
            for aip_item in aip_items:
                if accrual_obj == aip_item:
                    accrual_obj.flag = True
                    cp = copy.copy(accrual_obj)
                    cp.package_name = aip_item.package_name
                    dupes.append(cp)
                    continue
                diff = accrual_obj % aip_item
                if (
                    diff == "No matching components"
                    or "checksum match" not in diff
                ):
                    # Don't output: without a checksum match the two
                    # objects are unrelated.
                    continue
                accrual_obj.flag = True
                near_matches.append(
                    [copy.copy(accrual_obj), copy.copy(aip_item)]
                )
    for accrual_obj in accrual_objs:
        if accrual_obj.flag is False:
            cp = copy.copy(accrual_obj)
            if cp not in non_matches:
                non_matches.append(cp)
    return dupes, near_matches, non_matches
|
||
|
||
def create_comparison_obj(transfer_path):
    """Build a DigitalObject for every regular file beneath a transfer.

    :param transfer_path: root directory of the transfer to walk.
    :returns: list of ``DigitalObject`` instances in ``os.walk`` top-down
        order; empty when nothing is found under ``transfer_path``.
    """
    digital_objects = []
    for folder, _, filenames in os.walk(transfer_path, topdown=True):
        for filename in filenames:
            candidate = os.path.join(folder, filename)
            if not os.path.isfile(candidate):
                # Skip entries that os.walk lists but that are not regular
                # files.
                continue
            digital_objects.append(DigitalObject(candidate, transfer_path))
    return digital_objects
|
||
|
||
def stat_transfers(accruals_path, all_transfers):
    """Compare every transfer under the accruals path with the AIP index.

    For each transfer name the objects on disk are collected and matched
    against the index retrieved from ``duplicates``. Non-empty results are
    gathered per transfer as ``{transfer: manifest}`` entries and handed to
    the ``CSVOut`` writers once all transfers have been processed.

    :param accruals_path: directory containing the transfer directories.
    :param all_transfers: iterable of transfer directory names, relative to
        ``accruals_path``.
    """
    aip_index = duplicates.retrieve_aip_index()
    objects_per_transfer = []
    exact_reports = []
    near_match_reports = []
    unmatched_reports = []
    for transfer in all_transfers:
        location = os.path.join(accruals_path, transfer)
        if DOCKER:
            location = utils.get_docker_path(location)
        transfer_objects = create_comparison_obj(location)
        objects_per_transfer.append(transfer_objects)
        exact, near, unmatched = create_manifest(aip_index, transfer_objects)
        # Transfers that contribute nothing to a category are left out of
        # that category's report.
        if exact:
            exact_reports.append({transfer: exact})
        if near:
            near_match_reports.append({transfer: near})
        if unmatched:
            unmatched_reports.append({transfer: unmatched})
    CSVOut.stat_manifests(aip_index, objects_per_transfer)
    if exact_reports:
        CSVOut.dupe_csv_out(exact_reports, "")
    if near_match_reports:
        CSVOut.near_csv_out(near_match_reports, "")
    if unmatched_reports:
        CSVOut.no_match_csv_out(unmatched_reports, "")
|
||
|
||
def main(location=default_location):
    """Primary entry point for this script.

    Looks for a storage location whose purpose is ``location_purpose`` and
    whose description equals ``location``, then compares each transfer
    directory found there with the AIP index. Exits the process if no such
    location exists.

    :param location: description of the transfer source to use.
    """
    am = AppConfig().get_am_client()
    sources = am.list_storage_locations()

    found = False
    for candidate in sources.get("objects"):
        if candidate.get("purpose") != location_purpose:
            continue
        if candidate.get("description") != location:
            continue
        # The loop keeps going after a hit, so when several locations
        # qualify the last one listed is used.
        am.transfer_source = candidate.get("uuid")
        am.transfer_path = candidate.get("path")
        found = True
    if not found:
        logger.info("Exiting. No transfer source: {}".format(location))
        sys.exit()

    # All transfer directories. Assumption is the same as Archivematica that
    # each transfer is organized into a single directory at this level.
    all_transfers = am.transferables().get("directories")
    stat_transfers(am.transfer_path, all_transfers)
|
||
|
||
if __name__ == "__main__":
    loggingconfig.setup("INFO", os.path.join(logging_dir, "report.log"))
    # An optional first command-line argument names the transfer source;
    # without it the default location is used.
    source = default_location
    if len(sys.argv) > 1:
        source = sys.argv[1]
        # Progress information, not a failure, so log it at INFO.
        logger.info("Attempting to find transfers at: %s", source)
    sys.exit(main(source))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
|
||
"""Digital object class to help with matching.""" | ||
|
||
import json | ||
import os | ||
import time | ||
|
||
try: | ||
from . import hashutils | ||
except (ValueError, ImportError): | ||
import hashutils | ||
|
||
|
||
class DigitalObjectException(Exception):
    """Raised when a DigitalObject cannot be built from its inputs."""
|
||
|
||
class DigitalObject(object):
    """Metadata about one file, used to match accruals against AIP content.

    Two objects are equal when they share at least one checksum and have
    the same ``filepath`` and ``date_modified``. The ``%`` operator explains
    a failed comparison by listing the components that do match.
    """

    # Object members.
    # File name component of ``filepath``.
    basename = None
    # Modification date formatted as YYYY-MM-DD.
    date_modified = None
    # Directory component of ``filepath``.
    dirname = None
    # Path of the object expressed as if stored under data/objects.
    filepath = None
    # Container of checksums; only membership tests and iteration are used.
    hashes = None
    package_uuid = None
    package_name = None

    def __init__(self, path=None, transfer_path=None):
        """Populate the digital object metadata. If we don't supply a path
        we'll just return an empty object to be populated on our own terms.

        :param path: location of the file on disk.
        :param transfer_path: root of the transfer containing ``path``;
            required whenever ``path`` is given.
        :raises DigitalObjectException: if ``path`` is given without a
            ``transfer_path``.
        """
        if not path:
            self.basename = None
            self.date_modified = None
            self.dirname = None
            self.filepath = None
            self.hashes = []
            self.package_uuid = None
            self.package_name = None
            self.flag = False
            return

        if not transfer_path:
            raise DigitalObjectException("Transfer path isn't set")
        # Construct path as if it is in a Bag object.
        comparison_path = self._comparison_path(path, transfer_path)
        self.filepath = comparison_path
        self.set_basename(comparison_path)
        self.set_dirname(comparison_path)
        self.hashes = hashutils.hash(path)
        self.date_modified = self.get_timestamp(path)
        self.flag = False

    @staticmethod
    def _comparison_path(path, transfer_path):
        """Re-root ``path`` from ``transfer_path`` to ``data/objects``.

        Only a leading ``transfer_path`` is swapped, so a directory or file
        further down that happens to contain the same text is left alone,
        and a trailing separator on ``transfer_path`` cannot glue two path
        components together. A ``path`` that does not start with
        ``transfer_path`` is returned unchanged.
        """
        bag_root = os.path.join("data", "objects")
        if not path.startswith(transfer_path):
            return path
        remainder = path[len(transfer_path):].lstrip(os.sep)
        if not remainder:
            return bag_root
        return os.path.join(bag_root, remainder)

    def set_basename(self, path):
        """Store the file name component of ``path``."""
        self.basename = os.path.basename(path)

    def set_dirname(self, path):
        """Store the directory component of ``path``."""
        self.dirname = os.path.dirname(path)

    def as_dict(self):
        """Return the instance's attribute dictionary (not a copy)."""
        return self.__dict__

    def __str__(self):
        """Render the instance attributes as indented, key-sorted JSON."""
        return json.dumps(
            self.__dict__, sort_keys=True, indent=4, separators=(",", ": ")
        )

    def _shares_hash(self, other_hashes):
        """Return True if any of our checksums appears in ``other_hashes``.

        Membership and iteration are all that is required, so this works
        whether the checksums are held in a dict or in a list (as on an
        empty-constructed object).
        """
        if not self.hashes or not other_hashes:
            return False
        return any(checksum in other_hashes for checksum in self.hashes)

    def __eq__(self, other):
        """Comparison operator for the digital object class. If two hashes
        match, and the given file path and modification date, we will
        return True.

        Objects that lack the compared attributes yield ``NotImplemented``
        so Python can fall back to its default handling.
        """
        try:
            other_hashes = other.hashes
            other_filepath = other.filepath
            other_date = other.date_modified
        except AttributeError:
            return NotImplemented
        if not self._shares_hash(other_hashes):
            return False
        return (
            self.filepath == other_filepath
            and self.date_modified == other_date
        )

    def __ne__(self, other):
        """Inverse of ``__eq__``; Python 2 does not derive it for us."""
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __mod__(self, other):
        """Modulo operator for the digital object class. If two hashes match,
        and the given file-path, then return zero. If there is any partial
        match, then return basis information. % is potentially useful for
        debugging, or enhanced reporting.

        :returns: ``0`` when the objects are equal, otherwise a
            ``"; "``-separated string of the matching components, or
            ``"No matching components"`` when nothing matches.
        """
        equal = self.__eq__(other)
        if equal is NotImplemented:
            return NotImplemented
        if equal:
            return 0
        # Not equal: report which components nevertheless agree.
        ret = ""
        if self._shares_hash(other.hashes):
            ret = self.__concat_basis__(ret, "checksum match")
        if self.date_modified == other.date_modified:
            ret = self.__concat_basis__(ret, "date modified match")
        if self.basename == other.basename:
            ret = self.__concat_basis__(ret, "filename match")
        if self.dirname == other.dirname:
            ret = self.__concat_basis__(ret, "directory name match")
        if not ret:
            return "No matching components"
        return ret

    @staticmethod
    def __concat_basis__(ret, msg):
        """Helper function to bring basis information together usefully."""
        if ret:
            return "{}; {}".format(ret, msg)
        return msg

    @staticmethod
    def get_timestamp(path):
        """Return the modification date of ``path`` as YYYY-MM-DD, in local
        time.
        """
        return time.strftime("%Y-%m-%d", time.localtime(os.path.getmtime(path)))
Oops, something went wrong.