-
Notifications
You must be signed in to change notification settings - Fork 33
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Compare an accruals location to an aip store
The accruals->aips functionality introduces a digital object that can compare itself to other objects of the same class. Where file paths, checksums, and dates match, the objects are identical. Where they don't, users can use modulo (%) to identify where the objects differ. Much of the benefit of this work is derived from the nature of the AIP structure imposed on a digital transfer. That, however, might make 'true'-duplicate matching slightly rarer than just identifying checksum duplicates.
- Loading branch information
1 parent
a0d4eee
commit aaca9fa
Showing
8 changed files
with
451 additions
and
49 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
|
||
from __future__ import print_function, unicode_literals | ||
|
||
import copy | ||
import logging | ||
import os | ||
import sys | ||
|
||
try: | ||
from .appconfig import AppConfig | ||
from .digital_object import DigitalObject | ||
from . import duplicates | ||
from . import loggingconfig | ||
from .serialize_to_csv import CSVOut | ||
from . import utils | ||
except ValueError: | ||
from appconfig import AppConfig | ||
from digital_object import DigitalObject | ||
import duplicates | ||
import loggingconfig | ||
from serialize_to_csv import CSVOut | ||
import utils | ||
|
||
# Directory that holds this script; the "report.log" file configured in the
# ``__main__`` section is created inside it.
logging_dir = os.path.dirname(os.path.abspath(__file__))

# Module logger, explicitly left enabled.
logger = logging.getLogger("accruals")
logger.disabled = False

# Location purpose = Transfer Source (TS)
location_purpose = "TS"
# Description of the transfer source used when none is given on the
# command line.
default_location = "accruals"

# When true, stat_transfers() passes each transfer path through
# utils.get_docker_path() before walking it.
DOCKER = True

# Store our appraisal paths.
accrual_paths = []
|
||
|
||
def create_manifest(aip_index, accrual_objs):
    """Return the accrual objects that duplicate an object held in an AIP.

    :param aip_index: mapping that stores, under
        ``duplicates.MANIFEST_DATA``, a dictionary whose keys are compared
        against the entries of each accrual object's ``hashes`` and whose
        values are iterables of indexed AIP objects.
    :param accrual_objs: iterable of objects exposing ``hashes`` and
        supporting ``==`` against the indexed AIP objects.
    :returns: list with one shallow copy of the accrual object per equal
        AIP object; each copy's ``package_name`` is taken from that AIP
        object. Empty when the index holds no manifest data.
    """
    # A missing manifest entry is treated as an empty index instead of
    # failing later on a ``None`` lookup.
    aip_obj_hashes = aip_index.get(duplicates.MANIFEST_DATA) or {}
    # Flatten the index once; it does not change while we iterate, so there
    # is no need to re-walk every bucket for every accrual object.
    indexed_items = [
        entry for bucket in aip_obj_hashes.values() for entry in bucket
    ]
    dupes = []
    for accrual_obj in accrual_objs:
        for accrual_hash in accrual_obj.hashes:
            if accrual_hash not in aip_obj_hashes:
                continue
            # NOTE(review): every indexed item is compared, not only the
            # ones filed under ``accrual_hash``; kept as-is because the
            # equality operator also checks path and date.
            for item in indexed_items:
                if accrual_obj == item:
                    cp = copy.copy(accrual_obj)
                    cp.package_name = item.package_name
                    dupes.append(cp)
            # Only need one hash to match then break.
            # May also be redundant as we only have one hash from the
            # bag manifests...
            break
    return dupes
|
||
|
||
def create_comparison_obj(transfer_path):
    """Build a DigitalObject for every file found beneath a transfer.

    :param transfer_path: directory to walk recursively; it is also passed
        to each DigitalObject as its transfer path.
    :returns: list of DigitalObject instances, one per entry that
        ``os.path.isfile`` accepts. Empty when the directory contains no
        files or cannot be walked.
    """
    transfer_arr = []
    for root, _, files in os.walk(transfer_path, topdown=True):
        candidates = (os.path.join(root, name) for name in files)
        # os.walk can list names that are not regular files (for example
        # broken symlinks); those are skipped.
        transfer_arr.extend(
            DigitalObject(file_, transfer_path)
            for file_ in candidates
            if os.path.isfile(file_)
        )
    return transfer_arr
|
||
|
||
def stat_transfers(accruals_path, all_transfers):
    """Compare every transfer directory against the AIP index and report.

    For each transfer name the objects below ``accruals_path/<transfer>``
    are collected and matched against the index returned by
    ``duplicates.retrieve_aip_index()``. The collected objects and the
    per-transfer duplicate lists are then handed to
    ``CSVOut.stat_manifests`` and ``CSVOut.csv_out`` respectively.

    :param accruals_path: path of the transfer source location.
    :param all_transfers: iterable of transfer directory names relative to
        ``accruals_path``; ``None`` is treated as no transfers.
    :returns: None
    """
    aip_index = duplicates.retrieve_aip_index()
    reports = []
    transfers = []
    # main() obtains this value with dict.get(), so it can be None when the
    # response carries no "directories" entry.
    for transfer in all_transfers or []:
        transfer_home = os.path.join(accruals_path, transfer)
        if DOCKER:
            transfer_home = utils.get_docker_path(transfer_home)
        objs = create_comparison_obj(transfer_home)
        transfers.append(objs)
        reports.append({transfer: create_manifest(aip_index, objs)})
    CSVOut.stat_manifests(aip_index, transfers)
    CSVOut.csv_out(reports, "")
|
||
|
||
def main(location=default_location):
    """Primary entry point for this script.

    Looks through the storage locations reported by the client for one
    whose purpose is ``location_purpose`` and whose description equals
    ``location``, points the client at it, and runs ``stat_transfers`` over
    the directories the client lists there.

    :param location: description of the transfer source location to use.
    :returns: None
    :raises SystemExit: when no matching transfer source is found.
    """
    am = AppConfig().get_am_client()
    sources = am.list_storage_locations()

    accruals = False
    # A response without an "objects" entry is handled like an empty list so
    # that the "no transfer source" exit below is reached.
    for source in sources.get("objects") or []:
        if (
            source.get("purpose") == location_purpose
            and source.get("description") == location
        ):
            # Point the client at the matching transfer source. The loop is
            # not cut short, so if several locations match the last wins.
            am.transfer_source = source.get("uuid")
            am.transfer_path = source.get("path")
            accruals = True
    if not accruals:
        logger.info("Exiting. No transfer source: {}".format(location))
        sys.exit()

    # All transfer directories. Assumption is the same as Archivematica that
    # each transfer is organized into a single directory at this level.
    all_transfers = am.transferables().get("directories")
    stat_transfers(am.transfer_path, all_transfers)
|
||
|
||
if __name__ == "__main__":
    # Script entry: log to report.log next to this file, then run main()
    # against the location named by the first command-line argument, or the
    # default location when no argument is given.
    loggingconfig.setup("INFO", os.path.join(logging_dir, "report.log"))
    source = default_location
    if len(sys.argv) > 1:
        source = sys.argv[1]
        # This is progress information, not a failure, so log it at INFO.
        logger.info("Attempting to find transfers at: %s", source)
    sys.exit(main(source))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
|
||
"""Digital object class to help with matching.""" | ||
|
||
import json | ||
import os | ||
import time | ||
|
||
try: | ||
from . import hashutils | ||
except ImportError: | ||
import hashutils | ||
|
||
|
||
class DigitalObjectException(Exception):
    """Raised when a digital object cannot be built as requested."""
|
||
|
||
class DigitalObject(object):
    """File-level record that can be compared with other such records.

    Two records are equal (``==``) when they share at least one hash and
    have the same file path and modification date. The modulo operator
    (``%``) returns ``0`` for equal records and otherwise a text summary of
    which components still match.
    """

    # Object members.
    # File name component of ``filepath``.
    basename = None
    # Modification date formatted as YYYY-MM-DD (see get_timestamp).
    date_modified = None
    # Directory component of ``filepath``.
    dirname = None
    # Path of the file as if it were stored under data/objects in a Bag.
    filepath = None
    # Collection of hash values; iterating it must yield the hashes.
    hashes = None
    # Identifier and name of the package the object belongs to; not set by
    # this class when a path is given.
    package_uuid = None
    package_name = None

    def __init__(self, path=None, transfer_path=None):
        """Populate the digital object metadata. If we don't supply a path
        we'll just return an empty object to be populated on our own terms.

        :param path: path of the file on disk, or a false value to create
            an empty object.
        :param transfer_path: transfer directory that ``path`` lives in;
            required when ``path`` is given.
        :raises DigitalObjectException: when ``path`` is given without
            ``transfer_path``.
        """
        if not path:
            self.basename = None
            self.date_modified = None
            self.dirname = None
            self.filepath = None
            self.hashes = []
            self.package_uuid = None
            self.package_name = None
            return

        if not transfer_path:
            raise DigitalObjectException("Transfer path isn't set")
        comparison_path = self._comparison_path(path, transfer_path)
        self.filepath = comparison_path
        self.set_basename(comparison_path)
        self.set_dirname(comparison_path)
        self.hashes = hashutils.hash(path)
        self.date_modified = self.get_timestamp(path)

    @staticmethod
    def _comparison_path(path, transfer_path):
        """Construct path as if it is in a Bag object.

        Only the first occurrence of ``transfer_path`` is swapped for
        ``data/objects`` so that a directory further down the tree that
        happens to repeat the transfer path is left untouched.
        """
        return path.replace(transfer_path, os.path.join("data", "objects"), 1)

    def set_basename(self, path):
        """Store the file name component of ``path`` in ``basename``."""
        self.basename = os.path.basename(path)

    def set_dirname(self, path):
        """Store the directory component of ``path`` in ``dirname``."""
        self.dirname = os.path.dirname(path)

    def as_dict(self):
        """Return the instance attribute dictionary itself (not a copy)."""
        return self.__dict__

    def __str__(self):
        """Return the instance attributes as indented, key-sorted JSON."""
        return json.dumps(
            self.__dict__, sort_keys=True, indent=4, separators=(",", ": ")
        )

    def __eq__(self, other):
        """Comparison operator for the digital object class. If any hash
        matches, and the file path and modification date are the same, we
        will return True.

        ``hashes`` may be a dictionary keyed by hash value or a plain
        list (the empty constructor creates a list), so both sides are
        only iterated / tested for membership. Objects lacking the
        compared attributes yield ``NotImplemented``.
        """
        try:
            other_hashes = other.hashes
            other_filepath = other.filepath
            other_date = other.date_modified
        except AttributeError:
            return NotImplemented
        if self.filepath != other_filepath:
            return False
        if self.date_modified != other_date:
            return False
        other_hashes = other_hashes or ()
        return any(key in other_hashes for key in self.hashes or ())

    def __ne__(self, other):
        """Inverse of ``__eq__``; needed explicitly on Python 2, which
        does not derive ``!=`` from ``==``.
        """
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __mod__(self, other):
        """Modulo operator for the digital object class. If two hashes match,
        and the given file-path, then return zero. If there is any partial
        match, then return basis information. % is potentially useful for
        debugging, or enhanced reporting.

        :returns: ``0`` when the objects are equal, otherwise a string
            listing the matching components separated by "; ", or
            "No matching components".
        """
        if self == other:
            return 0
        # Not equal: report which individual components still agree.
        ret = ""
        if self.date_modified == other.date_modified:
            ret = self.__concat_basis__(ret, "date modified match")
        if self.basename == other.basename:
            ret = self.__concat_basis__(ret, "filename match")
        if self.dirname == other.dirname:
            ret = self.__concat_basis__(ret, "directory name match")
        if not ret:
            return "No matching components"
        return ret

    @staticmethod
    def __concat_basis__(ret, msg):
        """Helper function to bring basis information together usefully."""
        if ret:
            return "{}; {}".format(ret, msg)
        return msg

    @staticmethod
    def get_timestamp(path):
        """Return the modification date of ``path`` as YYYY-MM-DD in local
        time.
        """
        return time.strftime("%Y-%m-%d", time.localtime(os.path.getmtime(path)))
Oops, something went wrong.