Skip to content

Commit

Permalink
Compare an accruals location to an aip store
Browse files Browse the repository at this point in the history
accruals->aips functionality introduces a digital object that has
the ability to compare itself to other objects of the same class.

Where file paths, checksums, and dates all match, the objects are
identical. Where they don't, users can use modulo (%) to identify
which parts of the two objects still match.

Much of the benefit of this work is derived from the nature of the
AIP structure imposed on a digital transfer.

That, however, might make 'true'-duplicate matching slightly more
rare than just identifying checksum duplicates.
  • Loading branch information
ross-spencer committed Jul 3, 2019
1 parent a0d4eee commit aaca9fa
Show file tree
Hide file tree
Showing 8 changed files with 451 additions and 49 deletions.
126 changes: 126 additions & 0 deletions reports/duplicates/accruals.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import print_function, unicode_literals

import copy
import logging
import os
import sys

try:
    from .appconfig import AppConfig
    from .digital_object import DigitalObject
    from . import duplicates
    from . import loggingconfig
    from .serialize_to_csv import CSVOut
    from . import utils
except (ImportError, ValueError):
    # Running as a plain script rather than as part of a package: a relative
    # import then fails with ValueError on Python 2 and ImportError on
    # Python 3, so both are caught before falling back to sibling modules.
    from appconfig import AppConfig
    from digital_object import DigitalObject
    import duplicates
    import loggingconfig
    from serialize_to_csv import CSVOut
    import utils

# Directory containing this script; the report log is written alongside it.
logging_dir = os.path.dirname(os.path.abspath(__file__))

# Module logger, explicitly enabled.
logger = logging.getLogger("accruals")
logger.disabled = False

# Location purpose = Transfer Source (TS)
location_purpose = "TS"
# Description of the transfer source looked up when none is given on the
# command line.
default_location = "accruals"


# When True, transfer paths are passed through utils.get_docker_path() before
# being walked. TODO: make this configurable rather than hard-coded.
DOCKER = True

# Store our appraisal paths.
# NOTE(review): not referenced by the code visible in this file - confirm
# whether it is still needed.
accrual_paths = []


def create_manifest(aip_index, accrual_objs):
    """Return the accrual objects that also exist in the AIP index.

    :param aip_index: mapping holding, under ``duplicates.MANIFEST_DATA``,
        a mapping of hash -> list of objects found in the AIP store.
    :param accrual_objs: iterable of digital objects from one transfer.
    :returns: list of shallow copies of the matching accrual objects, each
        with ``package_name`` set from the AIP object it matched.
    """
    # A missing manifest section is treated as an empty store instead of
    # failing with an AttributeError on None.
    aip_obj_hashes = aip_index.get(duplicates.MANIFEST_DATA) or {}
    dupes = []
    for accrual_obj in accrual_objs:
        # Only one of the object's hashes needs to be known to the index
        # before the comparison is run (once) for that object. This may be
        # redundant as we only have one hash from the bag manifests.
        if not any(
            accrual_hash in aip_obj_hashes
            for accrual_hash in accrual_obj.hashes
        ):
            continue
        for aip_items in aip_obj_hashes.values():
            for item in aip_items:
                if accrual_obj == item:
                    # Copy so the original accrual object is not mutated.
                    cp = copy.copy(accrual_obj)
                    cp.package_name = item.package_name
                    dupes.append(cp)
    return dupes


def create_comparison_obj(transfer_path):
    """Return a DigitalObject for every file found beneath a transfer.

    The directory tree rooted at ``transfer_path`` is walked top-down and
    each entry that is a regular file is wrapped in a ``DigitalObject``
    created with the file's path and ``transfer_path``.
    """
    candidates = (
        os.path.join(parent, filename)
        for parent, _subdirs, filenames in os.walk(transfer_path, topdown=True)
        for filename in filenames
    )
    return [
        DigitalObject(candidate, transfer_path)
        for candidate in candidates
        if os.path.isfile(candidate)
    ]


def stat_transfers(accruals_path, all_transfers):
    """Compare every transfer under the accruals location with the AIP index.

    For each transfer directory the contained files are turned into digital
    objects and matched against the AIP index; the collected results are
    then handed to ``CSVOut`` for output.
    """
    aip_index = duplicates.retrieve_aip_index()
    transfers, reports = [], []
    for transfer_name in all_transfers:
        location = os.path.join(accruals_path, transfer_name)
        transfer_objs = create_comparison_obj(
            utils.get_docker_path(location) if DOCKER else location
        )
        transfers.append(transfer_objs)
        manifest = create_manifest(aip_index, transfer_objs)
        reports.append({transfer_name: manifest})
    CSVOut.stat_manifests(aip_index, transfers)
    CSVOut.csv_out(reports, "")


def main(location=default_location):
    """Locate the accruals transfer source and report on its transfers.

    The storage locations are searched for a Transfer Source whose
    description equals ``location``. If none is found the script logs the
    fact and exits; otherwise every transferable directory in that source
    is passed on to ``stat_transfers``.
    """
    am = AppConfig().get_am_client()
    sources = am.list_storage_locations()

    found = False
    for candidate in sources.get("objects"):
        if candidate.get("purpose") != location_purpose:
            continue
        if candidate.get("description") != location:
            continue
        # Remember the matching transfer source on the client.
        am.transfer_source = candidate.get("uuid")
        am.transfer_path = candidate.get("path")
        found = True

    if not found:
        logger.info("Exiting. No transfer source: {}".format(location))
        sys.exit()

    # All transfer directories. Assumption is the same as Archivematica that
    # each transfer is organized into a single directory at this level.
    all_transfers = am.transferables().get("directories")
    stat_transfers(am.transfer_path, all_transfers)


if __name__ == "__main__":
    loggingconfig.setup("INFO", os.path.join(logging_dir, "report.log"))
    # An optional first command-line argument overrides the default
    # transfer source description.
    source = default_location
    if len(sys.argv) > 1:
        source = sys.argv[1]
        # Informational message, so it is logged at info rather than error.
        logger.info("Attempting to find transfers at: %s", source)
    sys.exit(main(source))
122 changes: 122 additions & 0 deletions reports/duplicates/digital_object.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""Digital object class to help with matching."""

import json
import os
import time

try:
    from . import hashutils
except (ImportError, ValueError):
    # Imported as a top-level module rather than from a package: the relative
    # import fails with ImportError on Python 3 and ValueError on Python 2.
    import hashutils


class DigitalObjectException(Exception):
    """Raised when a digital object cannot be created from its inputs."""


class DigitalObject(object):
    """Metadata about a single file that can be compared with its peers.

    Two objects are equal (``==``) when they share at least one hash and
    have the same ``filepath`` and ``date_modified``. When they are not
    equal, ``%`` describes which of the other components still match.
    """

    # Object members. These class-level values are the fallbacks that are
    # read when an instance has not set the attribute itself.
    basename = None
    date_modified = None
    dirname = None
    filepath = None
    hashes = None
    package_uuid = None
    package_name = None

    def __init__(self, path=None, transfer_path=None):
        """Populate the digital object metadata.

        If no path is supplied an empty object is created, to be populated
        by the caller on its own terms.

        :param path: path of the file to describe.
        :param transfer_path: root of the transfer containing ``path``;
            required whenever ``path`` is given.
        :raises DigitalObjectException: if ``path`` is given without a
            ``transfer_path``.
        """
        if not path:
            self.basename = None
            self.date_modified = None
            self.dirname = None
            self.filepath = None
            self.hashes = []
            self.package_uuid = None
            self.package_name = None
            return

        if not transfer_path:
            raise DigitalObjectException("Transfer path isn't set")
        # Construct path as if it is in a Bag object. Only the first
        # occurrence is replaced so that a transfer path repeated further
        # down the file path is left untouched.
        comparison_path = path.replace(
            transfer_path, os.path.join("data", "objects"), 1
        )
        self.filepath = comparison_path
        self.set_basename(comparison_path)
        self.set_dirname(comparison_path)
        self.hashes = hashutils.hash(path)
        self.date_modified = self.get_timestamp(path)

    def set_basename(self, path):
        """Set ``basename`` to the final component of ``path``."""
        self.basename = os.path.basename(path)

    def set_dirname(self, path):
        """Set ``dirname`` to the directory portion of ``path``."""
        self.dirname = os.path.dirname(path)

    def as_dict(self):
        """Return the instance attribute dictionary (not a copy)."""
        return self.__dict__

    def __str__(self):
        """Return the instance attributes as indented, key-sorted JSON."""
        return json.dumps(
            self.__dict__, sort_keys=True, indent=4, separators=(",", ": ")
        )

    def __eq__(self, other):
        """Comparison operator for the digital object class.

        Returns True when the two objects share at least one hash and have
        the same file path and modification date. Objects that do not look
        like a digital object yield ``NotImplemented`` so Python can fall
        back to its default comparison.
        """
        try:
            other_hashes = other.hashes
            other_filepath = other.filepath
            other_date_modified = other.date_modified
        except AttributeError:
            return NotImplemented
        if self.filepath != other_filepath:
            return False
        if self.date_modified != other_date_modified:
            return False
        # Hashes may be a mapping, a list (empty object) or None (never
        # populated); plain iteration and membership cover all three.
        own_hashes = self.hashes or ()
        other_hashes = other_hashes or ()
        return any(key in other_hashes for key in own_hashes)

    def __ne__(self, other):
        """Inverse of ``__eq__``; Python 2 does not derive this itself."""
        outcome = self.__eq__(other)
        if outcome is NotImplemented:
            return outcome
        return not outcome

    def __mod__(self, other):
        """Modulo operator for the digital object class.

        Returns zero when the two objects are equal. Otherwise returns a
        string naming the components that do match, or a fixed message when
        nothing matches. % is potentially useful for debugging, or enhanced
        reporting.
        """
        matched = self.__eq__(other)
        if matched is NotImplemented:
            return NotImplemented
        if matched:
            return 0
        # Not equal: collect basis information about the partial match.
        ret = ""
        if self.date_modified == other.date_modified:
            ret = self.__concat_basis__(ret, "date modified match")
        if self.basename == other.basename:
            ret = self.__concat_basis__(ret, "filename match")
        if self.dirname == other.dirname:
            ret = self.__concat_basis__(ret, "directory name match")
        if not ret:
            return "No matching components"
        return ret

    @staticmethod
    def __concat_basis__(ret, msg):
        """Helper function to bring basis information together usefully."""
        if ret:
            return "{}; {}".format(ret, msg)
        return msg

    @staticmethod
    def get_timestamp(path):
        """Return the file's modification date as ``YYYY-MM-DD`` local time."""
        return time.strftime("%Y-%m-%d", time.localtime(os.path.getmtime(path)))
Loading

0 comments on commit aaca9fa

Please sign in to comment.