diff --git a/reports/README.md b/reports/README.md
new file mode 100644
index 00000000..a19810af
--- /dev/null
+++ b/reports/README.md
@@ -0,0 +1,9 @@
+# Automation Tools Reports Module
+
+A collection of reporting scripts that can be run independently of the
+automation tools or in concert with them.
+
+## Duplicates
+
+The duplicates report identifies duplicate entries across AIPs in your
+entire AIP store. See the [README](duplicates/README.md).
diff --git a/reports/__init__.py b/reports/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/reports/duplicates/README.md b/reports/duplicates/README.md
new file mode 100644
index 00000000..882b3511
--- /dev/null
+++ b/reports/duplicates/README.md
@@ -0,0 +1,133 @@
+# Duplicates
+
+The duplicates report identifies duplicate entries across AIPs in your
+entire AIP store.
+
+## Configuration
+
+**Python**
+
+The duplicates module has its own dependencies. To ensure it can run, please
+install these first:
+
+* `$ sudo pip install -r requirements.txt`
+
+**Storage Service**
+
+To configure your report, modify [config.json](config.json) with the
+information needed to connect to your Storage Service, e.g.:
+```json
+{
+    "storage_service_url": "http://127.0.0.1:62081",
+    "storage_service_user": "test",
+    "storage_service_api_key": "test"
+}
+```
+
+## Running the script
+
+Once configured, there are a number of ways to run the script:
+
+* **From the duplicates directory:** `$ python duplicates.py`
+* **From the reports folder as a module:** `$ python -m duplicates.duplicates`
+* **From the automation-tools folder as a module:** `$ python -m reports.duplicates.duplicates`
+
+## Output
+
+The tool has two outputs:
+
+* `aipstore-duplicates.json`
+* `aipstore-duplicates.csv`
+
+A description of each follows:
+
+* **JSON**: Reports the packages across which duplicates have been found and
+lists the duplicate objects organized by checksum. This output might be useful
+for developers creating other tooling around this work, e.g. visualizations,
+as JSON is an easy-to-manipulate standard in most programming languages.
+
+The JSON output is organised as follows:
+```json
+{
+    "manifest_data": {
+        "{matched-checksum-1}": [
+            {
+                "basename": "{filename}",
+                "date_modified": "{modified-date}",
+                "dirname": "{directory-name}",
+                "filepath": "{relative-path}",
+                "package_name": "{package-name}",
+                "package_uuid": "{package-uuid}"
+            },
+            {
+                "basename": "{filename}",
+                "date_modified": "{modified-date}",
+                "dirname": "{directory-name}",
+                "filepath": "{relative-path}",
+                "package_name": "{package-name}",
+                "package_uuid": "{package-uuid}"
+            },
+            {
+                "basename": "{filename}",
+                "date_modified": "{modified-date}",
+                "dirname": "{directory-name}",
+                "filepath": "{relative-path}",
+                "package_name": "{package-name}",
+                "package_uuid": "{package-uuid}"
+            }
+        ],
+        "{matched-checksum-2}": [
+            {
+                "basename": "{filename}",
+                "date_modified": "{modified-date}",
+                "dirname": "{directory-name}",
+                "filepath": "{relative-path}",
+                "package_name": "{package-name}",
+                "package_uuid": "{package-uuid}"
+            },
+            {
+                "basename": "{filename}",
+                "date_modified": "{modified-date}",
+                "dirname": "{directory-name}",
+                "filepath": "{relative-path}",
+                "package_name": "{package-name}",
+                "package_uuid": "{package-uuid}"
+            }
+        ]
+    },
+    "packages": {
+        "{package-uuid}": "{package-name}",
+        "{package-uuid}": "{package-name}"
+    }
+}
+```
+
+* **CSV**: Reports the same information as a 2D representation. The CSV is
+ready-made to be manipulated in tools such as
+[OpenRefine](http://openrefine.org/). The CSV resizes dynamically where rows
+have different numbers of duplicate files to report.
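+
+As an illustration, a minimal sketch (standard-library Python only) that
+summarises the JSON report, assuming it has already been generated in the
+current directory:
+```python
+import json
+
+# Load the generated report from the working directory.
+with open("aipstore-duplicates.json") as report_file:
+    report = json.load(report_file)
+
+# Each manifest_data key is a checksum; its value lists every copy found.
+for checksum, copies in report.get("manifest_data", {}).items():
+    packages = sorted({copy["package_uuid"] for copy in copies})
+    print("{} copies of {}... found in: {}".format(
+        len(copies), checksum[:8], ", ".join(packages)))
+```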
+
+## Process followed
+
+Much of the work done by this package relies on the
+[amclient package](https://github.com/artefactual-labs/amclient). The process
+used to create a report is as follows (a sketch of the manifest retrieval in
+step 2 follows this list):
+
+1. Retrieve a list of all AIPs across all pipelines.
+2. For every AIP, download the bag manifest for the AIP (all manifest
+permutations are tested, so all duplicates are discovered whether you are
+using MD5, SHA1 or SHA256 in your Archivematica instances).
+3. For every entry in the bag manifest, record the checksum, package, and path.
+4. Filter objects with matching checksums into a duplicates report.
+5. For every matched file in the duplicates report, download the package METS
+file.
+6. Using the METS file, augment the report with date_modified information.
+(Other data might be added in future.)
+7. Output the report as JSON to `aipstore-duplicates.json`.
+8. Re-format the report and output it as a 2D table to `aipstore-duplicates.csv`.
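+
+The retrieval step can be sketched in a few lines with amclient. The
+connection details below are placeholders (duplicates.py builds its client
+from [config.json](config.json)), and only the SHA-256 manifest is fetched
+for brevity:
+```python
+import os
+
+from amclient import AMClient
+
+# Placeholder Storage Service connection details.
+am = AMClient()
+am.ss_url = "http://127.0.0.1:62081"
+am.ss_user_name = "test"
+am.ss_api_key = "test"
+
+for aip in am.aips():
+    package_name = os.path.basename(aip.get("current_path")).replace(".7z", "")
+    # Ask the Storage Service to extract one bag manifest from the package.
+    am.package_uuid = aip.get("uuid")
+    am.relative_path = "{}/manifest-sha256.txt".format(package_name)
+    am.saveas_filename = "/tmp/{}-manifest-sha256.txt".format(package_name)
+    am.extract_file()
+```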
+
+## Future work
+
+As a standalone module, the duplicates work could be developed in a number of
+ways that might be desirable in an archival appraisal workflow.
diff --git a/reports/duplicates/__init__.py b/reports/duplicates/__init__.py
new file mode 100755
index 00000000..e69de29b
diff --git a/reports/duplicates/appconfig.py b/reports/duplicates/appconfig.py
new file mode 100644
index 00000000..59f6d462
--- /dev/null
+++ b/reports/duplicates/appconfig.py
@@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+
+import json
+import os
+
+from amclient import AMClient
+
+
+class AppConfig:
+    def __init__(self):
+        """Initialize class."""
+        config_file = os.path.join(os.path.dirname(__file__), "config.json")
+        self._load_config(config_file)
+
+    def _load_config(self, config_file):
+        """Load our configuration information."""
+        with open(config_file) as json_config:
+            conf = json.load(json_config)
+            self.storage_service_user = conf.get("storage_service_user")
+            self.storage_service_api_key = conf.get("storage_service_api_key")
+            self.storage_service_url = conf.get("storage_service_url")
+
+    def get_am_client(self):
+        """Return an Archivematica API client to the caller."""
+        am = AMClient()
+        am.ss_url = self.storage_service_url
+        am.ss_user_name = self.storage_service_user
+        am.ss_api_key = self.storage_service_api_key
+        return am
diff --git a/reports/duplicates/config.json b/reports/duplicates/config.json
new file mode 100644
index 00000000..515498b3
--- /dev/null
+++ b/reports/duplicates/config.json
@@ -0,0 +1,5 @@
+{
+    "storage_service_url": "http://127.0.0.1:62081",
+    "storage_service_user": "test",
+    "storage_service_api_key": "test"
+}
diff --git a/reports/duplicates/duplicates.feature b/reports/duplicates/duplicates.feature
new file mode 100644
index 00000000..01f302f7
--- /dev/null
+++ b/reports/duplicates/duplicates.feature
@@ -0,0 +1,20 @@
+Feature: Identify true duplicates in the Archivematica AIP store.
+
+Background: Alma uses checksums and archival context to determine "true" duplicate files in their collection (i.e. the context of creation and use is identical).
+
+Scenario: Generate a duplicates report
+    Given an AIP has been ingested
+    When a duplicate checksum is found
+    Then a duplicates report is generated
+
+Scenario Outline: Detect a true duplicate file
+    Given a duplicates report is available
+    When a file's <properties> are equivalent
+    Then the files are true duplicates
+
+    Examples:
+    | properties |
+    | AIP dir_name |
+    | base_name |
+    | file_path |
+    | date_modified |
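The Scenario Outline above reduces to a property-by-property comparison of two
report entries that share a checksum. A minimal sketch of that check follows;
the helper name and the default property tuple are illustrative assumptions
rather than code from this patch:

```python
# Properties (besides the checksum) that make up the archival context used to
# decide whether two copies are "true" duplicates.
CONTEXT_PROPERTIES = ("dirname", "basename", "filepath", "date_modified")


def is_true_duplicate(entry_a, entry_b, properties=CONTEXT_PROPERTIES):
    """Return True when every context property matches across both entries."""
    return all(entry_a.get(prop) == entry_b.get(prop) for prop in properties)
```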
diff --git a/reports/duplicates/duplicates.py b/reports/duplicates/duplicates.py
new file mode 100644
index 00000000..641478a3
--- /dev/null
+++ b/reports/duplicates/duplicates.py
@@ -0,0 +1,253 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""Return all duplicates from the Archivematica Storage Service.
+
+Example usage:
+
+    $:~/git/archivematica/automation-tools$ python -m reports.duplicates.duplicates 2> /dev/null
+
+Duplicate entries, per algorithm found, will be output to stdout, e.g.:
+
+    {
+        "manifest_data": {
+            "078917a9ba3eb290ddb27f97d904cf6e24fec5f62a1986fdf760c07d6d4dd30e": [
+                {
+                    "date_modified": "2018-01-31",
+                    "filepath": "data/objects/sci-fi.jpg",
+                    "package_name": "1-588790bd-b9dd-4460-9705-d14f8700dba3",
+                    "package_uuid": "588790bd-b9dd-4460-9705-d14f8700dba3"
+                },
+                {
+                    "date_modified": "2018-01-31",
+                    "filepath": "data/objects/sci-fi.jpg",
+                    "package_name": "2-ba01e3f6-eb6b-4eb5-a8a8-c1ae10200b66",
+                    "package_uuid": "ba01e3f6-eb6b-4eb5-a8a8-c1ae10200b66"
+                }
+            ],
+            "233aa737752ffb64942ca18f03dd6d316957c5b7a0c439e07cdae9963794c315": [
+                {
+                    "date_modified": "2018-02-01",
+                    "filepath": "data/objects/garage.jpg",
+                    "package_name": "1-588790bd-b9dd-4460-9705-d14f8700dba3",
+                    "package_uuid": "588790bd-b9dd-4460-9705-d14f8700dba3"
+                },
+                {
+                    "date_modified": "2018-02-01",
+                    "filepath": "data/objects/garage.jpg",
+                    "package_name": "2-ba01e3f6-eb6b-4eb5-a8a8-c1ae10200b66",
+                    "package_uuid": "ba01e3f6-eb6b-4eb5-a8a8-c1ae10200b66"
+                }
+            ]
+        },
+        "packages": {
+            "588790bd-b9dd-4460-9705-d14f8700dba3": "1-588790bd-b9dd-4460-9705-d14f8700dba3",
+            "ba01e3f6-eb6b-4eb5-a8a8-c1ae10200b66": "2-ba01e3f6-eb6b-4eb5-a8a8-c1ae10200b66"
+        }
+    }
+
+The script utilizes the AM Client module. The fulcrum is the extract_file
+endpoint and the bag manifest. An example use, if we call the endpoint
+directly via the API, is:
+
+    http -v --pretty=format \
+        GET "http://127.0.0.1:62081/api/v2/file/18c87e78-ea18-4a95-9446-e7100f52ab86/extract_file/?relative_path_to_file=1-18c87e78-ea18-4a95-9446-e7100f52ab86/manifest-sha256.txt" \
+        Authorization:"ApiKey test:test" | less
+
+"""
+from __future__ import print_function, unicode_literals
+
+import json
+import logging
+import os
+import shutil
+import sys
+from tempfile import mkdtemp
+
+try:
+    from . import loggingconfig
+    from .appconfig import AppConfig
+    from .parsemets import read_premis_data
+    from .serialize_to_csv import CSVOut
+except (ImportError, ValueError):
+    # Fall back to plain imports when run directly from this directory.
+    import loggingconfig
+    from appconfig import AppConfig
+    from parsemets import read_premis_data
+    from serialize_to_csv import CSVOut
+
+logging_dir = os.path.dirname(os.path.abspath(__file__))
+
+
+logger = logging.getLogger("duplicates")
+logger.disabled = False
+
+
+class ExtractError(Exception):
+    """Custom exception for handling extract errors."""
+
+
+def json_pretty_print(json_string):
+    """Pretty print a JSON-serializable structure."""
+    return json.dumps(json_string, sort_keys=True, indent=4)
+
+
+def retrieve_file(am, package_uuid, save_as_loc, relative_path):
+    """Helper function to retrieve our files from the Storage Service."""
+    am.package_uuid = package_uuid
+    am.saveas_filename = save_as_loc
+    am.relative_path = relative_path
+    # We can read the response headers if we like.
+    resp = am.extract_file()
+    if isinstance(resp, int) or resp is None:
+        raise ExtractError("Unable to retrieve file from the Storage Service")
+    return resp
+
+
+def filter_aip_files(filepath, package_uuid):
+    """Don't return AIP special files as duplicates."""
+    filepath = filepath.strip()
+    uuid_replace = "{{package-uuid}}"
+    transfer_files = [
+        ["data/logs/transfers/", "/logs/filenameCleanup.log"],
+        ["data/objects/metadata/transfers/", "/directory_tree.txt"],
+        ["data/logs/transfers/", "/logs/fileFormatIdentification.log"],
+        ["data/objects/submissionDocumentation/", "/METS.xml"],
+    ]
+    aip_files = [
+        "data/logs/filenameCleanup.log",
+        "data/README.html",
+        "data/logs/fileFormatIdentification.log",
+        "data/METS.{{package-uuid}}.xml",
+    ]
+    for file_ in transfer_files:
+        if file_[0] in filepath and file_[1] in filepath:
+            logger.info("Filtering: %s", filepath)
+            return True
+    for file_ in aip_files:
+        if file_.replace(uuid_replace, package_uuid) == filepath:
+            logger.info("Filtering: %s", filepath)
+            return True
+        if file_ == filepath:
+            logger.info("Filtering: %s", filepath)
+            return True
+    return False
+
+
+def augment_data(package_uuid, duplicate_report, date_info):
+    """Add date_modified values from the METS to the duplicates report."""
+    manifest_data = duplicate_report.get("manifest_data", {})
+    for key, value in manifest_data.items():
+        for package in value:
+            if package_uuid != package.get("package_uuid", ""):
+                continue
+            for dates in date_info:
+                # Manifest paths are bag-relative; METS paths are relative to
+                # the data/ directory, so remove that prefix before comparing.
+                prefix = os.path.join("data", "")
+                path_ = package.get("filepath", "")
+                if path_.startswith(prefix):
+                    path_ = path_[len(prefix):]
+                if path_ == dates.get("filepath", ""):
+                    package["date_modified"] = dates.get("date_modified", "")
+
+
+def read_mets(mets_loc):
+    """Read the package METS file and return PREMIS data of interest."""
+    return read_premis_data(mets_loc)
+
+
+def retrieve_mets(am, duplicate_report, temp_dir):
+    """Retrieve the METS file for each package with duplicate files and pull
+    useful information from it.
+    """
+    for package_uuid, package_name in duplicate_report.get("packages", {}).items():
+        mets = "{}/data/METS.{}.xml".format(package_name, package_uuid)
+        save_as_loc = os.path.join(temp_dir, mets.replace("/", "-"))
+        if not os.path.exists(save_as_loc):
+            try:
+                retrieve_file(am, package_uuid, save_as_loc, mets)
+                data = read_mets(save_as_loc)
+                augment_data(package_uuid, duplicate_report, data)
+            except ExtractError as err:
+                logger.info(err)
+                continue
+
+
+def filter_duplicates(duplicate_report):
+    """Filter our report for packages containing duplicates only."""
+    dupes = dict(duplicate_report.get("manifest_data", {}))
+    packages = {}
+    for key, values in dupes.items():
+        if len(values) > 1:
+            for entry in values:
+                packages[entry.get("package_uuid")] = entry.get("package_name")
+        else:
+            try:
+                duplicate_report.get("manifest_data", {}).pop(key)
+                logger.info("Popped checksum: %s", key)
+            except (AttributeError, KeyError):
+                raise ExtractError("Error filtering report for duplicates")
+    duplicate_report["packages"] = packages
+    return duplicate_report
+
+
+def output_report(duplicate_report):
+    """Provide mechanisms to output different serializations."""
+    with open("aipstore-duplicates.json", "w") as json_file:
+        json_file.write(json_pretty_print(duplicate_report))
+    print(json_pretty_print(duplicate_report))
+    CSVOut.csv_out(duplicate_report, "aipstore-duplicates.csv")
+
+
+def main():
+    """Script's primary entry-point."""
+    temp_dir = mkdtemp()
+    loggingconfig.setup("INFO", os.path.join(logging_dir, "report.log"))
+    am = AppConfig().get_am_client()
+    # Maintain state of all values across the AIP store.
+    duplicate_report = {}
+    manifest_data = {}
+    # Checksum algorithms to test for.
+    checksum_algorithms = ("md5", "sha1", "sha256")
+    # Get all AIPs that the storage service knows about.
+    aips = am.aips()
+    for aip in aips:
+        package_name = os.path.basename(aip.get("current_path")).replace(".7z", "")
+        package_uuid = aip.get("uuid")
+        for algorithm in checksum_algorithms:
+            # Store our manifest somewhere.
+            relative_path = "{}/manifest-{}.txt".format(package_name, algorithm)
+            save_path = "{}-manifest-{}.txt".format(package_name, algorithm)
+            save_as_loc = os.path.join(temp_dir, save_path)
+            try:
+                retrieve_file(am, package_uuid, save_as_loc, relative_path)
+            except ExtractError:
+                logger.info("No result for algorithm: %s", algorithm)
+                continue
+            # Our dictionary keys are checksums and all filename entries with
+            # the same checksum are appended to create an array. If the array
+            # at the end is greater than one, we have duplicate files.
+            with open(save_as_loc, "r") as manifest_extract:
+                for line in manifest_extract:
+                    if line.strip() != "":
+                        # Attach our transfer name so we know where our
+                        # duplicates are. Turn into an array to create or
+                        # append to our dict entry.
+                        checksum, filepath = line.split(" ", 1)
+                        if not filter_aip_files(filepath, package_uuid):
+                            entry = {}
+                            filepath = filepath.strip()
+                            entry["package_uuid"] = am.package_uuid.strip()
+                            entry["package_name"] = package_name.strip()
+                            entry["filepath"] = filepath
+                            entry["basename"] = os.path.basename(filepath)
+                            entry["dirname"] = os.path.dirname(filepath)
+                            manifest_data.setdefault(checksum.strip(), [])
+                            manifest_data[checksum].append(entry)
+    duplicate_report["manifest_data"] = manifest_data
+    duplicate_report = filter_duplicates(duplicate_report)
+    retrieve_mets(am, duplicate_report, temp_dir)
+    # Save to JSON and CSV.
+    output_report(duplicate_report)
+    # Clean up our temporary folder.
+    shutil.rmtree(temp_dir)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
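The inner loop of `main()` builds a dictionary keyed by checksum; any key that
collects more than one entry is a duplicate. A self-contained sketch of that
grouping, using made-up manifest lines:

```python
import os

# Two made-up bag manifest lines (checksum, then path) from different AIPs.
manifest_lines = [
    ("aip-1", "abc123  data/objects/sci-fi.jpg"),
    ("aip-2", "abc123  data/objects/sci-fi.jpg"),
]

manifest_data = {}
for package_name, line in manifest_lines:
    checksum, filepath = line.split(" ", 1)
    filepath = filepath.strip()
    manifest_data.setdefault(checksum, []).append(
        {
            "package_name": package_name,
            "filepath": filepath,
            "basename": os.path.basename(filepath),
        }
    )

# Any checksum with more than one entry is a duplicate.
duplicates = {k: v for k, v in manifest_data.items() if len(v) > 1}
print(duplicates)
```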
diff --git a/reports/duplicates/loggingconfig.py b/reports/duplicates/loggingconfig.py
new file mode 100644
index 00000000..c29d3247
--- /dev/null
+++ b/reports/duplicates/loggingconfig.py
@@ -0,0 +1,42 @@
+# -*- coding: utf-8 -*-
+
+import logging
+import logging.config  # Has to be imported separately.
+
+
+def setup(log_level, log_file_name):
+    """Configure the logging system."""
+    # Log format string split for flake8 line-length compliance.
+    log_fmt = "%(levelname)-8s %(asctime)s " "%(filename)s:%(lineno)-4s %(message)s"
+
+    dict_config = {
+        "version": 1,
+        "disable_existing_loggers": False,
+        "formatters": {"default": {"format": log_fmt, "datefmt": "%Y-%m-%d %H:%M:%S"}},
+        "handlers": {
+            "console": {"class": "logging.StreamHandler", "formatter": "default"},
+            "file": {
+                "class": "logging.handlers.RotatingFileHandler",
+                "formatter": "default",
+                "filename": log_file_name,
+                "backupCount": 2,
+                "maxBytes": 10 * 1024,
+                "delay": True,  # Only write to the file on the first byte emitted.
+            },
+        },
+        "loggers": {
+            "duplicates": {"level": log_level, "handlers": ["console", "file"]}
+        },
+    }
+
+    logging.config.dictConfig(dict_config)
+
+
+def set_log_level(log_level, quiet, verbose):
+    """Derive a log level from quiet/verbose counts when none is given."""
+    log_levels = {2: "ERROR", 1: "WARNING", 0: "INFO", -1: "DEBUG"}
+    if log_level is None:
+        level = quiet - verbose
+        level = max(level, -1)  # No smaller than -1.
+        level = min(level, 2)  # No larger than 2.
+        return log_levels[level]
+    return log_level
diff --git a/reports/duplicates/parsemets.py b/reports/duplicates/parsemets.py
new file mode 100644
index 00000000..628f1cd4
--- /dev/null
+++ b/reports/duplicates/parsemets.py
@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function, unicode_literals
+
+import logging
+import sys
+
+import metsrw
+from lxml import etree
+
+
+def _load_mets(mets_file):
+    """Read the METS file at the path provided."""
+    try:
+        mets = metsrw.METSDocument.fromfile(mets_file)  # Reads a file.
+        return mets
+    except etree.XMLSyntaxError as e:
+        logging.error("METS %s", e)
+        sys.exit(1)
+    except IOError as e:
+        logging.error("File does not exist %s", e)
+        sys.exit(1)
+
+
+def read_premis_data(mets_file):
+    """Read PREMIS information from the METS and return the file path and
+    date-modified value for every original object.
+    """
+    mets = _load_mets(mets_file)
+    info = []
+    for mets_entry in mets.all_files():
+        filepath = mets_entry.path
+        if mets_entry.type != "Item":
+            continue
+        objs = mets_entry.get_premis_objects()
+        for obj in objs:
+            entry = {}
+            entry["date_modified"] = obj.date_created_by_application
+            entry["filepath"] = filepath
+            info.append(entry)
+    return info
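read_premis_data returns a plain list of dictionaries, which is the structure
augment_data in duplicates.py consumes. A brief usage sketch, assuming it is
run from the duplicates directory against a METS file that has already been
downloaded (the path below is a placeholder):

```python
from parsemets import read_premis_data

# Placeholder path to a previously downloaded package METS file.
for entry in read_premis_data("METS.588790bd-b9dd-4460-9705-d14f8700dba3.xml"):
    print(entry["filepath"], entry["date_modified"])
```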
diff --git a/reports/duplicates/requirements.txt b/reports/duplicates/requirements.txt
new file mode 100644
index 00000000..ea77c2d8
--- /dev/null
+++ b/reports/duplicates/requirements.txt
@@ -0,0 +1 @@
+-r requirements/production.txt
diff --git a/reports/duplicates/requirements/base.txt b/reports/duplicates/requirements/base.txt
new file mode 100644
index 00000000..73d0e493
--- /dev/null
+++ b/reports/duplicates/requirements/base.txt
@@ -0,0 +1,4 @@
+amclient
+pandas==0.24.2
+metsrw==0.2.0
+
diff --git a/reports/duplicates/requirements/local.txt b/reports/duplicates/requirements/local.txt
new file mode 100644
index 00000000..d9c08a14
--- /dev/null
+++ b/reports/duplicates/requirements/local.txt
@@ -0,0 +1,5 @@
+-r base.txt
+pytest
+vcrpy>=1.0.0
+flake8==3.4.1
+flake8-import-order==0.13
diff --git a/reports/duplicates/requirements/production.txt b/reports/duplicates/requirements/production.txt
new file mode 100644
index 00000000..42b2763f
--- /dev/null
+++ b/reports/duplicates/requirements/production.txt
@@ -0,0 +1,2 @@
+# There should be no dependency in production that isn't in development.
+-r base.txt
diff --git a/reports/duplicates/serialize_to_csv.py b/reports/duplicates/serialize_to_csv.py
new file mode 100644
index 00000000..fd174630
--- /dev/null
+++ b/reports/duplicates/serialize_to_csv.py
@@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+
+from pandas import DataFrame
+
+
+class CSVOut:
+    """Conveniently wrap CSV output capability."""
+
+    @staticmethod
+    def csv_out(duplicate_report, filename):
+        """Output a CSV using Pandas and a bit of magic."""
+        dupes = duplicate_report.get("manifest_data", {})
+        cols = 0
+        arr = [
+            "a_dir_name",
+            "b_base_name",
+            "c_file_path",
+            "e_date_modified",
+            "f_package_name",
+            "g_package_uuid",
+        ]
+        rows = []
+        headers = None
+        for key, value in dupes.items():
+            cols = max(cols, len(value))
+        # Create headers for our spreadsheet.
+        headers = arr * cols
+        for i in range(len(headers)):
+            headers[i] = "{}_{}".format(headers[i], str(i).zfill(2))
+        # Make sure that checksum is the first and only non-duplicated value.
+        headers = ["Checksum"] + headers
+        for key, value in dupes.items():
+            records = []
+            for prop in value:
+                record = []
+                record.append(prop.get("dirname", "NaN"))
+                record.append(prop.get("basename", "NaN"))
+                record.append(prop.get("filepath", "NaN"))
+                record.append(prop.get("date_modified", "NaN"))
+                record.append(prop.get("package_name", "NaN"))
+                record.append(prop.get("package_uuid", "NaN"))
+                records = records + record
+            # Fill blank spaces in row. Might also be possible as a Pandas series.
+            space = cols * len(arr) - len(records)
+            if space:
+                filler = ["NaN"] * space
+                records = records + filler
+            # Create a checksum entry for our spreadsheet.
+            records = [key] + records
+            # Create a dict from two lists.
+            dictionary = dict(zip(headers, records))
+            rows.append(dictionary)
+        df = DataFrame(columns=headers)
+        for entry in rows:
+            df = df.append(entry, ignore_index=True)
+        # Sort the columns in alphabetical order to pair similar headers.
+        cols = sorted(df.columns.tolist())
+        cols_no_suffix = [x.rsplit("_", 1)[0] for x in cols]
+        cols_no_prefix = [x.split("_", 1)[1] for x in cols_no_suffix if "_" in x]
+        cols_no_prefix = ["Checksum"] + cols_no_prefix
+        df = df[cols]
+        df.to_csv(filename, index=None, header=cols_no_prefix, encoding="utf8")
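Because like columns are sorted next to each other before writing, a report
whose widest duplicate set has two copies is written with a header row of
`Checksum` followed by two of each property column (dir_name, base_name,
file_path, date_modified, package_name, package_uuid). A sketch of reading the
result back, assuming the default output name:

```python
import pandas as pd

# Read the generated spreadsheet back; pandas suffixes the repeated column
# labels on read (file_path, file_path.1, and so on).
df = pd.read_csv("aipstore-duplicates.csv")
print(df.columns.tolist())
# One row per duplicated checksum, padded with "NaN" where a checksum has
# fewer copies than the widest duplicate set.
print(df.head())
```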