From f3a16e561b12935130b81bf3e92efa33b6c90749 Mon Sep 17 00:00:00 2001 From: Hector Akamine Date: Wed, 18 Nov 2015 08:51:58 -0800 Subject: [PATCH] transfers: add config file for log/db/pid file paths - Get log/db/pid file information from a config file specified in the command line - Add six as dependency (for configparser) - Provide example config files in etc/ directory - Script needs to be called as a module due to relative import - Update README --- README.md | 85 ++++++++++++++++++++---------- etc/transfer-script.sh | 5 ++ etc/transfers-2.conf | 7 +++ etc/transfers.conf | 7 +++ requirements/base.txt | 1 + transfers/models.py | 14 ++--- transfers/transfer.py | 117 ++++++++++++++++++++++++++--------------- 7 files changed, 158 insertions(+), 78 deletions(-) create mode 100644 etc/transfer-script.sh create mode 100644 etc/transfers-2.conf create mode 100644 etc/transfers.conf diff --git a/README.md b/README.md index f2fa7e16..cdc5b056 100644 --- a/README.md +++ b/README.md @@ -3,37 +3,54 @@ Automation Tools The Automation Tools project is a set of python scripts, that are designed to automate the processing of transfers in an Archivematica pipeline. -Currently, the only automation tool is automate transfers. It is used to prepare transfers, move them into the pipelines processing location, and take actions when user input is required. Only one transfer is sent to the pipeline at a time, the scripts wait until the current transfer is resolved (failed, rejected or stored as an AIP) before automatically starting the next available transfer. +Installation +------------ -The code is available on [Github](http://github.com/artefactual/automation-tools). 
+* Checkout or link the code in this repo to `/usr/lib/archivematica/automation-tools` +* Create virtualenv `/usr/share/python/automation-tools` and pip install requirements there +* Create directories `/var/log/archivematica/automation-tools` and `/var/archivematica/automation-tools` owned by user `archivematica`, for log/database/PID files. +* Create directory `/etc/archivematica/automation-tools` and add configuration files there. Files in the `etc/` directory of this repository can be used as an example (also see below for more about configuration) -The code is deployed to `/usr/lib/archivematica/automation-tools`. +Automated transfers +------------------- -Deployment ----------- +`transfers/transfer.py` is used to prepare transfers, move them into the pipelines processing location, and take actions when user input is required. +Only one transfer is sent to the pipeline at a time, the scripts wait until the current transfer is resolved (failed, rejected or stored as an AIP) before automatically starting the next available transfer. -Suggested deployment is to use cron to run a shell script that runs the automate transfer tool. Example shell script: +### Configuration - Overview - #!/bin/bash - cd /usr/lib/archivematica/automation-tools/transfers/ - /usr/share/python/automation-tools/bin/python transfer.py --user --api-key --transfer-source --depth 2 +Suggested deployment is to use cron to run a shell script that runs the automate transfer tool. Example shell script (for example in `/etc/archivematica/automation-tools/transfer-script.sh`): -This script is run through a crontab entry. 
Example: +``` +#!/bin/bash +cd /usr/lib/archivematica/automation-tools/ +/usr/share/python/automation-tools/bin/python -m transfers.transfer --user --api-key --transfer-source --config-file +``` - */5 * * * * /etc/archivematica/automation-tools/transfer-script.sh +(Note that the script calls the transfers script as a module using python's `-m` flag, this is required due to the use of relative imports in the code) -The cron entry executes the `transfer-script.sh` script. This should be run as the same user as Archivematica is run as (likely the `archivematica` user.) +The script can be run from a shell window like: -When running, automate transfers stores its working state in transfers.db, a sqlite database. It contains a record of all the transfers that have been processed. In a testing environment, deleting this file will cause the tools to re-process any and all folders found in the Transfer Source Location. +``` +user@host:/etc/archivematica/automation-tools$ sudo -u archivematica ./transfer-script.sh +``` -Configuration -------------- +It is suggested to run the script through a crontab entry for user archivematica (to avoid the need to repeatedly invoke it manually): -This script can be modified to adjust how automated transfers work. The full set of parameters that can be changed are: +``` +*/5 * * * * /etc/archivematica/automation-tools/transfer-script.sh +``` + +When running, automated transfers stores its working state in a sqlite database. It contains a record of all the transfers that have been processed. In a testing environment, deleting this file will cause the tools to re-process any and all folders found in the Transfer Source Location. + +### Configuration - Parameters + +The `transfers.py` script can be modified to adjust how automated transfers work. The full set of parameters that can be changed are: * `-u USERNAME, --user USERNAME` [REQUIRED]: Username of the dashboard user to authenticate as. 
* `-k KEY, --api-key KEY` [REQUIRED]: API key of the dashboard user. * `-t UUID, --transfer-source UUID`: [REQUIRED] Transfer Source Location UUID to fetch transfers from. Check the next section for more details on this field. +* `-c FILE, --config-file FILE`: config file containing file paths for log/database/PID files. Default: log/database/PID files stored in the same directory as the script (not recommended for production) * `--transfer-path PATH`: Relative path within the Transfer Source. Default: "" * `--depth DEPTH, -d DEPTH`: Depth to create the transfers from relative to the transfer source location and path. Default of 1 creates transfers from the children of transfer-path. * `--am-url URL, -a URL`:Archivematica URL. Default: http://127.0.0.1 @@ -48,16 +65,13 @@ The easiest way to configure the tasks that automation-tools will run is by usin 1. Go to Administration|Processing Configuration and choose the options you wish to use. -2. Save the configuation on the form. +2. Save the configuration on the form. 3. Copy the processing configuration file from '/var/archivematica/sharedDirectory/sharedMicroServiceTasksConfigs/processingMCPConfigs/defaultProcessingMCP.xml' on the Archivematica host machine to the transfers/ directory of your automation-tools installation location. - The automation-tools command-line also relies on installation-specific UUIDs. To obtain the transfer source UUID for script invocation, visit the 'Transfer Source' tab in the Archivematica Storage Space web dashboard. If a row is marked as a transfer souce its UUID value will be valid as a transfer source argument. - -Hooks ------ +### Hooks During processing, automate transfers will run scripts from several places to customize behaviour. These scripts can be in any language. If they are written in Python, we recommend making them source compatible with python 2 or 3. 
@@ -73,7 +87,7 @@ There are also several scripts provided for common use cases and examples of pro These are found in the `examples` directory sorted by their usecase and can be copied or symlinked to the appropriate directory for automation-tools to run them. If you write a script that might be useful for others, please make a pull request! -### get-accession-id +#### get-accession-id * _Name:_ `get-accession-id` * _Location:_ Same directory as transfers.py @@ -83,7 +97,7 @@ If you write a script that might be useful for others, please make a pull reques `get-accession-number` is run to customize the accession number of the created transfer. Its single parameter is the path relative to the transfer source location. Note that no files are locally available when `get-accession-id` is run. It should print to standard output the quoted value of the accession number (e.g. `"ID42"`), `None`, or no output. If the return code is not 0, all output is ignored. This is POSTed to the Archivematica REST API when the transfer is created. -### pre-transfer hooks +#### pre-transfer hooks * _Parameters:_ [`absolute path`, `transfer type`] @@ -101,7 +115,7 @@ There are some sample scripts in the pre-transfers directory that may be useful, * `archivesspace_ids.py`: Creates an archivesspaceids.csv by parsing ArchivesSpace reference IDs from filenames. This will automate the matching GUI if a DIP is uploaded to ArchivesSpace. * `default_config.py`: Copies the included `defaultProcessingMCP.xml` into the transfer directory. This file overrides any configuration set in the Archivematica dashboard, so that user choices are guaranteed and avoided as desired. -### user-input +#### user-input * _Parameters:_ [`microservice name`, `first time at wait point`, `absolute path` , `unit UUID`, `unit name`, `unit type`] @@ -118,12 +132,11 @@ All scripts are passed the same set of parameters. 
There are some sample scripts in the pre-transfers directory that may be useful, or models for your own scripts. -* `send_email.py`: Emails the first time a transfer is waitintg for input at Approve Normalization. It can be edited to change the email addresses it sends notices to, or to change the notification message. +* `send_email.py`: Emails the first time a transfer is waiting for input at Approve Normalization. It can be edited to change the email addresses it sends notices to, or to change the notification message. -Logs ---- +### Logs -Logs are written to the same directory as the `transfers.py` script. The logging level can be adjusted, by modifying the transfers/transfer.py file. Find the following section and changed `'INFO'` to one of `'INFO'`, `'DEBUG'`, `'WARNING'`, `'ERROR'` or `'CRITICAL'`. +Logs are written to a directory specified in the config file (or `/var/log/archivematica/automation-tools/` by default). The logging level can be adjusted, by modifying the transfers/transfer.py file. Find the following section and change `'INFO'` to one of `'INFO'`, `'DEBUG'`, `'WARNING'`, `'ERROR'` or `'CRITICAL'`. 'loggers': { 'transfer': { @@ -132,6 +145,22 @@ Logs are written to the same directory as the `transfers.py` script. The logging }, }, +### Multiple automated transfer instances + +You may need to set up multiple automated transfer instances, for example if required to ingest both standard transfers and bags. In cases where hooks are the same for both instances, it could be achieved by setting up different scripts, each one invoking the transfers.py script with the required parameters. 
Example: + +``` +# first script invokes like this (standard transfer): +/usr/share/python/automation-tools/bin/python -m transfers.transfer --user --api-key --transfer-source --config-file + +# second script invokes like this (unzipped bags): +/usr/share/python/automation-tools/bin/python -m transfers.transfer --user --api-key --transfer-source --config-file --transfer-type 'unzipped bag' +``` + +`` and `` should specify different file names for db/PID/log files. See transfers.conf and transfers-2.conf in etc/ for an example + +In case different hooks are required for each instance, a possible approach is to checkout a new instance of the automation tools, for example in `/usr/lib/archivematica/automation-tools-2` + Related Projects ---------------- diff --git a/etc/transfer-script.sh b/etc/transfer-script.sh new file mode 100644 index 00000000..3a798da4 --- /dev/null +++ b/etc/transfer-script.sh @@ -0,0 +1,5 @@ +#!/bin/bash +# transfer script example +# /etc/archivematica/automation-tools/transfer-script.sh +cd /usr/lib/archivematica/automation-tools/ +/usr/share/python/automation-tools/bin/python -m transfers.transfer --user --api-key --transfer-source --config-file diff --git a/etc/transfers-2.conf b/etc/transfers-2.conf new file mode 100644 index 00000000..f6460605 --- /dev/null +++ b/etc/transfers-2.conf @@ -0,0 +1,7 @@ +# automation-tools:transfers configuration file example +# /etc/archivematica/automation-tools/transfers-2.conf + +[transfers] +logfile = /var/log/archivematica/automation-tools/transfers-2.log +databasefile = /var/archivematica/automation-tools/transfers-2.db +pidfile = /var/archivematica/automation-tools/transfers-2-pid.lck diff --git a/etc/transfers.conf b/etc/transfers.conf new file mode 100644 index 00000000..96e74e91 --- /dev/null +++ b/etc/transfers.conf @@ -0,0 +1,7 @@ +# automation-tools:transfers configuration file example +# /etc/archivematica/automation-tools/transfers.conf + +[transfers] +logfile = 
/var/log/archivematica/automation-tools/transfers.log +databasefile = /var/archivematica/automation-tools/transfers.db +pidfile = /var/archivematica/automation-tools/transfers-pid.lck diff --git a/requirements/base.txt b/requirements/base.txt index b9ae2f1e..d8c34b1f 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -1,2 +1,3 @@ requests<3.0 sqlalchemy +six diff --git a/transfers/models.py b/transfers/models.py index d1608da3..55b9b9ae 100644 --- a/transfers/models.py +++ b/transfers/models.py @@ -1,17 +1,12 @@ -import os - from sqlalchemy import create_engine from sqlalchemy import Sequence from sqlalchemy import Column, Binary, Boolean, Integer, String from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import sessionmaker -db_path = os.path.join(os.path.dirname(__file__), 'transfers.db') -engine = create_engine('sqlite:///{}'.format(db_path), echo=False) - -Session = sessionmaker(bind=engine) Base = declarative_base() + class Unit(Base): __tablename__ = 'unit' id = Column(Integer, Sequence('user_id_seq'), primary_key=True) @@ -25,4 +20,9 @@ class Unit(Base): def __repr__(self): return "".format(s=self) -Base.metadata.create_all(engine) + +def init(databasefile): + engine = create_engine('sqlite:///{}'.format(databasefile), echo=False) + global Session + Session = sessionmaker(bind=engine) + Base.metadata.create_all(engine) diff --git a/transfers/transfer.py b/transfers/transfer.py index 9e4b5704..f4cc3e0a 100755 --- a/transfers/transfer.py +++ b/transfers/transfer.py @@ -13,10 +13,13 @@ import logging.config # Has to be imported separately import os import requests +from six.moves import configparser import subprocess import sys import time +from . 
import models + try: from os import fsencode, fsdecode except ImportError: @@ -41,41 +44,58 @@ def fsdecode(filename): THIS_DIR = os.path.abspath(os.path.dirname(__file__)) sys.path.append(THIS_DIR) -from models import Unit, Session LOGGER = logging.getLogger('transfer') -# Configure logging -CONFIG = { - 'version': 1, - 'disable_existing_loggers': False, - 'formatters': { - 'default': { - 'format': '%(levelname)-8s %(asctime)s %(filename)s:%(lineno)-4s %(message)s', - 'datefmt': '%Y-%m-%d %H:%M:%S', - }, - }, - 'handlers': { - 'console': { - 'class': 'logging.StreamHandler', - 'formatter': 'default', + +CONFIG_FILE = None + + +def get_setting(setting, default=None): + config = configparser.SafeConfigParser() + try: + config.read(CONFIG_FILE) + return config.get('transfers', setting) + except Exception: + return default + + +def setup(config_file): + global CONFIG_FILE + CONFIG_FILE = config_file + models.init(get_setting('databasefile', os.path.join(THIS_DIR, 'transfers.db'))) + + # Configure logging + default_logfile = os.path.join(THIS_DIR, 'automate-transfer.log') + CONFIG = { + 'version': 1, + 'disable_existing_loggers': False, + 'formatters': { + 'default': { + 'format': '%(levelname)-8s %(asctime)s %(filename)s:%(lineno)-4s %(message)s', + 'datefmt': '%Y-%m-%d %H:%M:%S', + }, }, - 'file': { - 'class': 'logging.handlers.RotatingFileHandler', - 'formatter': 'default', - 'filename': os.path.join(os.path.abspath(os.path.dirname(__file__)), 'automate-transfer.log'), - 'backupCount': 2, - 'maxBytes': 10 * 1024, + 'handlers': { + 'console': { + 'class': 'logging.StreamHandler', + 'formatter': 'default', + }, + 'file': { + 'class': 'logging.handlers.RotatingFileHandler', + 'formatter': 'default', + 'filename': get_setting('logfile', default_logfile), + 'backupCount': 2, + 'maxBytes': 10 * 1024, + }, }, - }, - 'loggers': { - 'transfer': { - 'level': 'INFO', # One of INFO, DEBUG, WARNING, ERROR, CRITICAL - 'handlers': ['console', 'file'], + 'loggers': { + 'transfer': 
{ + 'level': 'INFO', # One of INFO, DEBUG, WARNING, ERROR, CRITICAL + 'handlers': ['console', 'file'], + }, }, - }, -} -logging.config.dictConfig(CONFIG) - + } + logging.config.dictConfig(CONFIG) def _call_url_json(url, params): @@ -126,7 +146,7 @@ def get_status(am_url, user, api_key, unit_uuid, unit_type, session, hide_on_com if unit_info and unit_type == 'transfer' and unit_info['status'] == 'COMPLETE' and unit_info['sip_uuid'] != 'BACKLOG': LOGGER.info('%s is a complete transfer, fetching SIP %s status.', unit_uuid, unit_info['sip_uuid']) # Update DB to refer to this one - db_unit = session.query(Unit).filter_by(unit_type=unit_type, uuid=unit_uuid).one() + db_unit = session.query(models.Unit).filter_by(unit_type=unit_type, uuid=unit_uuid).one() db_unit.unit_type = 'ingest' db_unit.uuid = unit_info['sip_uuid'] # Get SIP status @@ -158,8 +178,8 @@ def get_accession_id(dirname): script_path = os.path.join(THIS_DIR, 'get-accession-number') try: p = subprocess.Popen([script_path, dirname], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - except FileNotFoundError: - LOGGER.info('%s does not exist.', script_path) + except Exception: + LOGGER.info('Error when trying to run %s', script_path) return None output, err = p.communicate() if p.returncode != 0: @@ -270,7 +290,7 @@ def start_transfer(ss_url, ts_location_uuid, ts_path, depth, am_url, user_name, :returns: Tuple of Transfer information about the new transfer or None on error. """ # Start new transfer - completed = {x[0] for x in session.query(Unit.path).all()} + completed = {x[0] for x in session.query(models.Unit.path).all()} target = get_next_transfer(ss_url, ts_location_uuid, ts_path, depth, completed, see_files) if not target: LOGGER.warning("All potential transfers in %s have been created. 
Exiting", ts_path) @@ -301,13 +321,14 @@ def start_transfer(ss_url, ts_location_uuid, ts_path, depth, am_url, user_name, if not response.ok or resp_json.get('error'): LOGGER.error('Unable to start transfer.') LOGGER.error('Response: %s', resp_json) - new_transfer = Unit(path=target, unit_type='transfer', status='FAILED', current=False) + new_transfer = models.Unit(path=target, unit_type='transfer', status='FAILED', current=False) session.add(new_transfer) return None # Run all scripts in pre-transfer directory # TODO what inputs do we want? - run_scripts('pre-transfer', + run_scripts( + 'pre-transfer', resp_json['path'], # Absolute path 'standard', # Transfer type ) @@ -320,14 +341,14 @@ def start_transfer(ss_url, ts_location_uuid, ts_path, depth, am_url, user_name, # Mark as started if result: LOGGER.info('Approved %s', result) - new_transfer = Unit(uuid=result, path=target, unit_type='transfer', current=True) + new_transfer = models.Unit(uuid=result, path=target, unit_type='transfer', current=True) LOGGER.info('New transfer: %s', new_transfer) session.add(new_transfer) break LOGGER.info('Failed approve, try %s of %s', i + 1, retry_count) else: LOGGER.warning('Not approved') - new_transfer = Unit(uuid=None, path=target, unit_type='transfer', current=False) + new_transfer = models.Unit(uuid=None, path=target, unit_type='transfer', current=False) session.add(new_transfer) return None @@ -368,12 +389,18 @@ def approve_transfer(directory_name, url, api_key, user_name): else: return None -def main(user, api_key, ts_uuid, ts_path, depth, am_url, ss_url, transfer_type, see_files, hide_on_complete=False): + +def main(user, api_key, ts_uuid, ts_path, depth, am_url, ss_url, transfer_type, see_files, hide_on_complete=False, config_file=None): + + setup(config_file) + LOGGER.info("Waking up") - session = Session() + + session = models.Session() # Check for evidence that this is already running - pid_file = os.path.join(THIS_DIR, 'pid.lck') + default_pidfile = 
os.path.join(THIS_DIR, 'pid.lck') + pid_file = get_setting('pidfile', default_pidfile) try: # Open PID file only if it doesn't exist for read/write f = os.fdopen(os.open(pid_file, os.O_CREAT | os.O_EXCL | os.O_RDWR), 'r+') @@ -388,7 +415,7 @@ def main(user, api_key, ts_uuid, ts_path, depth, am_url, ss_url, transfer_type, # Check status of last unit current_unit = None try: - current_unit = session.query(Unit).filter_by(current=True).one() + current_unit = session.query(models.Unit).filter_by(current=True).one() unit_uuid = current_unit.uuid unit_type = current_unit.unit_type except Exception: @@ -418,7 +445,8 @@ def main(user, api_key, ts_uuid, ts_path, depth, am_url, ss_url, transfer_type, LOGGER.info('Waiting on user input, running scripts in user-input directory.') # TODO What inputs do we want? microservice = status_info.get('microservice', '') - run_scripts('user-input', + run_scripts( + 'user-input', microservice, # Current microservice name str(microservice != current_unit.microservice), # String True or False if this is the first time at this wait point status_info['path'], # Absolute path @@ -441,6 +469,7 @@ def main(user, api_key, ts_uuid, ts_path, depth, am_url, ss_url, transfer_type, if __name__ == '__main__': + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument('-u', '--user', metavar='USERNAME', required=True, help='Username of the dashboard user to authenticate as.') parser.add_argument('-k', '--api-key', metavar='KEY', required=True, help='API key of the dashboard user.') @@ -452,6 +481,7 @@ def main(user, api_key, ts_uuid, ts_path, depth, am_url, ss_url, transfer_type, parser.add_argument('--transfer-type', metavar='TYPE', help="Type of transfer to start. 
One of: 'standard' (default), 'unzipped bag', 'zipped bag', 'dspace'.", default='standard', choices=['standard', 'unzipped bag', 'zipped bag', 'dspace']) parser.add_argument('--files', action='store_true', help='If set, start transfers from files as well as folders.') parser.add_argument('--hide', action='store_true', help='If set, hide the Transfers and SIPs in the dashboard once they complete.') +parser.add_argument('-c', '--config-file', metavar='FILE', help='Configuration file (log/db/PID files)', default=None) args = parser.parse_args() sys.exit(main( @@ -465,4 +495,5 @@ def main(user, api_key, ts_uuid, ts_path, depth, am_url, ss_url, transfer_type, transfer_type=args.transfer_type, see_files=args.files, hide_on_complete=args.hide, + config_file=args.config_file, ))