From cb00bad3b9eb3a2b07c6189143f847d33fa318d0 Mon Sep 17 00:00:00 2001
From: Misty De Meo
Date: Wed, 3 Dec 2014 18:35:57 -0800
Subject: [PATCH] Initial commit

---
Reviewer notes (free-form text between the "---" separator and the first
"diff --git" line is not part of the commit and is skipped when applying):

* Line breaks and Python indentation were restored; the patch had been
  flattened onto a few very long lines. Indentation of continuation lines
  and of the whitespace-only trailing lines is a best-effort reconstruction.
* characterization/fits.sh, characterization/fiwalk.sh, transcription/ocr.sh:
  quote the substituted path, as the other commands already do, so a file
  name containing spaces stays a single argument.
* normalization/preservation-ffv1.sh: quote the output path inside the
  eval'd command (the input path already was) and quote $command itself.
* extraction/rar.py: return unrar's exit status instead of discarding it, so
  a failed extraction no longer exits 0.
* id/fido.py: "except Exception:" instead of a bare except; raw-string regex.
* Every fix is a same-line edit, so hunk sizes and the diffstat below still
  match. The blob ids on the "index" lines still name the pre-fix content.

 characterization/exiftool.sh                |  1 +
 characterization/ffprobe.sh                 |  1 +
 characterization/fits.sh                    |  1 +
 characterization/fiwalk.sh                  |  1 +
 characterization/mediainfo.sh               |  1 +
 event_detail/7z-extraction.sh               |  1 +
 event_detail/convert-normalization.sh       |  1 +
 event_detail/ffmpeg-normalization.sh        |  1 +
 event_detail/ghostscript-normalization.sh   |  1 +
 event_detail/inkscape-normalization.sh      |  1 +
 event_detail/maildir-normalization.sh       |  1 +
 event_detail/ps2pdf-normalization.sh        |  1 +
 event_detail/readpst-normalization.sh       |  1 +
 event_detail/unrar-extraction.sh            |  1 +
 extraction/7z.sh                            |  1 +
 extraction/rar.py                           | 16 ++++
 extraction/tsk_recover.py                   | 30 +++++++
 id/fido.py                                  | 61 ++++++++++++++
 id/file-by-extension.py                     | 16 ++++
 normalization/access-h264.sh                |  1 +
 normalization/access-jpeg.sh                |  3 +
 normalization/access-mp3.sh                 |  1 +
 normalization/default-access.sh             |  1 +
 normalization/default-thumbnail.sh          |  1 +
 normalization/jpeg-thumbnail.sh             |  3 +
 normalization/maildir-to-mbox.sh            |  1 +
 normalization/preservation-ffv1.sh          | 16 ++++
 .../preservation-pdfa-ghostscript.sh        |  1 +
 normalization/preservation-pdfa-inkscape.sh |  1 +
 normalization/preservation-pdfa-ps2pdf.sh   |  1 +
 normalization/preservation-svg.sh           |  3 +
 normalization/preservation-tiff.sh          |  1 +
 normalization/preservation-wav.sh           |  1 +
 transcription/ocr.sh                        |  4 +
 validation/jhove.py                         | 84 +++++++++++++++++++
 verification/file-exists.sh                 |  1 +
 verification/file-has-nonzero-size.sh       |  1 +
 37 files changed, 263 insertions(+)
 create mode 100755 characterization/exiftool.sh
 create mode 100755 characterization/ffprobe.sh
 create mode 100755 characterization/fits.sh
 create mode 100755 characterization/fiwalk.sh
 create mode 100755 characterization/mediainfo.sh
 create mode 100644 event_detail/7z-extraction.sh
 create mode 100644 event_detail/convert-normalization.sh
 create mode 100644 event_detail/ffmpeg-normalization.sh
 create mode 100644 event_detail/ghostscript-normalization.sh
 create mode 100644 event_detail/inkscape-normalization.sh
 create mode 100644 event_detail/maildir-normalization.sh
 create mode 100644 event_detail/ps2pdf-normalization.sh
 create mode 100644 event_detail/readpst-normalization.sh
 create mode 100644 event_detail/unrar-extraction.sh
 create mode 100644 extraction/7z.sh
 create mode 100644 extraction/rar.py
 create mode 100644 extraction/tsk_recover.py
 create mode 100644 id/fido.py
 create mode 100644 id/file-by-extension.py
 create mode 100644 normalization/access-h264.sh
 create mode 100644 normalization/access-jpeg.sh
 create mode 100644 normalization/access-mp3.sh
 create mode 100644 normalization/default-access.sh
 create mode 100644 normalization/default-thumbnail.sh
 create mode 100644 normalization/jpeg-thumbnail.sh
 create mode 100644 normalization/maildir-to-mbox.sh
 create mode 100644 normalization/preservation-ffv1.sh
 create mode 100644 normalization/preservation-pdfa-ghostscript.sh
 create mode 100644 normalization/preservation-pdfa-inkscape.sh
 create mode 100644 normalization/preservation-pdfa-ps2pdf.sh
 create mode 100644 normalization/preservation-svg.sh
 create mode 100644 normalization/preservation-tiff.sh
 create mode 100644 normalization/preservation-wav.sh
 create mode 100644 transcription/ocr.sh
 create mode 100644 validation/jhove.py
 create mode 100644 verification/file-exists.sh
 create mode 100644 verification/file-has-nonzero-size.sh

diff --git a/characterization/exiftool.sh b/characterization/exiftool.sh
new file mode 100755
index 0000000..830b82c
--- /dev/null
+++ b/characterization/exiftool.sh
@@ -0,0 +1 @@
+exiftool -X "%fileFullName%"
diff --git a/characterization/ffprobe.sh b/characterization/ffprobe.sh
new file mode 100755
index 0000000..fb34f8f
--- /dev/null
+++ b/characterization/ffprobe.sh
@@ -0,0 +1 @@
+ffprobe -i "%fileFullName%" -show_data -show_format -show_error -show_streams -show_chapters -show_private_data -show_versions -print_format xml
diff --git a/characterization/fits.sh b/characterization/fits.sh
new file mode 100755
index 0000000..b1ba43e
--- /dev/null
+++ b/characterization/fits.sh
@@ -0,0 +1 @@
+ng edu.harvard.hul.ois.fits.Fits -i "%relativeLocation%"
diff --git a/characterization/fiwalk.sh b/characterization/fiwalk.sh
new file mode 100755
index 0000000..d0a2ab5
--- /dev/null
+++ b/characterization/fiwalk.sh
@@ -0,0 +1 @@
+fiwalk -x "%relativeLocation%" -c /usr/lib/archivematica/archivematicaCommon/externals/fiwalk_plugins/ficonfig.txt
diff --git a/characterization/mediainfo.sh b/characterization/mediainfo.sh
new file mode 100755
index 0000000..bbed01e
--- /dev/null
+++ b/characterization/mediainfo.sh
@@ -0,0 +1 @@
+mediainfo --Language=Raw -f --Output=XML "%fileFullName%"
diff --git a/event_detail/7z-extraction.sh b/event_detail/7z-extraction.sh
new file mode 100644
index 0000000..e444993
--- /dev/null
+++ b/event_detail/7z-extraction.sh
@@ -0,0 +1 @@
+echo program=\"7z\"\; version=\"`7z | grep Version`\"
diff --git a/event_detail/convert-normalization.sh b/event_detail/convert-normalization.sh
new file mode 100644
index 0000000..5e40980
--- /dev/null
+++ b/event_detail/convert-normalization.sh
@@ -0,0 +1 @@
+echo program=\"convert\"\; version=\"`convert -version | grep Version:`\"
diff --git a/event_detail/ffmpeg-normalization.sh b/event_detail/ffmpeg-normalization.sh
new file mode 100644
index 0000000..ec85e92
--- /dev/null
+++ b/event_detail/ffmpeg-normalization.sh
@@ -0,0 +1 @@
+echo program=\"ffmpeg\"\; version=\"`ffmpeg 2>&1 | grep --ignore-case "FFmpeg version"`\"
diff --git a/event_detail/ghostscript-normalization.sh b/event_detail/ghostscript-normalization.sh
new file mode 100644
index 0000000..3cc2fff
--- /dev/null
+++ b/event_detail/ghostscript-normalization.sh
@@ -0,0 +1 @@
+echo program=\"Ghostscript\"\; version=\"`gs --version`\"
diff --git a/event_detail/inkscape-normalization.sh b/event_detail/inkscape-normalization.sh
new file mode 100644
index 0000000..a4d1448
--- /dev/null
+++ b/event_detail/inkscape-normalization.sh
@@ -0,0 +1 @@
+echo program=\"inkscape\"\; version=\"`inkscape -V`\"
diff --git a/event_detail/maildir-normalization.sh b/event_detail/maildir-normalization.sh
new file mode 100644
index 0000000..3d9f516
--- /dev/null
+++ b/event_detail/maildir-normalization.sh
@@ -0,0 +1 @@
+echo "/usr/lib/archivematica/transcoder/transcoderScripts/" "%fileFullName%" "%outputDirectory%%prefix%%fileName%%postfix%.mbox"
diff --git a/event_detail/ps2pdf-normalization.sh b/event_detail/ps2pdf-normalization.sh
new file mode 100644
index 0000000..11cf765
--- /dev/null
+++ b/event_detail/ps2pdf-normalization.sh
@@ -0,0 +1 @@
+echo program=\"ps2pdf\"\; program=\"Ghostscript\"\; version=\"`gs --version`\"
diff --git a/event_detail/readpst-normalization.sh b/event_detail/readpst-normalization.sh
new file mode 100644
index 0000000..87ab9e0
--- /dev/null
+++ b/event_detail/readpst-normalization.sh
@@ -0,0 +1 @@
+echo program=\"readpst\"\; version=\"`readpst -V`\"
diff --git a/event_detail/unrar-extraction.sh b/event_detail/unrar-extraction.sh
new file mode 100644
index 0000000..31d255b
--- /dev/null
+++ b/event_detail/unrar-extraction.sh
@@ -0,0 +1 @@
+echo program=\"unrar-nonfree\"\; version=\"`unrar-nonfree | grep 'UNRAR'`\"
diff --git a/extraction/7z.sh b/extraction/7z.sh
new file mode 100644
index 0000000..1712b00
--- /dev/null
+++ b/extraction/7z.sh
@@ -0,0 +1 @@
+7z x -bd -o"%outputDirectory%" "%inputFile%"
diff --git a/extraction/rar.py b/extraction/rar.py
new file mode 100644
index 0000000..17dc4b6
--- /dev/null
+++ b/extraction/rar.py
@@ -0,0 +1,16 @@
+import os, subprocess, sys
+
+def main(output_directory, compressed_file):
+    # Note that unrar-free only extracts into the current working directory,
+    # hence the os.chdir() here
+    try:
+        os.chdir(output_directory)
+        args = ['unrar', '-x', compressed_file]
+        return subprocess.call(args)
+    except Exception as e:
+        return e
+
+if __name__ == '__main__':
+    output_directory = sys.argv[1]
+    compressed_file = sys.argv[2]
+    exit(main(output_directory, compressed_file))
diff --git a/extraction/tsk_recover.py b/extraction/tsk_recover.py
new file mode 100644
index 0000000..ee265ac
--- /dev/null
+++ b/extraction/tsk_recover.py
@@ -0,0 +1,30 @@
+from __future__ import print_function
+import re
+import subprocess
+import sys
+
+def extract(package, outdir):
+    # -a extracts only allocated files; we're not capturing unallocated files
+    try:
+        process = subprocess.Popen(['tsk_recover', package, '-a', outdir],
+            stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
+        stdout, stderr = process.communicate()
+
+        match = re.match(r'Files Recovered: (\d+)', stdout.splitlines()[0])
+        if match:
+            if match.groups()[0] == '0':
+                raise Exception('tsk_recover failed to extract any files with the message: {}'.format(stdout))
+            else:
+                print(stdout)
+    except Exception as e:
+        return e
+
+    return 0
+
+def main(package, outdir):
+    return extract(package, outdir)
+
+if __name__ == '__main__':
+    package = sys.argv[1]
+    outdir = sys.argv[2]
+    sys.exit(main(package, outdir))
diff --git a/id/fido.py b/id/fido.py
new file mode 100644
index 0000000..455d17f
--- /dev/null
+++ b/id/fido.py
@@ -0,0 +1,61 @@
+import os.path
+import re
+import subprocess
+import sys
+
+def file_tool(path):
+    return subprocess.check_output(['file', path]).strip()
+
+class FidoFailed(Exception):
+    def __init__(self, stdout, stderr, retcode):
+        message = """
+Fido exited {retcode} and no format was found.
+stdout: {stdout}
+---
+stderr: {stderr}
+""".format(stdout=stdout, stderr=stderr, retcode=retcode)
+        super(FidoFailed, self).__init__(message)
+
+def identify(file_):
+    # The default buffer size fido uses, 256KB, is too small to be able to detect certain formats
+    # Formats like office documents and Adobe Illustrator .ai files will be identified as other, less-specific formats
+    # This larger buffer size is a bit slower and consumes more RAM, so some users may wish to customize this to reduce the buffer size
+    # See: https://projects.artefactual.com/issues/5941, https://projects.artefactual.com/issues/5731
+    cmd = ['fido', '-bufsize', '1048576',
+           '-loadformats', '/usr/lib/archivematica/archivematicaCommon/externals/fido/archivematica_format_extensions.xml',
+           os.path.abspath(file_)]
+    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
+    stdout, stderr = process.communicate()
+
+    try:
+        results = stdout.split('\n')[0].split(',')
+    except Exception:
+        raise FidoFailed(stdout, stderr, process.returncode)
+
+    if process.returncode != 0 or results[-1] == '"fail"':
+        raise FidoFailed(stdout, stderr, process.returncode)
+    else:
+        puid = results[2]
+        if re.match(r'(.+)?fmt\/\d+', puid):
+            return puid
+        else:
+            print >> sys.stderr, "File identified as non-standard Fido code: {id}".format(id=puid)
+            return ""
+
+def main(argv):
+    try:
+        print identify(argv[1])
+        return 0
+    except FidoFailed as e:
+        file_output = file_tool(argv[1])
+        # FIDO can't currently identify text files with no extension, and this
+        # is a common enough usecase to special-case it
+        if 'text' in file_output:
+            print 'x-fmt/111'
+        else:
+            return e
+    except Exception as e:
+        return e
+
+if __name__ == '__main__':
+    exit(main(sys.argv))
diff --git a/id/file-by-extension.py b/id/file-by-extension.py
new file mode 100644
index 0000000..bc57f45
--- /dev/null
+++ b/id/file-by-extension.py
@@ -0,0 +1,16 @@
+import os.path
+import subprocess
+import sys
+
+def file_tool(path):
+    return subprocess.check_output(['file', path]).strip()
+
+(_, extension) = os.path.splitext(sys.argv[1])
+if extension:
+    print extension.lower()
+else:
+    # Plaintext files frequently have no extension, but are common to identify.
+    # file is pretty smart at figuring these out.
+    file_output = file_tool(sys.argv[1])
+    if 'text' in file_output:
+        print '.txt'
diff --git a/normalization/access-h264.sh b/normalization/access-h264.sh
new file mode 100644
index 0000000..0c4bbda
--- /dev/null
+++ b/normalization/access-h264.sh
@@ -0,0 +1 @@
+ffmpeg -i "%fileFullName%" -vcodec libx264 -pix_fmt yuv420p -preset medium -crf 18 "%outputDirectory%%prefix%%fileName%%postfix%.mp4"
diff --git a/normalization/access-jpeg.sh b/normalization/access-jpeg.sh
new file mode 100644
index 0000000..674e770
--- /dev/null
+++ b/normalization/access-jpeg.sh
@@ -0,0 +1,3 @@
+convert "%fileFullName%" -sampling-factor 4:4:4 -quality 60 -layers merge
+ "%outputDirectory%%prefix%%fileName%%postfix%.jpg"
+ 
\ No newline at end of file
diff --git a/normalization/access-mp3.sh b/normalization/access-mp3.sh
new file mode 100644
index 0000000..4721fd5
--- /dev/null
+++ b/normalization/access-mp3.sh
@@ -0,0 +1 @@
+ffmpeg -i "%fileFullName%" -ac 2 -ab 192000 "%outputDirectory%%prefix%%fileName%%postfix%.mp3"
diff --git a/normalization/default-access.sh b/normalization/default-access.sh
new file mode 100644
index 0000000..7c02754
--- /dev/null
+++ b/normalization/default-access.sh
@@ -0,0 +1 @@
+cp -R "%inputFile%" "%outputDirectory%%prefix%%fileName%%postfix%%fileExtensionWithDot%"
diff --git a/normalization/default-thumbnail.sh b/normalization/default-thumbnail.sh
new file mode 100644
index 0000000..9f186bc
--- /dev/null
+++ b/normalization/default-thumbnail.sh
@@ -0,0 +1 @@
+cp -R "/var/archivematica/sharedDirectory/sharedMicroServiceTasksConfigs/transcoder/defaultIcons/default.jpg" "%outputDirectory%%postfix%.jpg"
diff --git a/normalization/jpeg-thumbnail.sh b/normalization/jpeg-thumbnail.sh
new file mode 100644
index 0000000..5dce0a0
--- /dev/null
+++ b/normalization/jpeg-thumbnail.sh
@@ -0,0 +1,3 @@
+convert "%fileFullName%" -thumbnail 100x100 -layers merge
+ "%outputDirectory%%postfix%.jpg"
+ 
\ No newline at end of file
diff --git a/normalization/maildir-to-mbox.sh b/normalization/maildir-to-mbox.sh
new file mode 100644
index 0000000..e644c30
--- /dev/null
+++ b/normalization/maildir-to-mbox.sh
@@ -0,0 +1 @@
+"/usr/lib/archivematica/MCPClient/clientScripts/archivematicaMaildirToMbox.py" "%fileFullName%" "%outputDirectory%%prefix%%fileName%%postfix%.mbox"
diff --git a/normalization/preservation-ffv1.sh b/normalization/preservation-ffv1.sh
new file mode 100644
index 0000000..e1ef41f
--- /dev/null
+++ b/normalization/preservation-ffv1.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+inputFile="%fileFullName%"
+outputFile="%outputDirectory%%prefix%%fileName%%postfix%.mkv"
+audioCodec="pcm_s16le"
+videoCodec="ffv1 -level 3"
+
+command="ffmpeg -vsync passthrough -i \"${inputFile}\" "
+command="${command} -vcodec ${videoCodec} -g 1 "
+command="${command} -acodec ${audioCodec}"
+
+
+command="${command} \"${outputFile}\""
+
+echo "$command"
+eval "$command"
diff --git a/normalization/preservation-pdfa-ghostscript.sh b/normalization/preservation-pdfa-ghostscript.sh
new file mode 100644
index 0000000..999b5ed
--- /dev/null
+++ b/normalization/preservation-pdfa-ghostscript.sh
@@ -0,0 +1 @@
+gs -dPDFA -dBATCH -dNOPAUSE -sDEVICE=pdfwrite -dPDFACompatibilityPolicy=1 -sOutputFile="%outputDirectory%%prefix%%fileName%%postfix%.pdf" "%fileFullName%"
diff --git a/normalization/preservation-pdfa-inkscape.sh b/normalization/preservation-pdfa-inkscape.sh
new file mode 100644
index 0000000..6f625cd
--- /dev/null
+++ b/normalization/preservation-pdfa-inkscape.sh
@@ -0,0 +1 @@
+inkscape -z "%fileFullName%" --export-pdf="%outputDirectory%%prefix%%fileName%%postfix%.pdf"
diff --git a/normalization/preservation-pdfa-ps2pdf.sh b/normalization/preservation-pdfa-ps2pdf.sh
new file mode 100644
index 0000000..d499388
--- /dev/null
+++ b/normalization/preservation-pdfa-ps2pdf.sh
@@ -0,0 +1 @@
+ps2pdf -dEPSCrop -dPDFA "%fileFullName%" "%outputDirectory%%prefix%%fileName%%postfix%.pdf"
diff --git a/normalization/preservation-svg.sh b/normalization/preservation-svg.sh
new file mode 100644
index 0000000..c2c2642
--- /dev/null
+++ b/normalization/preservation-svg.sh
@@ -0,0 +1,3 @@
+sudo /usr/bin/inkscape "%fileFullName%" --export-plain-svg="%outputDirectory%%prefix%%fileName%%postfix%.svg"
+ sudo chmod 777 "%outputDirectory%%prefix%%fileName%%postfix%.svg"
+ 
\ No newline at end of file
diff --git a/normalization/preservation-tiff.sh b/normalization/preservation-tiff.sh
new file mode 100644
index 0000000..e9b6a7f
--- /dev/null
+++ b/normalization/preservation-tiff.sh
@@ -0,0 +1 @@
+convert "%fileFullName%" +compress "%outputDirectory%%prefix%%fileName%%postfix%.tif"
diff --git a/normalization/preservation-wav.sh b/normalization/preservation-wav.sh
new file mode 100644
index 0000000..1713431
--- /dev/null
+++ b/normalization/preservation-wav.sh
@@ -0,0 +1 @@
+ffmpeg -i "%fileFullName%" "%outputDirectory%%prefix%%fileName%%postfix%.wav"
diff --git a/transcription/ocr.sh b/transcription/ocr.sh
new file mode 100644
index 0000000..9831706
--- /dev/null
+++ b/transcription/ocr.sh
@@ -0,0 +1,4 @@
+ocrfiles="%SIPObjectsDirectory%metadata/OCRfiles"
+test -d "$ocrfiles" || mkdir -p "$ocrfiles"
+
+tesseract "%fileFullName%" "$ocrfiles/%fileName%"
diff --git a/validation/jhove.py b/validation/jhove.py
new file mode 100644
index 0000000..2979133
--- /dev/null
+++ b/validation/jhove.py
@@ -0,0 +1,84 @@
+import json
+import subprocess
+import sys
+
+from lxml import etree
+
+class JhoveException(Exception):
+    pass
+
+def parse_jhove_data(target):
+    args = ['jhove', '-h', 'xml', target]
+    try:
+        output = subprocess.check_output(args)
+    except subprocess.CalledProcessError:
+        raise JhoveException("Jhove failed when running: " + ' '.join(args))
+
+    return etree.fromstring(output)
+
+def get_status(doc):
+    status = doc.find('.{http://hul.harvard.edu/ois/xml/ns/jhove}repInfo/{http://hul.harvard.edu/ois/xml/ns/jhove}status')
+    if status is None:
+        raise JhoveException("Unable to find status!")
+
+    return status.text
+
+def get_outcome(status, format=None):
+    # JHOVE returns "bytestream" for unrecognized file formats.
+    # That can include unrecognized or malformed PDFs, JPEG2000s, etc.
+    # Since we're whitelisting the formats we're passing in,
+    # "bytestream" indicates that the format is not in fact well-formed
+    # regardless of what the status reads.
+    if format == "bytestream":
+        return "fail"
+
+    if status == "Well-Formed and valid":
+        return "pass"
+    elif status == "Well-Formed, but not valid":
+        return "partial pass"
+    else:
+        return "fail"
+
+def get_format(doc):
+    format = doc.find('.{http://hul.harvard.edu/ois/xml/ns/jhove}repInfo/{http://hul.harvard.edu/ois/xml/ns/jhove}format')
+    version = doc.find('.{http://hul.harvard.edu/ois/xml/ns/jhove}repInfo/{http://hul.harvard.edu/ois/xml/ns/jhove}version')
+
+    if format is None:
+        format = "Not detected"
+    else:
+        format = format.text
+
+    if version is not None:
+        version = version.text
+
+    return (format, version)
+
+def format_event_outcome_detail_note(format, version, result):
+    note = 'format="{}";'.format(format)
+    if version is not None:
+        note = note + ' version="{}";'.format(version)
+    note = note + ' result="{}"'.format(result)
+
+    return note
+
+def main(target):
+    try:
+        doc = parse_jhove_data(target)
+        status = get_status(doc)
+        format, version = get_format(doc)
+        outcome = get_outcome(status, format)
+        note = format_event_outcome_detail_note(format, version, status)
+
+        out = {
+            "eventOutcomeInformation": outcome,
+            "eventOutcomeDetailNote": note
+        }
+        print json.dumps(out)
+
+        return 0
+    except JhoveException as e:
+        return e
+
+if __name__ == '__main__':
+    target = sys.argv[1]
+    sys.exit(main(target))
diff --git a/verification/file-exists.sh b/verification/file-exists.sh
new file mode 100644
index 0000000..ac96637
--- /dev/null
+++ b/verification/file-exists.sh
@@ -0,0 +1 @@
+test -f "%outputLocation%"
diff --git a/verification/file-has-nonzero-size.sh b/verification/file-has-nonzero-size.sh
new file mode 100644
index 0000000..f59a888
--- /dev/null
+++ b/verification/file-has-nonzero-size.sh
@@ -0,0 +1 @@
+test -s "%outputLocation%"