From cb00bad3b9eb3a2b07c6189143f847d33fa318d0 Mon Sep 17 00:00:00 2001
From: Misty De Meo <mistydemeo@gmail.com>
Date: Wed, 3 Dec 2014 18:35:57 -0800
Subject: [PATCH] Initial commit

---
 characterization/exiftool.sh                  |  1 +
 characterization/ffprobe.sh                   |  1 +
 characterization/fits.sh                      |  1 +
 characterization/fiwalk.sh                    |  1 +
 characterization/mediainfo.sh                 |  1 +
 event_detail/7z-extraction.sh                 |  1 +
 event_detail/convert-normalization.sh         |  1 +
 event_detail/ffmpeg-normalization.sh          |  1 +
 event_detail/ghostscript-normalization.sh     |  1 +
 event_detail/inkscape-normalization.sh        |  1 +
 event_detail/maildir-normalization.sh         |  1 +
 event_detail/ps2pdf-normalization.sh          |  1 +
 event_detail/readpst-normalization.sh         |  1 +
 event_detail/unrar-extraction.sh              |  1 +
 extraction/7z.sh                              |  1 +
 extraction/rar.py                             | 16 ++++
 extraction/tsk_recover.py                     | 30 +++++++
 id/fido.py                                    | 61 ++++++++++++++
 id/file-by-extension.py                       | 16 ++++
 normalization/access-h264.sh                  |  1 +
 normalization/access-jpeg.sh                  |  3 +
 normalization/access-mp3.sh                   |  1 +
 normalization/default-access.sh               |  1 +
 normalization/default-thumbnail.sh            |  1 +
 normalization/jpeg-thumbnail.sh               |  3 +
 normalization/maildir-to-mbox.sh              |  1 +
 normalization/preservation-ffv1.sh            | 16 ++++
 .../preservation-pdfa-ghostscript.sh          |  1 +
 normalization/preservation-pdfa-inkscape.sh   |  1 +
 normalization/preservation-pdfa-ps2pdf.sh     |  1 +
 normalization/preservation-svg.sh             |  3 +
 normalization/preservation-tiff.sh            |  1 +
 normalization/preservation-wav.sh             |  1 +
 transcription/ocr.sh                          |  4 +
 validation/jhove.py                           | 84 +++++++++++++++++++
 verification/file-exists.sh                   |  1 +
 verification/file-has-nonzero-size.sh         |  1 +
 37 files changed, 263 insertions(+)
 create mode 100755 characterization/exiftool.sh
 create mode 100755 characterization/ffprobe.sh
 create mode 100755 characterization/fits.sh
 create mode 100755 characterization/fiwalk.sh
 create mode 100755 characterization/mediainfo.sh
 create mode 100644 event_detail/7z-extraction.sh
 create mode 100644 event_detail/convert-normalization.sh
 create mode 100644 event_detail/ffmpeg-normalization.sh
 create mode 100644 event_detail/ghostscript-normalization.sh
 create mode 100644 event_detail/inkscape-normalization.sh
 create mode 100644 event_detail/maildir-normalization.sh
 create mode 100644 event_detail/ps2pdf-normalization.sh
 create mode 100644 event_detail/readpst-normalization.sh
 create mode 100644 event_detail/unrar-extraction.sh
 create mode 100644 extraction/7z.sh
 create mode 100644 extraction/rar.py
 create mode 100644 extraction/tsk_recover.py
 create mode 100644 id/fido.py
 create mode 100644 id/file-by-extension.py
 create mode 100644 normalization/access-h264.sh
 create mode 100644 normalization/access-jpeg.sh
 create mode 100644 normalization/access-mp3.sh
 create mode 100644 normalization/default-access.sh
 create mode 100644 normalization/default-thumbnail.sh
 create mode 100644 normalization/jpeg-thumbnail.sh
 create mode 100644 normalization/maildir-to-mbox.sh
 create mode 100644 normalization/preservation-ffv1.sh
 create mode 100644 normalization/preservation-pdfa-ghostscript.sh
 create mode 100644 normalization/preservation-pdfa-inkscape.sh
 create mode 100644 normalization/preservation-pdfa-ps2pdf.sh
 create mode 100644 normalization/preservation-svg.sh
 create mode 100644 normalization/preservation-tiff.sh
 create mode 100644 normalization/preservation-wav.sh
 create mode 100644 transcription/ocr.sh
 create mode 100644 validation/jhove.py
 create mode 100644 verification/file-exists.sh
 create mode 100644 verification/file-has-nonzero-size.sh

diff --git a/characterization/exiftool.sh b/characterization/exiftool.sh
new file mode 100755
index 0000000..830b82c
--- /dev/null
+++ b/characterization/exiftool.sh
@@ -0,0 +1 @@
+exiftool -X "%fileFullName%"
diff --git a/characterization/ffprobe.sh b/characterization/ffprobe.sh
new file mode 100755
index 0000000..fb34f8f
--- /dev/null
+++ b/characterization/ffprobe.sh
@@ -0,0 +1 @@
+ffprobe -i "%fileFullName%" -show_data -show_format -show_error -show_streams -show_chapters -show_private_data -show_versions -print_format xml
diff --git a/characterization/fits.sh b/characterization/fits.sh
new file mode 100755
index 0000000..b1ba43e
--- /dev/null
+++ b/characterization/fits.sh
@@ -0,0 +1 @@
+ng edu.harvard.hul.ois.fits.Fits -i "%relativeLocation%"
diff --git a/characterization/fiwalk.sh b/characterization/fiwalk.sh
new file mode 100755
index 0000000..d0a2ab5
--- /dev/null
+++ b/characterization/fiwalk.sh
@@ -0,0 +1 @@
+fiwalk -x "%relativeLocation%" -c /usr/lib/archivematica/archivematicaCommon/externals/fiwalk_plugins/ficonfig.txt
diff --git a/characterization/mediainfo.sh b/characterization/mediainfo.sh
new file mode 100755
index 0000000..bbed01e
--- /dev/null
+++ b/characterization/mediainfo.sh
@@ -0,0 +1 @@
+mediainfo --Language=Raw -f --Output=XML "%fileFullName%"
diff --git a/event_detail/7z-extraction.sh b/event_detail/7z-extraction.sh
new file mode 100644
index 0000000..e444993
--- /dev/null
+++ b/event_detail/7z-extraction.sh
@@ -0,0 +1 @@
+echo program=\"7z\"\; version=\"`7z | grep Version`\"
diff --git a/event_detail/convert-normalization.sh b/event_detail/convert-normalization.sh
new file mode 100644
index 0000000..5e40980
--- /dev/null
+++ b/event_detail/convert-normalization.sh
@@ -0,0 +1 @@
+echo program=\"convert\"\; version=\"`convert -version | grep Version:`\"
diff --git a/event_detail/ffmpeg-normalization.sh b/event_detail/ffmpeg-normalization.sh
new file mode 100644
index 0000000..ec85e92
--- /dev/null
+++ b/event_detail/ffmpeg-normalization.sh
@@ -0,0 +1 @@
+echo program=\"ffmpeg\"\; version=\"`ffmpeg 2>&1 | grep --ignore-case "FFmpeg version"`\"
diff --git a/event_detail/ghostscript-normalization.sh b/event_detail/ghostscript-normalization.sh
new file mode 100644
index 0000000..3cc2fff
--- /dev/null
+++ b/event_detail/ghostscript-normalization.sh
@@ -0,0 +1 @@
+echo program=\"Ghostscript\"\; version=\"`gs --version`\"
diff --git a/event_detail/inkscape-normalization.sh b/event_detail/inkscape-normalization.sh
new file mode 100644
index 0000000..a4d1448
--- /dev/null
+++ b/event_detail/inkscape-normalization.sh
@@ -0,0 +1 @@
+echo program=\"inkscape\"\; version=\"`inkscape -V`\"
diff --git a/event_detail/maildir-normalization.sh b/event_detail/maildir-normalization.sh
new file mode 100644
index 0000000..3d9f516
--- /dev/null
+++ b/event_detail/maildir-normalization.sh
@@ -0,0 +1 @@
+echo "/usr/lib/archivematica/transcoder/transcoderScripts/" "%fileFullName%" "%outputDirectory%%prefix%%fileName%%postfix%.mbox"
diff --git a/event_detail/ps2pdf-normalization.sh b/event_detail/ps2pdf-normalization.sh
new file mode 100644
index 0000000..11cf765
--- /dev/null
+++ b/event_detail/ps2pdf-normalization.sh
@@ -0,0 +1 @@
+echo program=\"ps2pdf\"\; program=\"Ghostscript\"\; version=\"`gs --version`\"
diff --git a/event_detail/readpst-normalization.sh b/event_detail/readpst-normalization.sh
new file mode 100644
index 0000000..87ab9e0
--- /dev/null
+++ b/event_detail/readpst-normalization.sh
@@ -0,0 +1 @@
+echo program=\"readpst\"\; version=\"`readpst -V`\"
diff --git a/event_detail/unrar-extraction.sh b/event_detail/unrar-extraction.sh
new file mode 100644
index 0000000..31d255b
--- /dev/null
+++ b/event_detail/unrar-extraction.sh
@@ -0,0 +1 @@
+echo program=\"unrar-nonfree\"\; version=\"`unrar-nonfree | grep 'UNRAR'`\"
diff --git a/extraction/7z.sh b/extraction/7z.sh
new file mode 100644
index 0000000..1712b00
--- /dev/null
+++ b/extraction/7z.sh
@@ -0,0 +1 @@
+7z x -bd -o"%outputDirectory%" "%inputFile%"
diff --git a/extraction/rar.py b/extraction/rar.py
new file mode 100644
index 0000000..17dc4b6
--- /dev/null
+++ b/extraction/rar.py
@@ -0,0 +1,16 @@
+import os, subprocess, sys
+
+def main(output_directory, compressed_file):
+    """Unpack compressed_file into output_directory; return unrar's exit code or the error."""
+    # unrar-free only extracts into the cwd; resolve the archive path before chdir.
+    try:
+        archive = os.path.abspath(compressed_file)
+        os.chdir(output_directory)
+        return subprocess.call(['unrar', '-x', archive])
+    except Exception as e:
+        return e
+
+if __name__ == '__main__':
+    # Usage: rar.py <output directory> <archive>
+    output_directory, compressed_file = sys.argv[1], sys.argv[2]
+    sys.exit(main(output_directory, compressed_file))
diff --git a/extraction/tsk_recover.py b/extraction/tsk_recover.py
new file mode 100644
index 0000000..ee265ac
--- /dev/null
+++ b/extraction/tsk_recover.py
@@ -0,0 +1,30 @@
+from __future__ import print_function
+import re
+import subprocess
+import sys
+
+def extract(package, outdir):
+    """Run tsk_recover on package into outdir; return 0 or the Exception raised."""
+    # -a extracts only allocated files; we're not capturing unallocated files
+    try:
+        process = subprocess.Popen(['tsk_recover', package, '-a', outdir], stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
+        stdout, stderr = process.communicate()
+        if process.returncode != 0:
+            raise Exception('tsk_recover exited with status {}: {}'.format(process.returncode, stderr))
+        match = re.search(r'^Files Recovered: (\d+)', stdout, re.MULTILINE)
+        if match:
+            if match.group(1) == '0':
+                raise Exception('tsk_recover failed to extract any files with the message: {}'.format(stdout))
+            print(stdout)
+    except Exception as e:
+        return e
+    return 0
+
+def main(package, outdir):
+    """Command-line entry point: hand both arguments straight to extract()."""
+    return extract(package, outdir)
+
+if __name__ == '__main__':
+    # Usage: tsk_recover.py <package> <output directory>
+    sys.exit(main(sys.argv[1], sys.argv[2]))
diff --git a/id/fido.py b/id/fido.py
new file mode 100644
index 0000000..455d17f
--- /dev/null
+++ b/id/fido.py
@@ -0,0 +1,61 @@
+import os.path
+import re
+import subprocess
+import sys
+
+def file_tool(path):  # description printed by file(1) for path, as stripped text
+    return subprocess.check_output(['file', path], universal_newlines=True).strip()
+
+class FidoFailed(Exception):
+    """Raised when fido exits with an error or reports no matching format."""
+    def __init__(self, stdout, stderr, retcode):
+        template = (" \nFido exited {retcode} and no format was found.\n"
+                    "stdout: {stdout}\n"
+                    "---\n"
+                    "stderr: {stderr}\n")
+        details = template.format(stdout=stdout, stderr=stderr, retcode=retcode)
+        super(FidoFailed, self).__init__(details)
+
+def identify(file_):
+    """Return the PUID fido reports for file_, or "" for a non-standard code.
+
+    Raises FidoFailed if fido exits non-zero, reports "fail", or prints no result.
+    """
+    # The default buffer size fido uses, 256KB, is too small to be able to detect certain formats
+    # Formats like office documents and Adobe Illustrator .ai files will be identified as other, less-specific formats
+    # This larger buffer size is a bit slower and consumes more RAM, so some users may wish to customize this to reduce the buffer size
+    # See: https://projects.artefactual.com/issues/5941, https://projects.artefactual.com/issues/5731
+    cmd = ['fido', '-bufsize', '1048576',
+           '-loadformats', '/usr/lib/archivematica/archivematicaCommon/externals/fido/archivematica_format_extensions.xml',
+           os.path.abspath(file_)]
+    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, universal_newlines=True)
+    stdout, stderr = process.communicate()
+
+    # Only the first line is used; its comma-separated third field is the PUID.
+    results = stdout.split('\n')[0].split(',')
+    if process.returncode != 0 or results[-1] == '"fail"' or len(results) < 3:
+        raise FidoFailed(stdout, stderr, process.returncode)
+
+    puid = results[2]
+    if re.match(r'(.+)?fmt/\d+', puid):
+        return puid
+    sys.stderr.write("File identified as non-standard Fido code: {id}\n".format(id=puid))
+    return ""
+
+def main(argv):
+    """Print the identified format for argv[1]; return 0, or the error on failure."""
+    try:
+        print(identify(argv[1]))
+    except FidoFailed as e:
+        # FIDO can't currently identify text files with no extension, and this
+        # is a common enough usecase to special-case it
+        if 'text' not in file_tool(argv[1]):
+            return e
+        print('x-fmt/111')
+    except Exception as e:
+        return e
+
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main(sys.argv))
diff --git a/id/file-by-extension.py b/id/file-by-extension.py
new file mode 100644
index 0000000..bc57f45
--- /dev/null
+++ b/id/file-by-extension.py
@@ -0,0 +1,16 @@
+import os.path
+import subprocess
+import sys
+
+def file_tool(path):  # description printed by file(1) for path, as stripped text
+    return subprocess.check_output(['file', path], universal_newlines=True).strip()
+
+# Print the lower-cased extension of argv[1]; when there is none, fall back
+# to file(1) so extensionless plaintext is still reported as ".txt".
+extension = os.path.splitext(sys.argv[1])[1]
+if extension:
+    print(extension.lower())
+elif 'text' in file_tool(sys.argv[1]):
+    # Plaintext files frequently have no extension, but are common to identify.
+    # file is pretty smart at figuring these out.
+    print('.txt')
diff --git a/normalization/access-h264.sh b/normalization/access-h264.sh
new file mode 100644
index 0000000..0c4bbda
--- /dev/null
+++ b/normalization/access-h264.sh
@@ -0,0 +1 @@
+ffmpeg -i "%fileFullName%" -vcodec libx264 -pix_fmt yuv420p -preset medium -crf 18 "%outputDirectory%%prefix%%fileName%%postfix%.mp4"
diff --git a/normalization/access-jpeg.sh b/normalization/access-jpeg.sh
new file mode 100644
index 0000000..674e770
--- /dev/null
+++ b/normalization/access-jpeg.sh
@@ -0,0 +1,3 @@
+convert "%fileFullName%" -sampling-factor 4:4:4 -quality 60 -layers merge
+ "%outputDirectory%%prefix%%fileName%%postfix%.jpg"
+ 
\ No newline at end of file
diff --git a/normalization/access-mp3.sh b/normalization/access-mp3.sh
new file mode 100644
index 0000000..4721fd5
--- /dev/null
+++ b/normalization/access-mp3.sh
@@ -0,0 +1 @@
+ffmpeg -i "%fileFullName%" -ac 2 -ab 192000 "%outputDirectory%%prefix%%fileName%%postfix%.mp3"
diff --git a/normalization/default-access.sh b/normalization/default-access.sh
new file mode 100644
index 0000000..7c02754
--- /dev/null
+++ b/normalization/default-access.sh
@@ -0,0 +1 @@
+cp -R "%inputFile%" "%outputDirectory%%prefix%%fileName%%postfix%%fileExtensionWithDot%"
diff --git a/normalization/default-thumbnail.sh b/normalization/default-thumbnail.sh
new file mode 100644
index 0000000..9f186bc
--- /dev/null
+++ b/normalization/default-thumbnail.sh
@@ -0,0 +1 @@
+cp -R "/var/archivematica/sharedDirectory/sharedMicroServiceTasksConfigs/transcoder/defaultIcons/default.jpg" "%outputDirectory%%postfix%.jpg"
diff --git a/normalization/jpeg-thumbnail.sh b/normalization/jpeg-thumbnail.sh
new file mode 100644
index 0000000..5dce0a0
--- /dev/null
+++ b/normalization/jpeg-thumbnail.sh
@@ -0,0 +1,3 @@
+convert "%fileFullName%" -thumbnail 100x100 -layers merge
+ "%outputDirectory%%postfix%.jpg"
+ 
\ No newline at end of file
diff --git a/normalization/maildir-to-mbox.sh b/normalization/maildir-to-mbox.sh
new file mode 100644
index 0000000..e644c30
--- /dev/null
+++ b/normalization/maildir-to-mbox.sh
@@ -0,0 +1 @@
+"/usr/lib/archivematica/MCPClient/clientScripts/archivematicaMaildirToMbox.py" "%fileFullName%" "%outputDirectory%%prefix%%fileName%%postfix%.mbox"
diff --git a/normalization/preservation-ffv1.sh b/normalization/preservation-ffv1.sh
new file mode 100644
index 0000000..e1ef41f
--- /dev/null
+++ b/normalization/preservation-ffv1.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Transcode the input to FFV1 (level 3) video with pcm_s16le audio in an .mkv file.
+
+inputFile="%fileFullName%"
+outputFile="%outputDirectory%%prefix%%fileName%%postfix%.mkv"
+audioCodec="pcm_s16le"
+videoCodec=(ffv1 -level 3)
+
+# Build the command as an array so paths containing spaces or shell
+# metacharacters reach ffmpeg as single arguments, with no eval re-parsing.
+command=(ffmpeg -vsync passthrough -i "${inputFile}"
+         -vcodec "${videoCodec[@]}" -g 1
+         -acodec "${audioCodec}" "${outputFile}")
+
+echo "${command[@]}"
+"${command[@]}"
diff --git a/normalization/preservation-pdfa-ghostscript.sh b/normalization/preservation-pdfa-ghostscript.sh
new file mode 100644
index 0000000..999b5ed
--- /dev/null
+++ b/normalization/preservation-pdfa-ghostscript.sh
@@ -0,0 +1 @@
+gs -dPDFA -dBATCH -dNOPAUSE -sDEVICE=pdfwrite -dPDFACompatibilityPolicy=1 -sOutputFile="%outputDirectory%%prefix%%fileName%%postfix%.pdf" "%fileFullName%"
diff --git a/normalization/preservation-pdfa-inkscape.sh b/normalization/preservation-pdfa-inkscape.sh
new file mode 100644
index 0000000..6f625cd
--- /dev/null
+++ b/normalization/preservation-pdfa-inkscape.sh
@@ -0,0 +1 @@
+inkscape -z "%fileFullName%" --export-pdf="%outputDirectory%%prefix%%fileName%%postfix%.pdf"
diff --git a/normalization/preservation-pdfa-ps2pdf.sh b/normalization/preservation-pdfa-ps2pdf.sh
new file mode 100644
index 0000000..d499388
--- /dev/null
+++ b/normalization/preservation-pdfa-ps2pdf.sh
@@ -0,0 +1 @@
+ps2pdf -dEPSCrop -dPDFA "%fileFullName%" "%outputDirectory%%prefix%%fileName%%postfix%.pdf"
diff --git a/normalization/preservation-svg.sh b/normalization/preservation-svg.sh
new file mode 100644
index 0000000..c2c2642
--- /dev/null
+++ b/normalization/preservation-svg.sh
@@ -0,0 +1,3 @@
+sudo /usr/bin/inkscape "%fileFullName%" --export-plain-svg="%outputDirectory%%prefix%%fileName%%postfix%.svg"
+ sudo chmod 777 "%outputDirectory%%prefix%%fileName%%postfix%.svg"
+ 
\ No newline at end of file
diff --git a/normalization/preservation-tiff.sh b/normalization/preservation-tiff.sh
new file mode 100644
index 0000000..e9b6a7f
--- /dev/null
+++ b/normalization/preservation-tiff.sh
@@ -0,0 +1 @@
+convert "%fileFullName%" +compress "%outputDirectory%%prefix%%fileName%%postfix%.tif"
diff --git a/normalization/preservation-wav.sh b/normalization/preservation-wav.sh
new file mode 100644
index 0000000..1713431
--- /dev/null
+++ b/normalization/preservation-wav.sh
@@ -0,0 +1 @@
+ffmpeg -i "%fileFullName%" "%outputDirectory%%prefix%%fileName%%postfix%.wav"
diff --git a/transcription/ocr.sh b/transcription/ocr.sh
new file mode 100644
index 0000000..9831706
--- /dev/null
+++ b/transcription/ocr.sh
@@ -0,0 +1,4 @@
+# Write the OCR output under the SIP's metadata/OCRfiles directory.
+ocrfiles="%SIPObjectsDirectory%metadata/OCRfiles"
+mkdir -p "$ocrfiles"
+tesseract "%fileFullName%" "$ocrfiles/%fileName%"
diff --git a/validation/jhove.py b/validation/jhove.py
new file mode 100644
index 0000000..2979133
--- /dev/null
+++ b/validation/jhove.py
@@ -0,0 +1,84 @@
+import json
+import subprocess
+import sys
+
+from lxml import etree
+
+class JhoveException(Exception):
+    """Raised when running JHOVE fails or its report lacks a status element."""
+
+def parse_jhove_data(target):
+    """Run jhove on target and return its XML report parsed into an lxml element."""
+    args = ['jhove', '-h', 'xml', target]
+    try:
+        return etree.fromstring(subprocess.check_output(args))
+    except (OSError, subprocess.CalledProcessError, etree.XMLSyntaxError) as e:
+        # Surface every failure mode as JhoveException, the only type main() handles.
+        raise JhoveException("Jhove failed when running: {} ({})".format(' '.join(args), e))
+
+def get_status(doc):
+    """Return the text of the report's repInfo/status element; raise if it is absent."""
+    node = doc.find('.{ns}repInfo/{ns}status'.format(ns='{http://hul.harvard.edu/ois/xml/ns/jhove}'))
+    if node is not None:
+        return node.text
+    raise JhoveException("Unable to find status!")
+
+def get_outcome(status, format=None):
+    """Translate a JHOVE status (and detected format) into an outcome string."""
+    # JHOVE returns "bytestream" for unrecognized file formats.
+    # That can include unrecognized or malformed PDFs, JPEG2000s, etc.
+    # Since we're whitelisting the formats we're passing in,
+    # "bytestream" indicates that the format is not in fact well-formed
+    # regardless of what the status reads.
+    if format == "bytestream":
+        return "fail"
+
+    # Only the two "Well-Formed" statuses pass (fully or partially);
+    # every other status is treated as a failure.
+    if status == "Well-Formed and valid":
+        return "pass"
+    return "partial pass" if status == "Well-Formed, but not valid" else "fail"
+
+def get_format(doc):
+    """Return a (format, version) tuple read from the report's repInfo element.
+
+    format falls back to "Not detected"; version is None when absent.
+    """
+    prefix = '.{http://hul.harvard.edu/ois/xml/ns/jhove}repInfo/{http://hul.harvard.edu/ois/xml/ns/jhove}'
+    format_node = doc.find(prefix + 'format')
+    version_node = doc.find(prefix + 'version')
+
+    format_name = "Not detected" if format_node is None else format_node.text
+    version = None if version_node is None else version_node.text
+
+    return (format_name, version)
+
+def format_event_outcome_detail_note(format, version, result):
+    """Build the note 'format="..."; version="..."; result="..."' (version optional)."""
+    parts = ['format="{}";'.format(format)]
+    if version is not None:
+        parts.append('version="{}";'.format(version))
+    parts.append('result="{}"'.format(result))
+    return ' '.join(parts)
+
+def main(target):
+    """Validate target with JHOVE and print the outcome as a JSON object.
+
+    Returns 0 on success, or the JhoveException describing the failure.
+    """
+    try:
+        doc = parse_jhove_data(target)
+        status = get_status(doc)
+        format, version = get_format(doc)
+    except JhoveException as e:
+        return e
+
+    print(json.dumps({
+        "eventOutcomeInformation": get_outcome(status, format),
+        "eventOutcomeDetailNote": format_event_outcome_detail_note(format, version, status),
+    }))
+    return 0
+
+if __name__ == '__main__':
+    # argv[1] is the path handed to JHOVE for validation
+    sys.exit(main(sys.argv[1]))
diff --git a/verification/file-exists.sh b/verification/file-exists.sh
new file mode 100644
index 0000000..ac96637
--- /dev/null
+++ b/verification/file-exists.sh
@@ -0,0 +1 @@
+test -f "%outputLocation%"
diff --git a/verification/file-has-nonzero-size.sh b/verification/file-has-nonzero-size.sh
new file mode 100644
index 0000000..f59a888
--- /dev/null
+++ b/verification/file-has-nonzero-size.sh
@@ -0,0 +1 @@
+test -s "%outputLocation%"