Skip to content
This repository has been archived by the owner on Sep 25, 2019. It is now read-only.

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
mistydemeo committed Dec 4, 2014
0 parents commit cb00bad
Show file tree
Hide file tree
Showing 37 changed files with 263 additions and 0 deletions.
1 change: 1 addition & 0 deletions characterization/exiftool.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Dump the file's metadata with exiftool; -X selects XML (RDF/XML) output.
exiftool -X "%fileFullName%"
1 change: 1 addition & 0 deletions characterization/ffprobe.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Describe the file with ffprobe: format, streams, chapters, data, private
# data, errors and library versions, printed as XML (-print_format xml).
ffprobe -i "%fileFullName%" -show_data -show_format -show_error -show_streams -show_chapters -show_private_data -show_versions -print_format xml
1 change: 1 addition & 0 deletions characterization/fits.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Run the FITS main class on the file through the `ng` client
# (presumably Nailgun - confirm).
# The path is quoted, like in the other command scripts, so a location that
# contains spaces is still passed as a single argument.
ng edu.harvard.hul.ois.fits.Fits -i "%relativeLocation%"
1 change: 1 addition & 0 deletions characterization/fiwalk.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Run fiwalk on the file using the plugin configuration given with -c.
# The path is quoted, like in the other command scripts, so a location that
# contains spaces is still passed as a single argument.
fiwalk -x "%relativeLocation%" -c /usr/lib/archivematica/archivematicaCommon/externals/fiwalk_plugins/ficonfig.txt
1 change: 1 addition & 0 deletions characterization/mediainfo.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Report the file's technical metadata with mediainfo as XML (--Output=XML),
# using the full report (-f) and raw field names (--Language=Raw).
mediainfo --Language=Raw -f --Output=XML "%fileFullName%"
1 change: 1 addition & 0 deletions event_detail/7z-extraction.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Event detail: program name plus the "Version" line(s) printed by 7z.
echo program=\"7z\"\; version=\"$(7z | grep Version)\"
1 change: 1 addition & 0 deletions event_detail/convert-normalization.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Event detail: program name plus the "Version:" line of `convert -version`.
echo program=\"convert\"\; version=\"$(convert -version | grep Version:)\"
1 change: 1 addition & 0 deletions event_detail/ffmpeg-normalization.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Event detail: program name plus the version line from ffmpeg's banner.
# The banner is written to stderr, hence the 2>&1 before grep.
echo program=\"ffmpeg\"\; version=\"$(ffmpeg 2>&1 | grep --ignore-case "FFmpeg version")\"
1 change: 1 addition & 0 deletions event_detail/ghostscript-normalization.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Event detail: program name plus the output of `gs --version`.
echo program=\"Ghostscript\"\; version=\"$(gs --version)\"
1 change: 1 addition & 0 deletions event_detail/inkscape-normalization.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Event detail: program name plus the output of `inkscape -V`.
echo program=\"inkscape\"\; version=\"$(inkscape -V)\"
1 change: 1 addition & 0 deletions event_detail/maildir-normalization.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Event detail for maildir normalization: prints the transcoder scripts
# directory, the input path and the target .mbox path, separated by spaces.
echo "/usr/lib/archivematica/transcoder/transcoderScripts/" "%fileFullName%" "%outputDirectory%%prefix%%fileName%%postfix%.mbox"
1 change: 1 addition & 0 deletions event_detail/ps2pdf-normalization.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Event detail: names both ps2pdf and Ghostscript as the program and reports
# the output of `gs --version` as the version.
echo program=\"ps2pdf\"\; program=\"Ghostscript\"\; version=\"$(gs --version)\"
1 change: 1 addition & 0 deletions event_detail/readpst-normalization.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Event detail: program name plus the output of `readpst -V`.
echo program=\"readpst\"\; version=\"$(readpst -V)\"
1 change: 1 addition & 0 deletions event_detail/unrar-extraction.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Event detail: program name plus the "UNRAR" banner line(s) that
# unrar-nonfree prints when run without arguments.
echo program=\"unrar-nonfree\"\; version=\"$(unrar-nonfree | grep 'UNRAR')\"
1 change: 1 addition & 0 deletions extraction/7z.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Extract the archive with 7z: x = extract with full paths, -bd = no progress
# indicator, -o = output directory (attached directly to the switch).
7z x -bd -o"%outputDirectory%" "%inputFile%"
16 changes: 16 additions & 0 deletions extraction/rar.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import os, subprocess, sys

def main(output_directory, compressed_file):
    """Extract ``compressed_file`` into ``output_directory`` using ``unrar``.

    Returns the exit status of ``unrar`` (0 on success) so that a failed
    extraction is reported to the caller instead of being discarded.  If
    changing directory or launching ``unrar`` raises, the exception object
    is returned instead; handing it to ``exit()`` prints it and exits
    non-zero.
    """
    # Note that unrar-free only extracts into the current working directory,
    # hence the os.chdir() here
    try:
        os.chdir(output_directory)
        args = ['unrar', '-x', compressed_file]
        return subprocess.call(args)
    except Exception as e:
        return e

if __name__ == '__main__':
    # Usage: rar.py <output_directory> <compressed_file>
    output_directory = sys.argv[1]
    compressed_file = sys.argv[2]
    # Use sys.exit() rather than the site-provided exit(), which is intended
    # for the interactive interpreter and is missing when site is not loaded.
    sys.exit(main(output_directory, compressed_file))
30 changes: 30 additions & 0 deletions extraction/tsk_recover.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from __future__ import print_function
import re
import subprocess
import sys

def extract(package, outdir):
    """Recover the allocated files of ``package`` into ``outdir``.

    Runs ``tsk_recover`` and inspects the first line of its output, which is
    expected to read ``Files Recovered: <count>``.

    Args:
        package: path passed to ``tsk_recover`` as the image to read.
        outdir: directory passed to ``tsk_recover`` as the output location.

    Returns:
        0 on success; otherwise the exception describing the failure (the
        caller hands it to ``sys.exit``, which prints it and exits non-zero).
    """
    try:
        # -a extracts only allocated files; we're not capturing unallocated files
        # universal_newlines=True makes the captured output text on both
        # Python 2 and Python 3, so the str regex below always applies.
        process = subprocess.Popen(
            ['tsk_recover', package, '-a', outdir],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE,
            stdin=subprocess.PIPE, universal_newlines=True)
        stdout, stderr = process.communicate()

        # Report the tool's own error text instead of an opaque IndexError
        # from indexing into empty output.
        if process.returncode != 0:
            raise Exception('tsk_recover exited {} with the message: {}'.format(
                process.returncode, stderr or stdout))

        lines = stdout.splitlines()
        if not lines:
            raise Exception('tsk_recover produced no output')

        match = re.match(r'Files Recovered: (\d+)', lines[0])
        if match:
            if match.group(1) == '0':
                raise Exception('tsk_recover failed to extract any files with the message: {}'.format(stdout))
            print(stdout)
    except Exception as e:
        return e

    return 0

def main(package, outdir):
    """Run the extraction and pass its result through unchanged."""
    outcome = extract(package, outdir)
    return outcome

if __name__ == '__main__':
    # Usage: tsk_recover.py <package> <outdir>
    package, outdir = sys.argv[1], sys.argv[2]
    sys.exit(main(package, outdir))
61 changes: 61 additions & 0 deletions id/fido.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import os.path
import re
import subprocess
import sys

def file_tool(path):
    """Return the ``file`` utility's description of ``path`` as text.

    ``-b`` (brief) keeps the file name out of the output.  Callers look for
    the word "text" in the result, so a name such as ``context.bin`` must not
    be part of it.  ``universal_newlines=True`` makes the result text on both
    Python 2 and Python 3.
    """
    description = subprocess.check_output(['file', '-b', path],
                                          universal_newlines=True)
    return description.strip()

class FidoFailed(Exception):
    """Error carrying fido's exit code and its captured stdout and stderr."""

    def __init__(self, stdout, stderr, retcode):
        # Same text as a triple-quoted block: it starts and ends with a
        # newline and has one line per field.
        template = (
            "\n"
            "Fido exited {retcode} and no format was found.\n"
            "stdout: {stdout}\n"
            "---\n"
            "stderr: {stderr}\n"
        )
        details = template.format(retcode=retcode, stdout=stdout, stderr=stderr)
        super(FidoFailed, self).__init__(details)

def identify(file_):
    """Identify ``file_`` with fido and return its format identifier (PUID).

    Only the first line of fido's comma-separated output is used: the third
    field is taken as the PUID, and a last field of ``"fail"`` means no
    format matched.

    Returns:
        The PUID when it looks like ``fmt/<number>`` (optionally prefixed,
        e.g. ``x-fmt/111``); otherwise an empty string, after a note has been
        written to stderr.

    Raises:
        FidoFailed: if fido exits non-zero, reports ``"fail"``, or prints a
            line with too few fields to contain a PUID.
    """
    # The default buffer size fido uses, 256KB, is too small to be able to detect certain formats
    # Formats like office documents and Adobe Illustrator .ai files will be identified as other, less-specific formats
    # This larger buffer size is a bit slower and consumes more RAM, so some users may wish to customize this to reduce the buffer size
    # See: https://projects.artefactual.com/issues/5941, https://projects.artefactual.com/issues/5731
    cmd = ['fido', '-bufsize', '1048576',
           '-loadformats', '/usr/lib/archivematica/archivematicaCommon/externals/fido/archivematica_format_extensions.xml',
           os.path.abspath(file_)]
    # universal_newlines=True makes stdout/stderr text on both Python 2 and 3.
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                               stdin=subprocess.PIPE, universal_newlines=True)
    stdout, stderr = process.communicate()

    results = stdout.split('\n')[0].split(',')

    # A line with fewer than three fields (e.g. empty output) has no PUID;
    # treat it as a fido failure rather than letting results[2] raise
    # IndexError.
    if process.returncode != 0 or results[-1] == '"fail"' or len(results) < 3:
        raise FidoFailed(stdout, stderr, process.returncode)

    puid = results[2]
    if re.match(r'(.+)?fmt/\d+', puid):
        return puid

    sys.stderr.write("File identified as non-standard Fido code: {id}\n".format(id=puid))
    return ""

def main(argv):
    """Print the format identifier for the file named in ``argv[1]``.

    Returns 0 when an identifier was printed, otherwise the exception that
    describes the failure (the caller passes it to the interpreter's exit
    function, which prints it and exits non-zero).
    """
    try:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(identify(argv[1]))
        return 0
    except FidoFailed as e:
        # FIDO can't currently identify text files with no extension, and this
        # is a common enough usecase to special-case it
        try:
            file_output = file_tool(argv[1])
        except (OSError, subprocess.CalledProcessError):
            # The fallback itself failed; report the original fido failure
            # instead of dying with a traceback.
            return e
        if 'text' in file_output:
            print('x-fmt/111')
            return 0
        return e
    except Exception as e:
        return e

if __name__ == '__main__':
    # Use sys.exit() rather than the site-provided exit(), which is intended
    # for the interactive interpreter and is missing when site is not loaded.
    sys.exit(main(sys.argv))
16 changes: 16 additions & 0 deletions id/file-by-extension.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import os.path
import subprocess
import sys

def file_tool(path):
    """Return the ``file`` utility's description of ``path`` as text.

    ``-b`` (brief) keeps the file name out of the output.  The script looks
    for the word "text" in the result, so a name such as ``context`` must not
    be part of it.  ``universal_newlines=True`` makes the result text on both
    Python 2 and Python 3.
    """
    description = subprocess.check_output(['file', '-b', path],
                                          universal_newlines=True)
    return description.strip()

# Print the lowercased extension of the path given as the first argument.
extension = os.path.splitext(sys.argv[1])[1]
if not extension:
    # Plaintext files frequently have no extension, but are common to identify.
    # file is pretty smart at figuring these out.
    if 'text' in file_tool(sys.argv[1]):
        print('.txt')
else:
    print(extension.lower())
1 change: 1 addition & 0 deletions normalization/access-h264.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Access copy: encode the video with libx264 (yuv420p pixel format, medium
# preset, CRF 18) into an .mp4 file.
ffmpeg -i "%fileFullName%" -vcodec libx264 -pix_fmt yuv420p -preset medium -crf 18 "%outputDirectory%%prefix%%fileName%%postfix%.mp4"
3 changes: 3 additions & 0 deletions normalization/access-jpeg.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Access copy: convert to JPEG with 4:4:4 sampling factor and quality 60,
# merging layers into a single image.
# The output path must be on the same line as the command: on a line of its
# own a shell would try to execute it as a separate command.
convert "%fileFullName%" -sampling-factor 4:4:4 -quality 60 -layers merge "%outputDirectory%%prefix%%fileName%%postfix%.jpg"

1 change: 1 addition & 0 deletions normalization/access-mp3.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Access copy: encode the audio to an .mp3 file with 2 channels (-ac 2) at
# an audio bitrate of 192000 (-ab).
ffmpeg -i "%fileFullName%" -ac 2 -ab 192000 "%outputDirectory%%prefix%%fileName%%postfix%.mp3"
1 change: 1 addition & 0 deletions normalization/default-access.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Default access copy: copy the input as-is to the output name, keeping its
# original extension.
cp -R "%inputFile%" "%outputDirectory%%prefix%%fileName%%postfix%%fileExtensionWithDot%"
1 change: 1 addition & 0 deletions normalization/default-thumbnail.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Default thumbnail: copy the stock default.jpg icon to the thumbnail path.
cp -R "/var/archivematica/sharedDirectory/sharedMicroServiceTasksConfigs/transcoder/defaultIcons/default.jpg" "%outputDirectory%%postfix%.jpg"
3 changes: 3 additions & 0 deletions normalization/jpeg-thumbnail.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Thumbnail: scale the image to fit within 100x100 and merge layers into a
# single JPEG.
# The output path must be on the same line as the command: on a line of its
# own a shell would try to execute it as a separate command.
convert "%fileFullName%" -thumbnail 100x100 -layers merge "%outputDirectory%%postfix%.jpg"

1 change: 1 addition & 0 deletions normalization/maildir-to-mbox.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Call the archivematicaMaildirToMbox.py client script with the input path
# and the target .mbox path.
"/usr/lib/archivematica/MCPClient/clientScripts/archivematicaMaildirToMbox.py" "%fileFullName%" "%outputDirectory%%prefix%%fileName%%postfix%.mbox"
16 changes: 16 additions & 0 deletions normalization/preservation-ffv1.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash
# Preservation copy: FFV1 (level 3) video with pcm_s16le audio, written to an
# .mkv file.

inputFile="%fileFullName%"
outputFile="%outputDirectory%%prefix%%fileName%%postfix%.mkv"
audioCodec="pcm_s16le"
videoCodec=(ffv1 -level 3)

# Build the command as an array instead of a string passed to eval, so that
# spaces or shell metacharacters in the input and output paths can neither
# split an argument nor be interpreted a second time by the shell.
command=(ffmpeg -vsync passthrough -i "${inputFile}")
command+=(-vcodec "${videoCodec[@]}" -g 1)
command+=(-acodec "${audioCodec}")
command+=("${outputFile}")

# Log the command in shell-quoted form, then run it; the script's exit status
# is ffmpeg's.
printf '%q ' "${command[@]}"
echo
"${command[@]}"
1 change: 1 addition & 0 deletions normalization/preservation-pdfa-ghostscript.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Preservation copy: write a PDF with Ghostscript's pdfwrite device in PDF/A
# mode (-dPDFA); -dBATCH and -dNOPAUSE make the run non-interactive.
gs -dPDFA -dBATCH -dNOPAUSE -sDEVICE=pdfwrite -dPDFACompatibilityPolicy=1 -sOutputFile="%outputDirectory%%prefix%%fileName%%postfix%.pdf" "%fileFullName%"
1 change: 1 addition & 0 deletions normalization/preservation-pdfa-inkscape.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Preservation copy: export the input to a PDF with Inkscape, without opening
# a GUI (-z).
inkscape -z "%fileFullName%" --export-pdf="%outputDirectory%%prefix%%fileName%%postfix%.pdf"
1 change: 1 addition & 0 deletions normalization/preservation-pdfa-ps2pdf.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Preservation copy: convert the input to PDF with ps2pdf, passing the
# -dEPSCrop and -dPDFA options through to Ghostscript.
ps2pdf -dEPSCrop -dPDFA "%fileFullName%" "%outputDirectory%%prefix%%fileName%%postfix%.pdf"
3 changes: 3 additions & 0 deletions normalization/preservation-svg.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Preservation copy: export the input as plain SVG with Inkscape, then make
# the result readable, writable and executable by everyone.
# NOTE(review): both commands run under sudo and the output ends up mode 777,
# presumably so the calling (non-root) service user can manage a file created
# by root - confirm, and consider a narrower mode and running Inkscape
# unprivileged, since the input files are not trusted.
sudo /usr/bin/inkscape "%fileFullName%" --export-plain-svg="%outputDirectory%%prefix%%fileName%%postfix%.svg"
sudo chmod 777 "%outputDirectory%%prefix%%fileName%%postfix%.svg"

1 change: 1 addition & 0 deletions normalization/preservation-tiff.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Preservation copy: convert to an uncompressed (+compress) .tif file.
convert "%fileFullName%" +compress "%outputDirectory%%prefix%%fileName%%postfix%.tif"
1 change: 1 addition & 0 deletions normalization/preservation-wav.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Preservation copy: convert the audio to a .wav file using ffmpeg's defaults
# for that output format.
ffmpeg -i "%fileFullName%" "%outputDirectory%%prefix%%fileName%%postfix%.wav"
4 changes: 4 additions & 0 deletions transcription/ocr.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Run tesseract on the file, writing its output under metadata/OCRfiles
# inside the SIP objects directory.
ocrfiles="%SIPObjectsDirectory%metadata/OCRfiles"
# mkdir -p already succeeds when the directory exists, so no separate
# existence test is needed.
mkdir -p "$ocrfiles"

# The input path is quoted so a name containing spaces or glob characters is
# passed to tesseract as a single argument.
tesseract "%fileFullName%" "$ocrfiles/%fileName%"
84 changes: 84 additions & 0 deletions validation/jhove.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import json
import subprocess
import sys

from lxml import etree

class JhoveException(Exception):
    """Error raised by this script's JHOVE helpers."""

def parse_jhove_data(target):
    """Run JHOVE on ``target`` and return its XML report as a parsed element.

    JHOVE is invoked with its ``xml`` output handler (``-h xml``) and the
    captured output is parsed with ``etree.fromstring``.

    Raises:
        JhoveException: if ``jhove`` cannot be started, exits non-zero, or
            prints output that cannot be parsed as XML.  Everything is
            reported as JhoveException because that is the only error type
            this script's ``main`` turns into a clean failure.
    """
    args = ['jhove', '-h', 'xml', target]
    try:
        output = subprocess.check_output(args)
    except subprocess.CalledProcessError:
        raise JhoveException("Jhove failed when running: " + ' '.join(args))
    except OSError as e:
        # Raised when the executable cannot be launched at all.
        raise JhoveException("Jhove could not be started: {}".format(e))

    try:
        return etree.fromstring(output)
    except etree.XMLSyntaxError as e:
        raise JhoveException("Jhove output could not be parsed as XML: {}".format(e))

def get_status(doc):
    """Return the text of the ``status`` element found under ``repInfo``.

    Raises JhoveException when the document has no such element.
    """
    status_node = doc.find('.{http://hul.harvard.edu/ois/xml/ns/jhove}repInfo/{http://hul.harvard.edu/ois/xml/ns/jhove}status')
    if status_node is not None:
        return status_node.text

    raise JhoveException("Unable to find status!")

def get_outcome(status, format=None):
    """Map a JHOVE status string (and detected format) to an event outcome.

    Returns "pass", "partial pass" or "fail".
    """
    # JHOVE returns "bytestream" for unrecognized file formats.
    # That can include unrecognized or malformed PDFs, JPEG2000s, etc.
    # Since we're whitelisting the formats we're passing in,
    # "bytestream" indicates that the format is not in fact well-formed
    # regardless of what the status reads.
    if format == "bytestream":
        return "fail"

    # Any status not listed here counts as a failure.
    outcomes = {
        "Well-Formed and valid": "pass",
        "Well-Formed, but not valid": "partial pass",
    }
    return outcomes.get(status, "fail")

def get_format(doc):
    """Return ``(format, version)`` read from the ``repInfo`` element.

    The format falls back to "Not detected" and the version to None when the
    corresponding element is absent.
    """
    format_node = doc.find('.{http://hul.harvard.edu/ois/xml/ns/jhove}repInfo/{http://hul.harvard.edu/ois/xml/ns/jhove}format')
    version_node = doc.find('.{http://hul.harvard.edu/ois/xml/ns/jhove}repInfo/{http://hul.harvard.edu/ois/xml/ns/jhove}version')

    format_name = "Not detected" if format_node is None else format_node.text
    version_text = None if version_node is None else version_node.text

    return (format_name, version_text)

def format_event_outcome_detail_note(format, version, result):
    """Build the ``format="..."; version="..."; result="..."`` note string.

    The version part is left out when ``version`` is None.
    """
    pieces = ['format="{}";'.format(format)]
    if version is not None:
        pieces.append(' version="{}";'.format(version))
    pieces.append(' result="{}"'.format(result))

    return ''.join(pieces)

def main(target):
    """Validate ``target`` with JHOVE and print the outcome as JSON.

    Returns 0 once the JSON has been printed, or the JhoveException that
    stopped the run.
    """
    try:
        doc = parse_jhove_data(target)
        status = get_status(doc)
        format_name, version = get_format(doc)

        out = {
            "eventOutcomeInformation": get_outcome(status, format_name),
            "eventOutcomeDetailNote": format_event_outcome_detail_note(format_name, version, status),
        }
        print(json.dumps(out))
    except JhoveException as e:
        return e

    return 0

if __name__ == '__main__':
    # Usage: jhove.py <target>
    target = sys.argv[1]
    exit_status = main(target)
    sys.exit(exit_status)
1 change: 1 addition & 0 deletions verification/file-exists.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Succeed only if the output location exists and is a regular file.
[ -f "%outputLocation%" ]
1 change: 1 addition & 0 deletions verification/file-has-nonzero-size.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Succeed only if the output location exists and has a size greater than zero.
[ -s "%outputLocation%" ]

0 comments on commit cb00bad

Please sign in to comment.