-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy patheoat-ocr.sh
executable file
·73 lines (56 loc) · 2.38 KB
/
eoat-ocr.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/usr/bin/env bash
# eoat-ocr.sh: Given a PDF, outputs an OCRed text file.
# Usage: bash eoat-ocr.sh filename.pdf 3-letter-language-code
# Decide whether this is the system-wide install or a local checkout.
# Sets:
#   eoat_type - "system" when /usr/bin/$scriptname is a regular file, else "local"
#   myexec    - the command name shown in the usage message
# scriptname used to be read without ever being assigned, so the check
# degenerated to testing "/usr/bin/" (a directory) and "local" was always
# chosen. A value already present in the environment is still honoured.
# NOTE(review): the default "eoat-ocr" is inferred from the myexec value used
# for the system case - confirm it matches the installed command name.
scriptname=${scriptname:-eoat-ocr}
if [ -f "/usr/bin/$scriptname" ]; then
  eoat_type="system"
  myexec="eoat-ocr"
else
  eoat_type="local"
  myexec=$0
fi
# Exactly two arguments are required: the PDF file and the three-letter
# source language code. Otherwise print the usage line and exit with status 3.
if [ "$#" -ne 2 ]; then
  # The command name is passed as a printf argument, not spliced into the
  # format string, so a '%' or backslash in the script path is printed as-is.
  printf '\nUsage: %s PDF_FILE_NAME THREE_LETTER_SOURCE_LANGUAGE_CODE\n\n' "$myexec"
  exit 3
fi
# Capture the arguments and prepare the per-page working directory.
# Sets:
#   filename   - path of the PDF to OCR (first argument)
#   lang       - language code handed to tesseract (second argument)
#   trunc_file - filename with a trailing ".pdf" removed; used both as the
#                working directory and as the stem of the final .txt file
#   pages      - page count reported by pdfinfo
filename=$1
lang=$2

# Stop early instead of letting every later step fail on a missing input.
if [ ! -f "$filename" ]; then
  echo "Cannot find PDF file: $filename" >&2
  exit 1
fi

# Literal suffix strip: no word splitting, and "." is not treated as a
# regular-expression wildcard the way it was in the awk field separator.
trunc_file=${filename%.pdf}

# Match only the "Pages:" line so other metadata containing the word
# "Pages" (a title, for instance) cannot end up in the count.
pages=$(pdfinfo "$filename" | awk '/^Pages:/ {print $2}')
if [ -z "$pages" ]; then
  echo "Could not determine the page count of $filename (is pdfinfo installed?)" >&2
  exit 1
fi
echo "Found $pages pages to process in $filename."

# -p keeps a re-run from failing when the directory already exists.
mkdir -p -- "$trunc_file" || exit 1
# Separate
# Write one PDF per page into the working directory. pdfseparate replaces %d
# with the page number. Only the last path component of trunc_file is used for
# the page file names, so an input such as "docs/book.pdf" produces
# "docs/book/book-1.pdf" rather than pointing into a "docs/book/docs/"
# directory that was never created.
echo "Separating PDF into individual files."
if ! pdfseparate -f 1 -l "$pages" "$filename" "$trunc_file/${trunc_file##*/}-%d.pdf"; then
  echo "pdfseparate failed for $filename" >&2
  exit 1
fi
#convert pdf to tif
# For every single-page PDF in the working directory: render it to a TIFF
# (300 dpi, 8-bit, white background, alpha off), OCR the TIFF with tesseract
# in language $lang (tesseract writes <stem>.txt next to the page), then
# delete the TIFF.
echo "Converting PDF files to Tiffs for OCR"
# Glob only the *.pdf pages instead of parsing `ls`, so names containing
# whitespace survive and .txt files left by an earlier run are not fed to
# convert.
for pdf_page in "$trunc_file"/*.pdf; do
  # An unmatched glob stays literal; skip it.
  [ -e "$pdf_page" ] || continue
  echo "Found $pdf_page as PDF file"
  page_stem=${pdf_page%.pdf}
  echo "Converting $pdf_page to tiff, using $page_stem.tiff as output file"
  if ! convert -density 300 "$pdf_page" -depth 8 -background white -alpha Off "$page_stem.tiff"; then
    echo "Conversion of $pdf_page failed; skipping OCR for this page" >&2
    continue
  fi
  echo "OCRing $page_stem.tiff"
  tesseract -l "$lang" "$page_stem.tiff" "$page_stem"
  # page_stem already carries the directory prefix, so it is not repeated here.
  echo "Deleting $page_stem.tiff"
  rm -f -- "$page_stem.tiff"
done
#recombine file
# Append every *.txt file found directly inside a directory to one output
# file, in version-sort order (so page 2 comes before page 10).
# Arguments:
#   $1 - directory holding the per-page text files
#   $2 - output file; it is appended to, not truncated
# Returns: non-zero when the directory holds no *.txt file.
# NOTE(review): because the output is appended to, running the script twice on
# the same PDF duplicates the text - confirm whether that is wanted.
eoat_recombine_pages() {
  local pages_dir=$1
  local combined=$2

  # Reuse the function's own positional parameters as the file list.
  set -- "$pages_dir"/*.txt
  # An unmatched glob stays literal, so test that the first entry exists.
  if [ ! -e "$1" ]; then
    echo "No text files found in $pages_dir/" >&2
    return 1
  fi

  # NUL-delimited all the way through: names with spaces stay intact.
  printf '%s\0' "$@" | sort -zV | xargs -0 cat -- >> "$combined"
}

echo "Recombining text files to $trunc_file.txt"
eoat_recombine_pages "$trunc_file" "$trunc_file.txt"
# Clean up the recombined OCR text in place (GNU sed is required for -i, -z
# and the \oNNN / \xHH escapes).
# Arguments:
#   $1 - path of the text file to rewrite
# The path is quoted everywhere, so names containing whitespace work.
eoat_postprocess_text() {
  local ocr_text=$1

  # Fix linebreaks
  # Slurp the whole file, then turn every "character + newline" into
  # "character + space" so wrapped lines are joined; a newline with no
  # character left to pair with stays. The second expression then doubles a
  # remaining newline.
  # NOTE(review): the second expression has no "g" flag, so only the first
  # remaining newline is doubled - confirm whether that is intended.
  echo "Fixing linebreaks in $ocr_text"
  sed -i -e ':a;N;$!ba;s/\(.\)\n/\1 /g' -e 's/\n/\n\n/' -- "$ocr_text"

  # Remove hyphenation
  # Deletes every "- " pair, which is what a hyphenated line break looks like
  # once the lines have been joined above.
  echo "Removing hyphenation from $ocr_text"
  sed -i -e 's/\-\ //g' -- "$ocr_text"

  # Remove page numbers and weird linefeeds
  # \o14 is the form feed character. First drop everything from a leading
  # digit up to a form feed on the same line (presumably a page number sitting
  # at a page boundary), then drop any form feeds that are left.
  echo "Removing page numbers and line feeds from $ocr_text"
  sed -i -e 's/^[0-9].*\o14//g' -- "$ocr_text"
  sed -i -e 's/\o14//g' -- "$ocr_text"

  # Change dumbquotes to smartquotes
  # -z treats the file as one record so a quote pair may span lines.
  # \x27 is the apostrophe: '' becomes ", then paired '...' and "..." become
  # curly quotes.
  echo "Changing quotes to smartquotes for translate-shell. Unterminated quotes cause issues with line-by-line translation sent via shell; once translated, you may want to change back"
  sed -i -z -E -e 's/\x27\x27/"/g; s/\x27([^\x27]*)\x27/‘\1’/g; s/"([^"]*)"/“\1”/g; ' -- "$ocr_text"

  # Find unmatched quotes and backticks
  # Whatever is still a backtick or a straight double quote had no partner.
  sed -i -e 's/`/‘/g' -- "$ocr_text"
  sed -i -e 's/"/”/g' -- "$ocr_text"
}

eoat_postprocess_text "$trunc_file.txt"
echo "OCR process complete. Full OCRed text is available at $trunc_file.txt. Individual PDF, TIFF and text files are located in the $trunc_file/ directory"