forked from anjesh/pdf-processor
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path AbbyyPdfTextExtractorTest.py
71 lines (60 loc) · 2.72 KB
/
AbbyyPdfTextExtractorTest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/usr/local/bin/python
import unittest
import sys
import os.path
import glob
import ConfigParser
from pdftools.PdfSeparate import *
from abbyy.AbbyyPdfTextExtractor import *
from urllib2 import HTTPError
class AbbyyPdfTextExtractorTest(unittest.TestCase):
    """Integration tests for AbbyyPdfTextExtractor.

    Each test splits a sample scanned PDF into single-page PDFs with
    PdfSeparate, then runs AbbyyPdfTextExtractor over those pages.
    Application credentials are read from the ``[abbyy]`` section of
    ``settings.config``.

    Tests that need real credentials are skipped (rather than silently
    passing) when the credentials are missing or the service call fails
    with an I/O error; once the call succeeds, the output-file assertions
    are real and can fail.
    """

    def setUp(self):
        """Create/empty the working directories and load ``settings.config``."""
        self.outdir = "tests/out/abbyy/text"
        self.indir = "tests/out/abbyy/pdf"
        self.createOrCleanDir(self.outdir)
        self.createOrCleanDir(self.indir)
        self.configParser = ConfigParser.RawConfigParser()
        # RawConfigParser.read() ignores a missing file; the absence of
        # credentials is detected later in _getCredentialsOrSkip().
        self.configParser.read('settings.config')

    def createOrCleanDir(self, directory):
        """Ensure ``directory`` exists and contains no regular files.

        A missing directory is created (including parents). For an existing
        directory, every regular file directly inside it is removed;
        sub-directories are left untouched.
        """
        if not os.path.exists(directory):
            os.makedirs(directory)
            return
        # List the directory's contents: globbing the bare directory path
        # only yields the directory itself, so nothing would be cleaned.
        for name in os.listdir(directory):
            path = os.path.join(directory, name)
            if os.path.isfile(path):
                os.remove(path)

    def _splitSample(self, samplePath, pageCount):
        """Split ``samplePath`` into ``self.indir`` and check each page exists."""
        pdfSeparate = PdfSeparate(samplePath, self.indir)
        pdfSeparate.extractPages()
        for page in range(1, pageCount + 1):
            self.assertTrue(
                os.path.isfile(os.path.join(self.indir, "%d.pdf" % page)))

    def _getCredentialsOrSkip(self):
        """Return ``(appid, password)`` from the config, or skip the test."""
        try:
            return (self.configParser.get('abbyy', 'appid'),
                    self.configParser.get('abbyy', 'password'))
        except ConfigParser.Error as e:
            self.skipTest(
                "ABBYY credentials not available in settings.config: %s" % e)

    def _assertTextFilesExist(self, pageCount):
        """Assert that ``<page>.txt`` exists in ``self.outdir`` for each page."""
        for page in range(1, pageCount + 1):
            self.assertTrue(
                os.path.isfile(os.path.join(self.outdir, "%d.txt" % page)))

    def testScannedPdfPage(self):
        """Processing page 1 of a one-page scan writes ``1.txt``."""
        self._splitSample('tests/sample-scanned-1.pdf', 1)
        appid, password = self._getCredentialsOrSkip()
        abbyyPdf = AbbyyPdfTextExtractor(self.indir, self.outdir, 1, "english")
        abbyyPdf.setApplicationCredentials(appid, password)
        try:
            abbyyPdf.processPdfPage(1)
        except IOError as e:
            # urllib2's URLError/HTTPError derive from IOError: treat an
            # unreachable or rejecting service as "cannot run", not as a pass.
            self.skipTest("ABBYY service call failed: %s" % e)
        # Outside the try block so a missing output file fails the test.
        self._assertTextFilesExist(1)

    def testScannedPdfPageForUnauthorisec(self):
        """Processing with bogus credentials raises HTTP 401 Unauthorized."""
        self._splitSample('tests/sample-scanned-1.pdf', 1)
        abbyyPdf = AbbyyPdfTextExtractor(self.indir, self.outdir, 1, "english")
        abbyyPdf.setApplicationCredentials('nouser', 'nopassword')
        # assertRaises makes the test fail when no HTTPError is raised at
        # all, instead of passing without checking anything.
        with self.assertRaises(HTTPError) as raised:
            abbyyPdf.processPdfPage(1)
        self.assertEqual(raised.exception.code, 401)
        self.assertEqual(raised.exception.reason, "Unauthorized")

    def testScannedPdfPages(self):
        """Extracting a two-page scan writes ``1.txt`` and ``2.txt``."""
        self._splitSample('tests/sample-scanned.pdf', 2)
        appid, password = self._getCredentialsOrSkip()
        abbyyPdf = AbbyyPdfTextExtractor(self.indir, self.outdir, 2, "english")
        abbyyPdf.setApplicationCredentials(appid, password)
        try:
            abbyyPdf.extractPages()
        except IOError as e:
            # Same policy as testScannedPdfPage: skip visibly on I/O failure.
            self.skipTest("ABBYY service call failed: %s" % e)
        # Outside the try block so missing output files fail the test.
        self._assertTextFilesExist(2)