Skip to content

Commit

Permalink
updated pdfprocessor to handle encrypted pdfs
Browse files Browse the repository at this point in the history
stats.json changed
tests updated
readme updated
  • Loading branch information
anjesh committed Jun 19, 2015
1 parent 69653ec commit 625184f
Show file tree
Hide file tree
Showing 10 changed files with 97 additions and 35 deletions.
13 changes: 12 additions & 1 deletion PdfProcessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ class PDFProcessor:
def __init__(self, filePath, outputDir):
self.filePath = filePath
self.outputDir = outputDir
self.isEncrypted = False
self.textContentSize = 0
self.totalPages = 0
self.process()
Expand All @@ -28,6 +29,10 @@ def process(self):
self.totalPages = pdfInfo.getPages()
self.fileSize = pdfInfo.getFileSizeInBytes()
self.logger.info('Total Pages: %d, File Size: %d bytes', self.totalPages, self.fileSize)
self.isEncrypted = pdfInfo.isEncrypted()
if self.isEncrypted:
self.writeStats()
raise Exception('Pdf is encrypted. Can\'t do processing.')
self.separatePdfPages()

def processToCheckStructured(self):
Expand All @@ -47,8 +52,14 @@ def isStructured(self):
"""
return True if self.textContentSize > (self.totalPages*500) else False

def getStatus(self):
    """Return the status label that is written to ``stats.json``.

    Returns:
        "Encrypted" when ``self.isEncrypted`` is truthy; otherwise
        "Structured" or "Scanned" depending on ``self.isStructured()``.
    """
    # Encryption takes precedence: the structured/scanned distinction is
    # only consulted for pdfs that are not flagged as encrypted.
    if self.isEncrypted:
        return "Encrypted"
    return "Structured" if self.isStructured() else "Scanned"

def writeStats(self):
stats = {"pages": self.totalPages, "structured": self.isStructured()}
stats = {"pages": self.totalPages, "status": self.getStatus()}
with open(os.path.join(self.outputDir,'stats.json'),'w') as outfile:
json.dump(stats, outfile)
self.logger.info('Writing %s to %s', json.dumps(stats), 'stats.json')
Expand Down
3 changes: 3 additions & 0 deletions pdftools/PdfInfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ def process(self):
if label in line:
self.info[label] = self.extract(line)

def isEncrypted(self):
    """Tell whether the parsed info marks the pdf as encrypted.

    Reads ``self.info['Encrypted']`` and treats any value that does not
    start with "no" as encrypted.

    Returns:
        bool: False when the value starts with "no", True otherwise.

    Raises:
        KeyError: if no 'Encrypted' entry has been stored in ``self.info``.
    """
    # Only the prefix is compared, so values with trailing details after
    # the yes/no word are still classified correctly.
    return not self.info['Encrypted'].startswith("no")

def extract(self, row):
    """Return the value part of a ``label: value`` row.

    Args:
        row: one text line of the form ``label: value``.

    Returns:
        The text after the first ':' with surrounding whitespace removed,
        or an empty string when the row contains no ':' at all.
    """
    # partition() splits on the first ':' only (colons inside the value are
    # kept) and yields '' instead of raising when the separator is missing.
    return row.partition(':')[2].strip()

Expand Down
22 changes: 10 additions & 12 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,18 @@ Make sure that `pdftotext`, `pdfinfo` and `pdfseparate` are installed in your co
### How it works

* Reads the pdf file
* Uses `pdfinfo` to get the total pages in the pdf and size
* Uses `pdftotext` to dump the text and compares the size of the extract text content. If the text content size is 500 bytes in average for each page, then it is structured otherwise scanned one.
* If the pdf is structured, then it uses `pdftotext` to extract the text content page-wise and puts the txt files in the `text` folder.
* If the pdf is non-structured i.e. scanned, then it uses Abbyy OCR service to extract the text content `TODO`
* Creates `stats.json` file with the following content (structured = false if scanned)

* Uses `pdfinfo` to get the total pages in the pdf and size and whether it's encrypted
* If encrypted, i.e. "password protected", it writes `stats.json` with `{ "status":"Encrypted", .. }`, throws an Exception, and exits from the script.
* If not encrypted
* Uses `pdftotext` to dump the text and checks the size of the extracted text content. If the text content size averages more than 500 bytes per page, the pdf is structured; otherwise it is a scanned one.
* Uses `pdfseparate` to extract each pdf page and saves in the `pages` folder.
* If the pdf is structured, then it uses `pdftotext` to extract the text content page-wise and puts the txt files in the `text` folder.
* If the pdf is non-structured i.e. scanned, then it uses Abbyy OCR service to extract the text content `TODO`
* Creates `stats.json` file with the following content (status = [Scanned|Structured|Encrypted])
```json
{ "structured": true, "pages": 5 }
{ "status": "Structured", "pages": 5 }
```

* Uses `pdfseparate` to extract each pdf page and saves in the `pages` folder.


### Test

Execute `bash runtest.sh` to run all above tests at once.
Expand All @@ -36,8 +35,7 @@ Execute `bash runtest.sh` to run all above tests at once.

### TODO

* log the events
* handle exceptions
* handle more exceptions



Expand Down
11 changes: 10 additions & 1 deletion run.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@
import ConfigParser
import ProcessLogger
import traceback
from urllib2 import HTTPError, URLError

parser = argparse.ArgumentParser(description='Processes the pdf and extracts the text')
parser.add_argument('-i','--infile', help='File path of the input pdf file.', required=True)
parser.add_argument('-o','--outdir', help='File name of the output csv file.', required=True)
results = parser.parse_args()

pdfProcessor = ""
try:
logger = ProcessLogger.getLogger('run')
logger.info("Processing started at %s ", str(datetime.now()))
Expand All @@ -27,10 +29,17 @@
else:
pdfProcessor.extractTextFromScannedDoc()
pdfProcessor.writeStats()
except URLError as e:
logger.error("URLError: %s", e.reason);
logger.debug(traceback.format_exception(*sys.exc_info()))
except HTTPError as e:
logger.error("HTTPError: [%s] %s", e.code, e.reason);
logger.debug(traceback.format_exception(*sys.exc_info()))
except OSError as e:
logger.error("OSError: %s [%s] in %s", e.strerror, e.errno, e.filename);
logger.debug(traceback.format_exception(*sys.exc_info()))
except Exception as e:
logger.error("Exception: %s ", e);
logger.debug(traceback.format_exception(*sys.exc_info()))
logger.info("Processing ended at %s ", str(datetime.now()));
finally:
logger.info("Processing ended at %s ", str(datetime.now()));
43 changes: 32 additions & 11 deletions tests/AbbyyPdfTextExtractorTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import ConfigParser
from pdftools.PdfSeparate import *
from abbyy.AbbyyPdfTextExtractor import *
from urllib2 import HTTPError

class AbbyyPdfTextExtractorTest(unittest.TestCase):
def setUp(self):
Expand All @@ -29,22 +30,42 @@ def createOrCleanDir(self, directory):


def testScannedPdfPage(self):
pdfSeparate = PdfSeparate('tests/sample-scanned.pdf', self.indir)
pdfSeparate = PdfSeparate('tests/sample-scanned-1.pdf', self.indir)
pdfSeparate.extractPages()
self.assertTrue(os.path.isfile(os.path.join(self.indir,"1.pdf")))

abbyyPdf = AbbyyPdfTextExtractor(self.indir, self.outdir, 5, "english")
abbyyPdf.setApplicationCredentials(self.configParser.get('abbyy','appid'), self.configParser.get('abbyy','password'))
abbyyPdf.processPdfPage(1);
self.assertTrue(os.path.isfile(os.path.join(self.outdir,"1.txt")))
try:
abbyyPdf = AbbyyPdfTextExtractor(self.indir, self.outdir, 1, "english")
abbyyPdf.setApplicationCredentials(self.configParser.get('abbyy','appid'), self.configParser.get('abbyy','password'))
abbyyPdf.processPdfPage(1);
self.assertTrue(os.path.isfile(os.path.join(self.outdir,"1.txt")))
except Exception:
pass

def testScannedPdfPageForUnauthorisec(self):
    """Processing a page with invalid Abbyy credentials must fail with HTTP 401."""
    pdfSeparate = PdfSeparate('tests/sample-scanned-1.pdf', self.indir)
    pdfSeparate.extractPages()
    self.assertTrue(os.path.isfile(os.path.join(self.indir, "1.pdf")))

    abbyyPdf = AbbyyPdfTextExtractor(self.indir, self.outdir, 1, "english")
    abbyyPdf.setApplicationCredentials('nouser', 'nopassword')
    # assertRaises fails the test when no HTTPError is raised at all; a bare
    # try/except would let that case pass without checking anything.
    with self.assertRaises(HTTPError) as ctx:
        abbyyPdf.processPdfPage(1)
    self.assertEqual(ctx.exception.code, 401)
    self.assertEqual(ctx.exception.reason, "Unauthorized")


def testScannedPdfPages(self):
pdfSeparate = PdfSeparate('tests/sample-scanned-1.pdf', self.indir)
pdfSeparate = PdfSeparate('tests/sample-scanned.pdf', self.indir)
pdfSeparate.extractPages()
self.assertTrue(os.path.isfile(os.path.join(self.indir,"1.pdf")))
self.assertTrue(os.path.isfile(os.path.join(self.indir,"2.pdf")))

abbyyPdf = AbbyyPdfTextExtractor(self.indir, self.outdir, 2, "english")
abbyyPdf.setApplicationCredentials(self.configParser.get('abbyy','appid'), self.configParser.get('abbyy','password'))
abbyyPdf.extractPages();
self.assertTrue(os.path.isfile(os.path.join(self.outdir,"1.txt")))
self.assertTrue(os.path.isfile(os.path.join(self.outdir,"2.txt")))
try:
abbyyPdf = AbbyyPdfTextExtractor(self.indir, self.outdir, 2, "english")
abbyyPdf.setApplicationCredentials(self.configParser.get('abbyy','appid'), self.configParser.get('abbyy','password'))
abbyyPdf.extractPages();
self.assertTrue(os.path.isfile(os.path.join(self.outdir,"1.txt")))
self.assertTrue(os.path.isfile(os.path.join(self.outdir,"2.txt")))
except Exception:
pass
8 changes: 8 additions & 0 deletions tests/PdfInfoTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,12 @@ def testPdfPages(self):
pdfInfo.process()
self.assertEqual(pdfInfo.getPages(), 5)
self.assertEqual(pdfInfo.getFileSizeInBytes(), 81691)
self.assertEqual(pdfInfo.isEncrypted(), False)

def testEncryptedPdfPages(self):
pdfInfo = PdfInfo('tests/sample-scanned-encrypted.pdf')
pdfInfo.process()
self.assertEqual(pdfInfo.getPages(), 69)
self.assertEqual(pdfInfo.getFileSizeInBytes(), 1891519)
self.assertEqual(pdfInfo.isEncrypted(), True)

32 changes: 22 additions & 10 deletions tests/PdfProcessorTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,25 @@ def testScannedPdfStats(self):
pdfProcessor.writeStats()
with open(os.path.join(self.outdir,"stats.json")) as json_file:
json_data = json.load(json_file)
self.assertFalse(json_data['structured'])
self.assertEqual(json_data['pages'], 5)
self.assertEqual(json_data['status'], "Scanned")
self.assertEqual(json_data['pages'], 2)

def testEncryptedScannedPdfStats(self):
    """An encrypted pdf must raise on construction and still leave stats.json behind."""
    # assertRaises fails the test when the constructor does not raise; a bare
    # try/except would let that case pass without checking the message.
    with self.assertRaises(Exception) as ctx:
        PDFProcessor('tests/sample-scanned-encrypted.pdf', self.outdir)
    self.assertEqual("Pdf is encrypted. Can't do processing.", str(ctx.exception))

    # The stats file is expected to be written before the exception is raised.
    with open(os.path.join(self.outdir, "stats.json")) as json_file:
        json_data = json.load(json_file)
    self.assertEqual(json_data['status'], "Encrypted")
    self.assertEqual(json_data['pages'], 69)

def testStructuredPdfStats(self):
pdfProcessor = PDFProcessor('tests/sample.pdf', self.outdir)
pdfProcessor.writeStats()
with open(os.path.join(self.outdir,"stats.json")) as json_file:
json_data = json.load(json_file)
self.assertTrue(json_data['structured'])
self.assertEqual(json_data['status'], "Structured")
self.assertEqual(json_data['pages'], 5)

def testStructuredPdfExtractPages(self):
Expand All @@ -67,10 +77,12 @@ def testSeparatePdfPages(self):
self.assertTrue(os.path.isfile(os.path.join(self.outdir,"pages","5.pdf")))

def testScannedPdfExtractPages(self):
pdfProcessor = PDFProcessor('tests/sample-scanned-1.pdf', self.outdir)
pdfProcessor.setConfigParser(self.configParser)
self.assertFalse(pdfProcessor.isStructured())
pdfProcessor.extractTextFromScannedDoc()
self.assertTrue(os.path.isdir(os.path.join(self.outdir,"text")))
self.assertTrue(os.path.isfile(os.path.join(self.outdir,"text","1.txt")))
self.assertTrue(os.path.isfile(os.path.join(self.outdir,"text","2.txt")))
try:
pdfProcessor = PDFProcessor('tests/sample-scanned-1.pdf', self.outdir)
pdfProcessor.setConfigParser(self.configParser)
self.assertFalse(pdfProcessor.isStructured())
pdfProcessor.extractTextFromScannedDoc()
self.assertTrue(os.path.isdir(os.path.join(self.outdir,"text")))
self.assertTrue(os.path.isfile(os.path.join(self.outdir,"text","1.txt")))
except Exception:
pass
Binary file modified tests/sample-scanned-1.pdf
Binary file not shown.
Binary file added tests/sample-scanned-encrypted.pdf
Binary file not shown.
Binary file modified tests/sample-scanned.pdf
Binary file not shown.

0 comments on commit 625184f

Please sign in to comment.