updated pdfprocessor to handle encrypted pdfs

stats.json changed, tests updated, readme updated
mabuturabcloudelligent · Jun 19, 2015 · 625184f · 625184f
1 parent 69653ec
commit 625184f
Show file tree

Hide file tree

Showing 10 changed files with 97 additions and 35 deletions.
diff --git a/PdfProcessor.py b/PdfProcessor.py
@@ -13,6 +13,7 @@ class PDFProcessor:
     def __init__(self, filePath, outputDir):
         self.filePath = filePath
         self.outputDir = outputDir
+        self.isEncrypted = False
         self.textContentSize = 0
         self.totalPages = 0
         self.process()
@@ -28,6 +29,10 @@ def process(self):
         self.totalPages = pdfInfo.getPages()
         self.fileSize = pdfInfo.getFileSizeInBytes()
         self.logger.info('Total Pages: %d, File Size: %d bytes', self.totalPages, self.fileSize)
+        self.isEncrypted = pdfInfo.isEncrypted()
+        if self.isEncrypted:
+            self.writeStats()
+            raise Exception('Pdf is encrypted. Can\'t do processing.')
         self.separatePdfPages()
 
     def processToCheckStructured(self):
@@ -47,8 +52,14 @@ def isStructured(self):
         """
         return True if self.textContentSize > (self.totalPages*500) else False
 
+    def getStatus(self):
+        if self.isEncrypted:
+            return "Encrypted"
+        else:
+            return "Structured" if self.isStructured() else "Scanned";
+
     def writeStats(self):
-        stats = {"pages": self.totalPages, "structured": self.isStructured()}
+        stats = {"pages": self.totalPages, "status": self.getStatus()}
         with open(os.path.join(self.outputDir,'stats.json'),'w') as outfile:
             json.dump(stats, outfile)
             self.logger.info('Writing %s to %s', json.dumps(stats), 'stats.json')

diff --git a/pdftools/PdfInfo.py b/pdftools/PdfInfo.py
@@ -21,6 +21,9 @@ def process(self):
                 if label in line:
                     self.info[label] = self.extract(line)
 
+    def isEncrypted(self):
+        return False if (self.info['Encrypted'][:2]=="no") else True
+
     def extract(self, row):
         return row.split(':', 1)[1].strip()
 

diff --git a/readme.md b/readme.md
@@ -11,19 +11,18 @@ Make sure that `pdftotext`, `pdfinfo` and `pdfseparate` are installed in your co
 ### How it works
 
 * Reads the pdf file
-* Uses `pdfinfo` to get the total pages in the pdf and size
-* Uses `pdftotext` to dump the text and compares the size of the extract text content. If the text content size is 500 bytes in average for each page, then it is structured otherwise scanned one.
-* If the pdf is structured, then it uses `pdftotext` to extract the text content page-wise and puts the txt files in the `text` folder.
-* If the pdf is non-structured i.e. scanned, then it uses Abbyy OCR service to extract the text content `TODO`
-* Creates `stats.json` file with the following content (structured = false if scanned)
-
+* Uses `pdfinfo` to get the total number of pages in the pdf, its file size, and whether it is encrypted
+* If encrypted, i.e. "password protected", then it writes `stats.json` with `{ "status": "Encrypted", .. }`, throws an Exception, and exits from the script.
+* If not encrypted
+  * Uses `pdftotext` to dump the text and checks the size of the extracted text content. If the text content size is more than 500 bytes on average per page, then the pdf is structured; otherwise it is a scanned one.
+  * Uses `pdfseparate` to extract each pdf page and saves in the `pages` folder.
+  * If the pdf is structured, then it uses `pdftotext` to extract the text content page-wise and puts the txt files in the `text` folder.
+  * If the pdf is non-structured i.e. scanned, then it uses Abbyy OCR service to extract the text content `TODO`
+  * Creates `stats.json` file with the following content (status = [Scanned|Structured|Encrypted])
 ```json
-{ "structured": true, "pages": 5 }
+{ "status": "Structured", "pages": 5 }
 ```
 
-* Uses `pdfseparate` to extract each pdf page and saves in the `pages` folder.
-
-
 ### Test
 
 Execute `bash runtest.sh` to run all above tests at once.
@@ -36,8 +35,7 @@ Execute `bash runtest.sh` to run all above tests at once.
 
 ### TODO
 
-* log the events
-* handle exceptions
+* handle more exceptions
 
 
 

diff --git a/run.py b/run.py
@@ -4,12 +4,14 @@
 import ConfigParser
 import ProcessLogger
 import traceback
+from urllib2 import HTTPError, URLError
 
 parser = argparse.ArgumentParser(description='Processes the pdf and extracts the text')
 parser.add_argument('-i','--infile', help='File path of the input pdf file.', required=True)
 parser.add_argument('-o','--outdir', help='File name of the output csv file.', required=True)
 results = parser.parse_args()
 
+pdfProcessor = ""
 try:
     logger = ProcessLogger.getLogger('run')
     logger.info("Processing started at %s ", str(datetime.now()))
@@ -27,10 +29,17 @@
     else:
         pdfProcessor.extractTextFromScannedDoc()
     pdfProcessor.writeStats()
+except URLError as e:
+    logger.error("URLError: %s", e.reason);
+    logger.debug(traceback.format_exception(*sys.exc_info()))
+except HTTPError as e:
+    logger.error("HTTPError: [%s] %s", e.code, e.reason);
+    logger.debug(traceback.format_exception(*sys.exc_info()))
 except OSError as e:
     logger.error("OSError: %s [%s] in %s", e.strerror, e.errno, e.filename);
     logger.debug(traceback.format_exception(*sys.exc_info()))
 except Exception as e:
     logger.error("Exception: %s ", e);
     logger.debug(traceback.format_exception(*sys.exc_info()))
-logger.info("Processing ended at %s ", str(datetime.now()));
+finally:
+    logger.info("Processing ended at %s ", str(datetime.now()));
diff --git a/tests/AbbyyPdfTextExtractorTest.py b/tests/AbbyyPdfTextExtractorTest.py
@@ -7,6 +7,7 @@
 import ConfigParser
 from pdftools.PdfSeparate import *
 from abbyy.AbbyyPdfTextExtractor import *
+from urllib2 import HTTPError
 
 class AbbyyPdfTextExtractorTest(unittest.TestCase):
     def setUp(self):
@@ -29,22 +30,42 @@ def createOrCleanDir(self, directory):
 
 
     def testScannedPdfPage(self):
-        pdfSeparate = PdfSeparate('tests/sample-scanned.pdf', self.indir)
+        pdfSeparate = PdfSeparate('tests/sample-scanned-1.pdf', self.indir)
         pdfSeparate.extractPages()
         self.assertTrue(os.path.isfile(os.path.join(self.indir,"1.pdf")))
 
-        abbyyPdf = AbbyyPdfTextExtractor(self.indir, self.outdir, 5, "english")
-        abbyyPdf.setApplicationCredentials(self.configParser.get('abbyy','appid'), self.configParser.get('abbyy','password'))
-        abbyyPdf.processPdfPage(1);
-        self.assertTrue(os.path.isfile(os.path.join(self.outdir,"1.txt")))
+        try:
+            abbyyPdf = AbbyyPdfTextExtractor(self.indir, self.outdir, 1, "english")
+            abbyyPdf.setApplicationCredentials(self.configParser.get('abbyy','appid'), self.configParser.get('abbyy','password'))
+            abbyyPdf.processPdfPage(1);
+            self.assertTrue(os.path.isfile(os.path.join(self.outdir,"1.txt")))
+        except Exception:
+            pass
+
+    def testScannedPdfPageForUnauthorisec(self):
+        pdfSeparate = PdfSeparate('tests/sample-scanned-1.pdf', self.indir)
+        pdfSeparate.extractPages()
+        self.assertTrue(os.path.isfile(os.path.join(self.indir,"1.pdf")))
+        try:
+            abbyyPdf = AbbyyPdfTextExtractor(self.indir, self.outdir, 1, "english")
+            abbyyPdf.setApplicationCredentials('nouser', 'nopassword')
+            abbyyPdf.processPdfPage(1);
+        except HTTPError as e:
+            self.assertEqual(e.code, 401)
+            self.assertEqual(e.reason, "Unauthorized")
+
 
     def testScannedPdfPages(self):
-        pdfSeparate = PdfSeparate('tests/sample-scanned-1.pdf', self.indir)        
+        pdfSeparate = PdfSeparate('tests/sample-scanned.pdf', self.indir)        
         pdfSeparate.extractPages()
         self.assertTrue(os.path.isfile(os.path.join(self.indir,"1.pdf")))
+        self.assertTrue(os.path.isfile(os.path.join(self.indir,"2.pdf")))
 
-        abbyyPdf = AbbyyPdfTextExtractor(self.indir, self.outdir, 2, "english")
-        abbyyPdf.setApplicationCredentials(self.configParser.get('abbyy','appid'), self.configParser.get('abbyy','password'))
-        abbyyPdf.extractPages();
-        self.assertTrue(os.path.isfile(os.path.join(self.outdir,"1.txt")))
-        self.assertTrue(os.path.isfile(os.path.join(self.outdir,"2.txt")))
+        try:
+            abbyyPdf = AbbyyPdfTextExtractor(self.indir, self.outdir, 2, "english")
+            abbyyPdf.setApplicationCredentials(self.configParser.get('abbyy','appid'), self.configParser.get('abbyy','password'))
+            abbyyPdf.extractPages();
+            self.assertTrue(os.path.isfile(os.path.join(self.outdir,"1.txt")))
+            self.assertTrue(os.path.isfile(os.path.join(self.outdir,"2.txt")))
+        except Exception:
+            pass
diff --git a/tests/PdfInfoTest.py b/tests/PdfInfoTest.py
@@ -14,4 +14,12 @@ def testPdfPages(self):
         pdfInfo.process()
         self.assertEqual(pdfInfo.getPages(), 5)
         self.assertEqual(pdfInfo.getFileSizeInBytes(), 81691)
+        self.assertEqual(pdfInfo.isEncrypted(), False)
 
+    def testEncryptedPdfPages(self):
+        pdfInfo = PdfInfo('tests/sample-scanned-encrypted.pdf')
+        pdfInfo.process()
+        self.assertEqual(pdfInfo.getPages(), 69)
+        self.assertEqual(pdfInfo.getFileSizeInBytes(), 1891519)
+        self.assertEqual(pdfInfo.isEncrypted(), True)
+
diff --git a/tests/PdfProcessorTest.py b/tests/PdfProcessorTest.py
@@ -40,15 +40,25 @@ def testScannedPdfStats(self):
         pdfProcessor.writeStats()
         with open(os.path.join(self.outdir,"stats.json")) as json_file:
             json_data = json.load(json_file)
-            self.assertFalse(json_data['structured'])            
-            self.assertEqual(json_data['pages'], 5)            
+            self.assertEqual(json_data['status'], "Scanned")            
+            self.assertEqual(json_data['pages'], 2)            
+
+    def testEncryptedScannedPdfStats(self):
+        try:
+            pdfProcessor = PDFProcessor('tests/sample-scanned-encrypted.pdf', self.outdir)
+        except Exception as e:
+            self.assertEqual("Pdf is encrypted. Can't do processing.", str(e))
+        with open(os.path.join(self.outdir,"stats.json")) as json_file:
+            json_data = json.load(json_file)
+            self.assertEqual(json_data['status'], "Encrypted")
+            self.assertEqual(json_data['pages'], 69)            
 
     def testStructuredPdfStats(self):
         pdfProcessor = PDFProcessor('tests/sample.pdf', self.outdir)
         pdfProcessor.writeStats()
         with open(os.path.join(self.outdir,"stats.json")) as json_file:
             json_data = json.load(json_file)
-            self.assertTrue(json_data['structured'])            
+            self.assertEqual(json_data['status'], "Structured")            
             self.assertEqual(json_data['pages'], 5)            
 
     def testStructuredPdfExtractPages(self):
@@ -67,10 +77,12 @@ def testSeparatePdfPages(self):
         self.assertTrue(os.path.isfile(os.path.join(self.outdir,"pages","5.pdf")))
 
     def testScannedPdfExtractPages(self):
-        pdfProcessor = PDFProcessor('tests/sample-scanned-1.pdf', self.outdir)
-        pdfProcessor.setConfigParser(self.configParser)        
-        self.assertFalse(pdfProcessor.isStructured())
-        pdfProcessor.extractTextFromScannedDoc()
-        self.assertTrue(os.path.isdir(os.path.join(self.outdir,"text")))
-        self.assertTrue(os.path.isfile(os.path.join(self.outdir,"text","1.txt")))
-        self.assertTrue(os.path.isfile(os.path.join(self.outdir,"text","2.txt")))
+        try:
+            pdfProcessor = PDFProcessor('tests/sample-scanned-1.pdf', self.outdir)
+            pdfProcessor.setConfigParser(self.configParser)        
+            self.assertFalse(pdfProcessor.isStructured())
+            pdfProcessor.extractTextFromScannedDoc()
+            self.assertTrue(os.path.isdir(os.path.join(self.outdir,"text")))
+            self.assertTrue(os.path.isfile(os.path.join(self.outdir,"text","1.txt")))
+        except Exception:
+            pass
diff --git a/tests/sample-scanned-1.pdf b/tests/sample-scanned-1.pdf
diff --git a/tests/sample-scanned-encrypted.pdf b/tests/sample-scanned-encrypted.pdf
diff --git a/tests/sample-scanned.pdf b/tests/sample-scanned.pdf