-
Notifications
You must be signed in to change notification settings - Fork 21
/
Copy pathconvert_pdf.py
40 lines (32 loc) · 1.25 KB
/
convert_pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# -*- coding: utf-8 -*-
from io import BytesIO
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.converter import TextConverter, HTMLConverter
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams
def convert_pdf(input_file, format='text', codec='utf-8'):
    """Convert a PDF file to text or HTML.

    Args:
        input_file (str): Path of the PDF file to read.
        format (str): Output format, either ``'text'`` or ``'html'``.
        codec (str): Codec the converter uses to encode its output; the same
            codec is used to decode the result back into a string.

    Returns:
        str: The text or HTML extracted from the PDF file.

    Raises:
        ValueError: If ``format`` is neither ``'text'`` nor ``'html'``.
    """
    # Validate up front so a bad format fails with a clear message instead of
    # an unbound-variable error once the converter is first used.
    if format not in ('text', 'html'):
        raise ValueError(
            "Unsupported format %r: expected 'text' or 'html'" % (format,)
        )
    converter_cls = TextConverter if format == 'text' else HTMLConverter

    manager = PDFResourceManager()
    # The context manager releases the buffer even if conversion fails.
    with BytesIO() as output:
        converter = converter_cls(
            manager, output, codec=codec, laparams=LAParams()
        )
        try:
            with open(input_file, 'rb') as pdf_file:
                interpreter = PDFPageInterpreter(manager, converter)
                pages = PDFPage.get_pages(
                    pdf_file, caching=True, check_extractable=True
                )
                for page in pages:
                    interpreter.process_page(page)
        finally:
            # Close before reading the buffer: the converter may write
            # trailing output on close, and it must be released on errors too.
            converter.close()
        # Decode with the codec the converter encoded with, rather than
        # always assuming UTF-8.
        return output.getvalue().decode(codec)