-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathextract_using_poppler.py
163 lines (137 loc) · 5.89 KB
/
extract_using_poppler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# DTE Aerial Photo Collection curation project
# Workflow for extracting JPGs and metadata from PDFs
# Poppler Solution
# Garrett Morton, Sam Sciolla
# SI 699
# Written and tested using Python 3.7.0
# os documentation: https://docs.python.org/3/library/os.html#module-os
# subprocess documentation: https://docs.python.org/3/library/subprocess.html
# This implementation of Poppler requires a Linux operating system, or an equivalent setup.
# Poppler documentation: https://poppler.freedesktop.org/
# PyGObject documentation: https://pygobject.readthedocs.io/en/latest/index.html
# PyGObject and poppler API: https://lazka.github.io/pgi-docs/#Poppler-0.18
# standard modules
import time
import json
import os
import subprocess
# third-party modules
import gi
gi.require_version('Poppler', '0.18')
from gi.repository import Gio, Poppler, cairo
# local modules
import misc_functions
## Functions
# Identify embedded links in Index PDF and collect metadata
def pull_links_from_index(relative_path):
    """Collect embedded link targets and the page size from an Index PDF.

    The PDF is opened with Poppler and only its first page is inspected. For
    every link mapping on that page, the launch action's target file name and
    the link rectangle are recorded.

    Args:
        relative_path: '/'-separated path to the Index PDF.

    Returns:
        A dict with the keys:
            'Index File Name': last '/'-separated component of relative_path.
            'Source Relative Path': relative_path, unchanged.
            'Links': list of {'Photo File Name', 'Link Coordinates'} dicts,
                sorted by 'Photo File Name'.
            'Page Size': two-item list holding the values returned by the
                page's get_size() call.
    """
    index_pdf_file_name = relative_path.split('/')[-1]
    print('// Index: {} //'.format(index_pdf_file_name))

    gio_file_object = Gio.File.new_for_path(relative_path)
    new_pdf_object = Poppler.Document.new_from_gfile(gio_file_object)

    # Only the first page is processed; warn when the document has more.
    # (The page count is read once with get_n_pages(); Poppler documents do
    # not provide a getNumPages() method.)
    page_count = new_pdf_object.get_n_pages()
    if page_count > 1:
        print('?? More than one page: {} ??'.format(page_count))

    pdf_page = new_pdf_object.get_page(0)
    size_result = pdf_page.get_size()
    page_size = [size_result[0], size_result[1]]

    # Only link mappings are read here; annotation mappings are not used.
    # NOTE(review): every link is assumed to carry a launch action with a
    # file name -- confirm no other action types occur in the Index PDFs.
    links = []
    for link_mapping in pdf_page.get_link_mapping():
        rect_obj = link_mapping.area
        links.append({
            'Photo File Name': link_mapping.action.launch.file_name,
            'Link Coordinates': {
                'x1': rect_obj.x1,
                'x2': rect_obj.x2,
                'y1': rect_obj.y1,
                'y2': rect_obj.y2,
            },
        })
    links.sort(key=lambda link: link['Photo File Name'])
    print('** Number of links identified: {} **'.format(len(links)))

    return {
        'Index File Name': index_pdf_file_name,
        'Source Relative Path': relative_path,
        'Links': links,
        'Page Size': page_size,
    }
# Identify JPEG bytestream in Image PDF, write it to a new file, and collect image-level metadata
def extract_jpg_from_pdf(relative_path, output_location=''):
    """Extract the embedded JPEG from an Image PDF and collect its metadata.

    The image dimensions are read through the Poppler Python bindings; the
    image itself is written by the ``pdfimages`` command-line tool (part of
    poppler-utils), invoked with the ``-j`` flag.

    Args:
        relative_path: '/'-separated path to the Image PDF.
        output_location: Prefix (typically a directory ending in '/') that is
            prepended to the image root name passed to ``pdfimages``.

    Returns:
        A dict with the keys 'Image File Name', 'Source Relative Path',
        'Height', 'Width' and 'Created Image File Name'.

    Raises:
        IndexError: If the first page has no image mappings.
    """
    image_pdf_file_name = relative_path.split('/')[-1]
    print('// Image: {} //'.format(image_pdf_file_name))

    # abspath resolves against the current working directory and also copes
    # with a path that is already absolute.
    absolute_path = os.path.abspath(relative_path)
    gio_file_object = Gio.File.new_for_path(absolute_path)
    new_pdf_object = Poppler.Document.new_from_gfile(gio_file_object)

    # Only the first page is processed; warn when the document has more.
    page_count = new_pdf_object.get_n_pages()
    if page_count > 1:
        print('** More than one page: {} **'.format(page_count))
    pdf_page = new_pdf_object.get_page(0)

    # This block is used to enable the collection of the dimensions of the
    # image (height, width). However, this seems to add a couple of seconds
    # to the processing time required.
    image_mappings = pdf_page.get_image_mapping()
    if not image_mappings:
        raise IndexError(
            'No embedded image found on the first page of {}'.format(relative_path)
        )
    if len(image_mappings) > 1:
        print("?? More than one image: {} ??".format(len(image_mappings)))
    image_identifier = image_mappings[0].image_id
    surface_object = pdf_page.get_image(image_identifier)
    image_object = surface_object.map_to_image(None)

    image_pdf_metadata = {
        'Image File Name': image_pdf_file_name,
        'Source Relative Path': relative_path,
        'Height': image_object.get_height(),
        'Width': image_object.get_width(),
    }

    # Strip only a trailing '.pdf' so names containing '.pdf' elsewhere are
    # not mangled.
    if image_pdf_file_name.endswith('.pdf'):
        identifier = image_pdf_file_name[:-len('.pdf')]
    else:
        identifier = image_pdf_file_name
    image_root_name = 'dte_aerial_' + identifier
    new_image_file_name = image_root_name + '.jpg'

    # This uses one of the poppler-utils, a command-line tool called
    # pdfimages; no equivalent was found through the Python interfaces.
    # This particular command-line tool can also output images as TIFFs.
    # NOTE(review): per the pdfimages manual the tool appends a numeric
    # suffix to the image root when naming output files, so the file on disk
    # may not be named exactly 'Created Image File Name' -- confirm.
    completed = subprocess.run(
        ['pdfimages', '-j', relative_path, output_location + image_root_name]
    )
    if completed.returncode != 0:
        print('?? pdfimages exited with status {} for {} ??'.format(
            completed.returncode, image_pdf_file_name))

    image_pdf_metadata['Created Image File Name'] = new_image_file_name
    return image_pdf_metadata
# Manage function invocations and write resulting metadata to a JSON file
def run_poppler_workflow(pdf_file_paths, output_location, output_name):
    """Process a batch of PDFs and write the collected metadata as JSON.

    Paths containing the substring 'Index' are handled by
    pull_links_from_index; every other path is handled by
    extract_jpg_from_pdf. The elapsed wall-clock time is printed at the end.

    Args:
        pdf_file_paths: Iterable of PDF paths to process.
        output_location: Prefix passed to extract_jpg_from_pdf for the
            extracted images.
        output_name: File name of the JSON metadata file, created under
            'output/poppler/'.

    Returns:
        None. The metadata is written to disk.
    """
    poppler_start = time.time()

    image_metadata_dicts = []
    index_metadata_dicts = []
    for pdf_file_path in pdf_file_paths:
        if 'Index' in pdf_file_path:
            index_metadata_dicts.append(pull_links_from_index(pdf_file_path))
        else:
            image_metadata_dicts.append(
                extract_jpg_from_pdf(pdf_file_path, output_location)
            )

    sample_poppler_batch_metadata = {
        'Index Records': index_metadata_dicts,
        'Image Records': image_metadata_dicts,
    }

    # NOTE(review): the metadata directory is hard-coded and does not follow
    # output_location; kept as-is for backward compatibility -- confirm
    # whether it should.
    # The context manager closes the file even if serialisation or the write
    # fails.
    with open('output/poppler/' + output_name, 'w', encoding='utf-8') as poppler_metadata_file:
        poppler_metadata_file.write(
            json.dumps(sample_poppler_batch_metadata, indent=4)
        )

    poppler_end = time.time()
    print('** Time to Run: {} **'.format(str(poppler_end - poppler_start)))
## Main Program
if __name__ == "__main__":
    # Announce the batch run (two banner lines, one per line of output).
    print(
        "\n** DTE Aerial Batch Processing Script **",
        "** Poppler Solution **",
        sep="\n",
    )

    # Gather every PDF under the sample input directory, then process the
    # batch, writing images and the metadata JSON to the Poppler output area.
    pdf_file_paths = misc_functions.collect_relative_paths_for_files(
        'input/part1/macomb/1961'
    )
    output_location = 'output/poppler/'
    run_poppler_workflow(
        pdf_file_paths,
        output_location,
        'sample_poppler_batch_metadata.json',
    )