-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathChatGPTSuggestedSolution.py
62 lines (44 loc) · 1.79 KB
/
ChatGPTSuggestedSolution.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# -*- coding: utf-8 -*-
"""
Created on Thu May 18 17:16:37 2023
@author: Armando Anzellini
"""
import PyPDF2
import re
import pytesseract
from pdf2image import convert_from_bytes, convert_from_path
from io import BytesIO
# Path to the Tesseract executable that pytesseract should invoke.
pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'

# Folder containing the source PDFs and the single document processed here.
pdf_dir = 'D:\\Users\\Armando\\OneDrive\\Documents\\AuthorPapers (in progress)\\Forensic Assumptions\\Calibration-PDFs\\'
pdf_path = pdf_dir + 'Trinkaus_1984.pdf'

# The PDF is opened only to learn its page count; the text itself is obtained
# below by OCR of rendered page images, so the handle can be closed right away.
with open(pdf_path, 'rb') as file:
    # PdfFileReader / numPages were removed in PyPDF2 3.0.0 (they raise a
    # DeprecationError there). Prefer the current API and fall back to the
    # legacy names only on old installs that do not provide PdfReader.
    if hasattr(PyPDF2, 'PdfReader'):
        pdf_reader = PyPDF2.PdfReader(file)
        num_pages = len(pdf_reader.pages)
    else:
        pdf_reader = PyPDF2.PdfFileReader(file)
        num_pages = pdf_reader.numPages

# Accumulates the text fragments recovered from every page, in page order.
lines = []

# Iterate through each page of the PDF
for page_num in range(num_pages):
    # Render only the current page; first_page/last_page receive page_num + 1.
    images = convert_from_path(
        pdf_path,
        dpi=300,
        first_page=page_num + 1,
        last_page=page_num + 1,
    )

    # Perform OCR on the single rendered image
    ocr_text = pytesseract.image_to_string(images[0])

    # Split on "capital letter ... first period" spans. Because the pattern has
    # a capture group, re.split returns the matched spans as well as the text
    # between them. '.' does not match a newline here, so a match never spans
    # more than one OCR line; multi-line text ends up in the in-between pieces.
    ocr_lines = re.split(r'([A-Z].*?\.)', ocr_text)

    # Drop fragments made up solely of numeric characters (intended to remove
    # table/graph residue). Empty fragments are dropped too, because all() is
    # vacuously true for an empty string.
    filtered_lines = [line for line in ocr_lines if not all(char.isnumeric() for char in line)]

    # Add the filtered fragments to the main list
    lines += filtered_lines
# Remove repeated fragments, keeping the first occurrence of each and the
# original ordering. `seen` records every fragment already emitted.
seen = set()
unique_lines = []
for fragment in lines:
    if fragment in seen:
        continue
    seen.add(fragment)
    unique_lines.append(fragment)

# Clean each surviving fragment in one pass:
#   1. delete a hyphen followed by a line break (re-joins a word split there),
#   2. turn every remaining newline into a space,
#   3. trim leading/trailing whitespace.
lines = [
    re.sub(r'\n', ' ', re.sub(r'-\n', '', fragment)).strip()
    for fragment in unique_lines
]
# Reassemble fragments into sentences: fragments are accumulated until one
# ends with a period, at which point the accumulated text is emitted.
combined_lines = []
current_sentence = ""

for line in lines:
    line = line.strip()  # Remove leading/trailing whitespace
    if not line:
        # Blank fragments carry no text; skipping them avoids stray separators.
        continue

    # Join with a single space: every fragment has been stripped, so plain
    # concatenation would fuse the last word of one fragment with the first
    # word of the next.
    if current_sentence:
        current_sentence = current_sentence + ' ' + line
    else:
        current_sentence = line

    if line.endswith('.'):
        combined_lines.append(current_sentence)
        current_sentence = ""

# Keep any trailing text that never reached a closing period instead of
# silently discarding it.
if current_sentence:
    combined_lines.append(current_sentence)
    current_sentence = ""