#!/usr/bin/env python3
import copy
import json
import os
import re
from bs4 import BeautifulSoup
import yaml

# location of 1.htm, 2.htm, etc.
PAGES_DIRECTORY = 'qposts.online/page'

# when False, trim stray whitespace from links in posts+refs; see explanation in clean_up_raw_text()
KEEP_ORIGINAL_WHITESPACE = False


def extract_metadata_block(meta_block):
    """
    Extracts author + tripcode, source site + board, and link if applicable.
    Returns an object of what it finds.
    """
    collated_metadata = {}

    # extract the span with the name+tripcode in it
    author_container = meta_block.find('span', 'name')

    # extract the bold/strong text -- i.e. the main name
    author = author_container.find('strong').getText()
    assert len(author) > 0, 'Author name not found!!'
    collated_metadata['author'] = author

    # remove the main name, leaving only the tripcode if applicable (and strip l/r whitespace)
    author_container.find('strong').decompose()
    maybe_tripcode = author_container.getText().strip()
    if maybe_tripcode:
        collated_metadata['tripcode'] = maybe_tripcode

    # extract source board + site block
    source_container = meta_block.find('span', 'source')

    # extract the bold/strong text -- i.e. the board name
    board = source_container.find('strong').getText()
    assert len(board) > 0, 'Board name not found!!'
    collated_metadata['source'] = {}
    collated_metadata['source']['board'] = board

    # remove the board name, leaving only the site (and maybe link if applicable)
    source_container.find('strong').decompose()

    # get thread link if we have it
    maybe_thread_link = source_container.find('a')
    if maybe_thread_link:
        collated_metadata['source']['link'] = maybe_thread_link['href']
        maybe_thread_link.decompose()

    # we've extracted board name and link if we have it; all that's left is the site
    site = source_container.getText().strip()
    assert site, 'Site not found!!'
    collated_metadata['source']['site'] = site

    # attach timestamp
    collated_metadata['time'] = int(meta_block.find('span', 'time').getText())

    # attach id
    collated_metadata['id'] = int(meta_block.find('span', 'num').getText())

    return collated_metadata
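
# For orientation, a rough sketch of the dict extract_metadata_block() builds (the values below
# are illustrative, not pulled from a real post; 'tripcode' and 'source.link' only appear when
# the post actually has them):
# {
#     'author': 'Q',
#     'tripcode': '!!example',
#     'source': {'board': 'qresearch', 'site': '8kun', 'link': 'https://.../thread'},
#     'time': 1514764800,    # unix timestamp from the 'time' span
#     'id': 1,               # post number from the 'num' span
# }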


def extract_images(post_block):
    """
    Extracts image filename + uploaded image name for all images in a post/reference.
    Returns a list of objects containing filename + uploaded name
    """
    images_container = post_block.find('div', 'images', recursive=False)
    if not images_container:
        return None

    # well laid out figs + figcaptions make life easy for images + image names
    images = images_container.findAll('figure', recursive=False)
    return [{
        'file': os.path.split(image.find('a')['href'])[1],  # filename on disk
        'name': image.find('figcaption').getText()  # filename as posted
    } for image in images]
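
# extract_images() therefore yields entries shaped like this (the filenames here are made up):
# [{'file': 'a1b2c3d4e5.jpg', 'name': 'original_upload_name.jpg'}]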


def extract_body(post_block):
    """
    Extracts the main body text as plaintext less any referenced divs, images, html tags, etc.
    Returns a string; newlines indicated by literal \n.

    During body extraction, I decompose a number of elements (including divs, which contain post
    references) which basically vaporizes them. Since we need the (post) references later to extract
    and python is pass by reference*, we need to duplicate the object.

    * if you pull an https://xkcd.com/386/ and say something like "ackchyually in python, object
    references are passed by value..." I will find you and smack you
    """
    post_block_copy = copy.copy(post_block)

    # just attempt to find the main text content; some main posts have a div for this, some
    # don't, and no references have it so try/catch
    try:
        content_div = post_block_copy.find('div', 'text')
        if content_div:
            post_block_copy = content_div
    except AttributeError:
        pass

    # this is random div noise (unlikely) or a referenced post (almost always); regardless, we don't
    # want it/them
    divs = post_block_copy.findAll('div')
    for div in divs:
        div.decompose()

    # bs4 thinks these tags need a separator when rendering with get_text(); who knows why...
    # Unwrapping them seems to solve it. If any other tags that need to be unwrapped pop up, throw
    # them in tags_to_unwrap
    tags_to_unwrap = ['abbr', 'em']
    for tag_to_unwrap in tags_to_unwrap:
        instances_to_unwrap = post_block_copy.findAll(tag_to_unwrap)
        for instance_to_unwrap in instances_to_unwrap:
            instance_to_unwrap.unwrap()

    # Get your pitchforks ready. I don't know why bs4 behaves this way but for some reason it's
    # throwing separators where there shouldn't be any after unwrapping the abbrs, but extracting
    # and reparsing seems to fix it. I hate it; I don't understand it; it works; it stays.
    post_block_copy_duplicate = BeautifulSoup(str(post_block_copy), 'html.parser')
    raw_post_text = post_block_copy_duplicate.get_text(separator="\n")

    return clean_up_raw_text(raw_post_text)


def extract_references(post_block):
    """
    Extracts the referenced posts from the main post block and returns a list of posts, which always
    contains the text that referred to it in the original post (e.g. >>123456) and can contain image
    objects + text objects.
    Returns a list of post objects.
    """
    refs = post_block.findAll('div', 'op')
    if not refs:
        return None

    collated_refs = []
    for ref in refs:
        collated_ref = {}

        # the referring text is always the immediately previous sibling of the reference
        collated_ref['reference'] = ref.previous_sibling.getText()

        # extract reference text if we have it
        maybe_text = extract_body(ref)
        if maybe_text:
            collated_ref['text'] = clean_up_raw_text(maybe_text)

        # extract the reference's image if we have any
        maybe_images = extract_images(ref)
        if maybe_images:
            collated_ref['images'] = maybe_images

        collated_refs.append(collated_ref)

    return collated_refs
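
# Each collated reference comes out looking roughly like this ('text' and 'images' are optional,
# and the values shown are illustrative):
# {'reference': '>>123456', 'text': '...', 'images': [{'file': '...', 'name': '...'}]}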


def clean_up_emails(post_block):
    """
    This is a dumb way to handle this but the post site uses a server-side email protection script (I
    guess for anti-spam) and it's a little overzealous (note this does not show up in the original
    Q posts; these are an artifact introduced by the current host I'm scraping from). Thankfully,
    usage is minimal so I just wrote a function to slot them in from the known list. If
    significantly more posts are added that trip the protection system or it changes (or the
    timestamps are changed but I assume those to be immutable) this will need additional TLC.
    """
    if post_block['post_metadata']['time'] == 1526767434:
        post_block['post_metadata']['author'] = 'NowC@mesTHEP@in—-23!!!'

    # Q sure liked this link; three separate posts using it
    if post_block['post_metadata']['time'] in [1588693786, 1585242439, 1553795409]:
        post_block['text'] = post_block['text'].replace('email\xa0protected]',
                                                        'https://uscode.house.gov/view.xhtml?path=/prelim@title'
                                                        '18/part1/chapter115&edition=prelim')

    return post_block


def clean_up_raw_text(text):
    """
    This corrects some minor oddities in spacing/link text. These show up in the original posts
    (as far as I can tell) so removing them technically changes the content of original or
    referenced posts. If this is an issue, set KEEP_ORIGINAL_WHITESPACE to True and this will be
    short-circuited.
    """
    if KEEP_ORIGINAL_WHITESPACE:
        return text

    # eliminate spaces after http://
    http_whitespace_regex = re.compile(r"http:\/\/\s+")
    text = http_whitespace_regex.sub('http://', text)

    # eliminate spaces after https://
    https_whitespace_regex = re.compile(r"https:\/\/\s+")
    text = https_whitespace_regex.sub('https://', text)

    # tuples of find/replace for known bad URLs
    misc_spaced_url_corrections = [
        ('twitter. com', 'twitter.com'),
        ('theguardian. com', 'theguardian.com'),
    ]
    for search, replacement in misc_spaced_url_corrections:
        text = text.replace(search, replacement)

    return text
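
# As an illustration (made-up input), with KEEP_ORIGINAL_WHITESPACE left False:
# clean_up_raw_text('https:// twitter. com/example') -> 'https://twitter.com/example'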


collected_posts = []

# loop through all html files in the directory to be scanned
entry_count = len(os.listdir(PAGES_DIRECTORY))
current_entry = 1
for entry in os.scandir(PAGES_DIRECTORY):
    print(f"Processing entry {current_entry} of {entry_count}")
    current_entry += 1

    # # helpful for debugging -- skip all files but this one
    # if entry.name != '1.html':
    #     continue

    # parse the page html
    with open(entry.path) as page_file:
        soup = BeautifulSoup(page_file, 'html.parser')

    # extract all posts
    posts = soup.findAll('div', {'class': 'post', 'data-timestamp': True})
    for post in posts:
        collated_post = {}

        # yank metadata
        meta_container = post.find('div', 'meta')
        collated_post['post_metadata'] = extract_metadata_block(meta_container)

        # # helpful for debugging -- append src file to metadata
        # collated_post['post_metadata']['filename'] = entry.name

        # # helpful for debugging -- skip all posts but this ID
        # # requires scrape_metadata to be appended above
        # if collated_post['post_metadata']['id'] != 4939:
        #     continue

        # break out main meat of the post for easier manipulation
        post_body = post.find('div', 'message')

        # yank images
        extracted_images = extract_images(post_body)
        if extracted_images:
            collated_post['images'] = extracted_images

        # yank main post text
        extracted_body = extract_body(post_body)
        if extracted_body:
            collated_post['text'] = extracted_body

        # yank referenced posts
        referenced_posts = extract_references(post_body)
        if referenced_posts:
            collated_post['referenced_posts'] = referenced_posts

        # clean up emails -- see func comment; this is maximum clowntown
        collated_post = clean_up_emails(collated_post)

        # attach to big list
        collected_posts.append(collated_post)

# sort by date asc
collected_posts.sort(key=lambda post: post['post_metadata']['time'])

# pretty print and dump it
# if you're desperate, removing indent=2 shaves a half meg off
keyed_list = {"posts": collected_posts}

print("Dumping YAML")
with open('posts.yml', 'w') as outfile:
    yaml.dump(keyed_list, outfile, allow_unicode=True)

print("Dumping JSON")
with open('posts.json', 'w') as outfile:
    json.dump(keyed_list, outfile, indent=2, ensure_ascii=False)
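
# Both dumps share one top-level shape: {"posts": [...]}, with each entry carrying 'post_metadata'
# plus whichever of 'text', 'images', and 'referenced_posts' the post actually had.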