#!/usr/bin/env python
'''
Script to parse data from the DH Q&A archive and create a dataset of posts.

Install python dependencies:

    pip install beautifulsoup4 feedparser requests

Clone the DH Q&A archive repository:

    https://github.com/achorg/DH-Answers-Archive

Run this script in the top-level directory of that repository.
'''

import csv
import datetime
import glob
import os
import re

from bs4 import BeautifulSoup, Comment
import feedparser
import requests

baseurl = 'http://digitalhumanities.org/answers'


def get_post_info(div, topic_url, feed, page_url=None):
    # take a post container and return a dict of post info;
    # takes a bs4 div, the base url for this topic, a feedparser object,
    # and an optional page url where it differs from the topic url
    # (i.e. posts on page 2 of a topic)
    info = {}
    # page url is different from topic url for posts on page 2
    if page_url is None:
        page_url = topic_url
    # generate permalink from li id since in at least one
    # case the permalink isn't found
    info['url'] = '%s#%s' % (page_url, div['id'])
    # first div id includes order information as position-#
    info['order'] = div.div['id'].split('-')[1]

    threadauthor = div.find('div', class_='threadauthor')
    author_url = threadauthor.a['href']
    # members have local profile urls
    if author_url.startswith('/'):
        author_url = '%s%s' % (baseurl, author_url)
    info['author_url'] = author_url
    info['author'] = threadauthor.find('strong').get_text()

    # question is in the first threadpost
    threadpost = div.find('div', class_='threadpost')
    # remove 'tweet this question' block and related comments
    social = threadpost.find('div', class_='social-it')
    if social:
        social.extract()
    for comment in threadpost.find_all(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    # get post html content
    info['html_content'] = threadpost.div.prettify()
    # extract text from post on html page
    info['content'] = threadpost.div.get_text()
    # check if this is a reply to a specific post
    if threadpost.p and threadpost.p.get_text().startswith('Replying to'):
        # name could be a link, so get the last link in the reply p
        reply_to_post = threadpost.p.find_all('a')[-1]['href']
        info['reply_to'] = '%s%s' % (baseurl, reply_to_post)

    # check if marked as a best answer
    info['is_best_answer'] = bool(div.find('div', class_='best_answer'))

    # post date
    poststuff = div.find('div', class_='poststuff')
    if poststuff:
        relative_post_date = poststuff.text
        # text reads 'Posted x years ago Permalink'; strip the boilerplate
        relative_post_date = relative_post_date.replace('Posted ', '') \
            .replace(' Permalink', '')
        info['relative_date'] = relative_post_date.strip()
    else:
        print('poststuff div not found for %(url)s' % info)

    # find the RSS entry for this post if possible
    if feed:
        entries = [e for e in feed.entries if e.link == info['url']]
        if entries:
            entry = entries[0]
            # convert parsed time struct into isoformat
            info['date'] = datetime.datetime(*entry.published_parsed[:6]) \
                .isoformat()
        else:
            print('ERROR: not in feed %s' % info['url'])

    return info
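
# For orientation, the returned info dict looks roughly like the following
# (values here are illustrative placeholders, not real archive data;
# 'reply_to' is only present when the post replies to a specific post):
# {
#     'url': 'http://digitalhumanities.org/answers/topic/some-topic#post-1234',
#     'order': '1',
#     'author': 'example-user',
#     'author_url': 'http://digitalhumanities.org/answers/members/example-user',
#     'html_content': '<div>...</div>',
#     'content': '...',
#     'is_best_answer': False,
#     'relative_date': '9 years ago',
#     'date': '2011-03-05T14:20:11',
# }
# Topic-level fields (topic_url, question, tags, snapshot_date) are merged in
# later via post_data.update(topic_data).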


def wayback_machine_timestamp(url):
    '''get timestamp for most recent capture of a url from wayback
    machine api'''
    response = requests.get('http://archive.org/wayback/available',
                            params={'url': url})
    if response.status_code == requests.codes.ok:
        data = response.json()
        # if archived snapshots is not empty, return closest timestamp
        if data['archived_snapshots']:
            return data['archived_snapshots']['closest']['timestamp']
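
# The availability API appears to return JSON roughly of this shape (see
# https://archive.org/help/wayback_api.php; the values below are an example,
# not real data for any particular topic):
# {"archived_snapshots": {"closest": {
#     "available": true, "status": "200",
#     "timestamp": "20130919044612",
#     "url": "http://web.archive.org/web/20130919044612/http://example.com/"}}}
# so snapshot_date values written to the CSV are YYYYMMDDhhmmss strings.
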
dhqa_posts = []

post_fieldnames = [
    'url',
    'topic_url',
    'question',
    'tags',
    'author',
    'author_url',
    'html_content',
    'content',
    'date',
    'relative_date',
    'snapshot_date',
    'order',
    'is_best_answer',
    'reply_to',
]

for path in glob.glob('topic/*/index.html'):
    # topic meta should include the url for the topic,
    # but is not completely reliable!
    # generate it from the filename instead
    topic_url = '%s/%s' % (baseurl, os.path.dirname(path))
    capture_date = wayback_machine_timestamp(topic_url)
    topic_data = {
        'topic_url': topic_url,
        'snapshot_date': capture_date or ''
    }

    with open(path) as topicdoc:
        soup = BeautifulSoup(topicdoc, 'html.parser')
        # page title is the question (summary/brief)
        topic_data['question'] = soup.find('h2').get_text()
        tags = soup.find_all('a', rel='tag')
        topic_data['tags'] = ';'.join([t.get_text() for t in tags])
        # should tags apply to all posts or just the question?

        # html doesn't have a proper date but RSS should;
        # get the rss filename from the rss link
        rss = soup.find('a', class_='rss-link')['href'].lstrip('/')
        if os.path.exists(rss):
            feed = feedparser.parse(rss)
            if not feed.entries:
                print('ERROR: RSS file has no content: %s' % rss)
                feed = None
        else:
            print('ERROR: Missing RSS file: %s' % rss)
            feed = None

        posts = soup.find_all('li', id=re.compile(r'^post-\d+'))
        for post in posts:
            post_data = get_post_info(post, topic_url, feed)
            post_data.update(topic_data)
            dhqa_posts.append(post_data)

        # check for a second page (few cases; nothing has more than 2 pages)
        next_link = soup.find('a', class_='next')
        if next_link:
            page_two = '%s/index.html' % next_link['href'].lstrip('/')
            # post permalinks and RSS links are relative to the page
            page_url = '%s%s' % (baseurl, next_link['href'])
            # the page two capture date could be different
            capture_date = wayback_machine_timestamp(page_url)
            topic_data['snapshot_date'] = capture_date or ''
            with open(page_two) as page_two_doc:
                soup2 = BeautifulSoup(page_two_doc, 'html.parser')
                posts = soup2.find_all('li', id=re.compile(r'^post-\d+'))
                for post in posts:
                    post_data = get_post_info(post, topic_url, feed,
                                               page_url=page_url)
                    post_data.update(topic_data)
                    dhqa_posts.append(post_data)

# NOTE: missing 11 topic RSS feeds
# may be able to get date from tag feeds

print('%d posts total' % len(dhqa_posts))

with open('dhqa_data.csv', 'w') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=post_fieldnames)
    writer.writeheader()
    writer.writerows(dhqa_posts)
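
# Optional sanity check of the output, using only the standard library
# (left commented out so the script's behavior is unchanged):
# with open('dhqa_data.csv') as f:
#     rows = list(csv.DictReader(f))
#     print('%d rows; first question: %s' % (len(rows), rows[0]['question']))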