-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
44 lines (38 loc) · 1.26 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import codecs
import json
import justext
import requests
import sys
# Module-level HTTP session; get_content() issues every request through it,
# so a single Session object is reused for all URLs in a run.
s = requests.Session()
def get_content(url, timeout=30):
    """Download *url* and return its non-boilerplate text.

    The page is fetched through the module-level session ``s`` and handed to
    jusText with the Spanish stoplist. The text of every paragraph that
    jusText does not classify as boilerplate is appended, each preceded by a
    single space (so a non-empty result starts with a space).

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout an unresponsive host would block the whole run.

    Returns:
        The extracted text, or ``""`` when the response status is not 200,
        nothing but boilerplate was found, or any error occurred.

    This function is deliberately best-effort: every exception is caught,
    printed, and turned into an empty (or partial) result so that one bad URL
    does not abort the caller's loop.
    """
    parts = []
    try:
        r = s.get(url, timeout=timeout)
        # NOTE: the Content-Type header is not checked; any 200 response is
        # passed to jusText.
        if r.status_code == 200:
            paragraphs = justext.justext(r.text, justext.get_stoplist("Spanish"))
            for paragraph in paragraphs:
                if not paragraph.is_boilerplate:
                    parts.append(" " + paragraph.text)
    except Exception as e:
        # Exceptions have no ``.message`` attribute on Python 3; printing the
        # exception itself works on every version and cannot raise from
        # inside the handler.
        print(e)
    content = "".join(parts)
    print(content)
    return content
def main(input_file, output_file):
    """Scrape every URL listed in *input_file* and dump the results as JSON.

    Args:
        input_file: Path to a UTF-8 text file with one URL per line. Lines
            are stripped of surrounding whitespace; blank lines are skipped.
        output_file: Path of the UTF-8 file to (over)write with a JSON array
            of ``{"url": ..., "content": ...}`` objects. URLs for which
            ``get_content`` returned an empty string are omitted.
    """
    contents = []
    # ``with`` guarantees the handle is closed even if scraping raises.
    with codecs.open(input_file, 'r', 'utf-8') as input_url_file:
        for url in input_url_file:
            url = url.strip()
            if not url:
                # An empty line is not a URL; do not issue a request for it.
                continue
            print("doing %s" % url)
            content = get_content(url)
            if content:
                contents.append({'url': url, 'content': content})
    dumped_json = json.dumps(contents)
    # The output is opened only after all scraping is done, and under its own
    # name so the ``output_file`` path parameter is not shadowed by a handle.
    with codecs.open(output_file, 'w', 'utf-8') as output:
        output.write(dumped_json)
if __name__ == "__main__":
    # Command-line entry point: argv[1] is the URL list, argv[2] the JSON
    # output path. Fail with a usage message instead of an IndexError
    # traceback when either is missing; extra arguments are ignored.
    if len(sys.argv) < 3:
        sys.exit("usage: %s <url_file> <output_file>" % sys.argv[0])
    path_to_url_file = sys.argv[1]
    path_to_output_file = sys.argv[2]
    print("reading %s .. outputing to %s" %(path_to_url_file, path_to_output_file) )
    main(path_to_url_file, path_to_output_file)