HNtop10scraper.py
#!/usr/bin/env python
# Scrapes the top ten links from the Hacker News front page and runs them
# through the Alchemy keyword-extraction API.
import requests
from bs4 import BeautifulSoup
alch_auth = "" #get your own damn key
# Takes in a BeautifulSoup object of the HN front page and returns a list of
# the top ten story links (change count to get more links).
def get_top10(hacker_soup):
    top_ten = []
    count = 0
    for link in hacker_soup.find_all("a"):
        if "http" in link.get("href", ""):
            top_ten.append(link.get("href"))
            count += 1
        if count == 11:
            top_ten.pop(0)  # the first match is the site's own header link, not a story
            return top_ten
    return top_ten  # fewer than ten external links were found
# Takes in a URL as a string and returns a JSON string of the most important
# keywords in the article, via a REST call to the Alchemy API.
def get_keywords(url):
    rank_keywords = "http://access.alchemyapi.com/calls/url/URLGetRankedKeywords?apikey="
    alch_r = requests.get(rank_keywords + alch_auth + "&outputMode=json&maxRetrieve=10&url=" + url)
    return alch_r.text
# Eventually this script will add the data to a MongoDB.
if __name__ == '__main__':
    soup = BeautifulSoup(requests.get("https://news.ycombinator.com/").text, "html.parser")
    data = []
    for link in get_top10(soup):
        data.append(get_keywords(link))
    print(data)
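# A minimal sketch of that planned MongoDB step, not part of the original script.
# It assumes pymongo is installed and a mongod instance is running locally; the
# "hn_keywords" database and "top10" collection names are made up for illustration,
# and the function is not yet wired into the main block above.
def store_keywords(results):
    import json
    from pymongo import MongoClient  # imported here so the scraper still runs without pymongo
    collection = MongoClient("mongodb://localhost:27017/")["hn_keywords"]["top10"]
    for raw in results:
        # Each Alchemy response is a JSON string; decode it before inserting.
        collection.insert_one(json.loads(raw))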