-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathenqueue_worker.py
89 lines (68 loc) · 2.16 KB
/
enqueue_worker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/python
import sys
import time
import yaml
import rethinkdb as r
import redis
import urlparse
import json
from reppy.cache import RobotsCache
# Shared robots.txt cache consulted by is_polite() and get_delay().
robots = RobotsCache()

# Load the worker configuration. The `with` block closes the file handle as
# soon as parsing is done instead of leaving it open for the process lifetime.
with open("parameters.yml", "r") as parameter_file:
    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary Python objects on older PyYAML releases. If parameters.yml
    # only holds plain mappings/scalars, yaml.safe_load is the safer call.
    parameters = yaml.load(parameter_file)

# RethinkDB connection; run() calls in main() pass this connection explicitly.
rethink = r.connect(
    parameters['rethinkdb_server']['host'],
    parameters['rethinkdb_server']['port'],
).repl()
rethink.use(parameters['rethinkdb_server']['database'])

# Redis holds the "urls:enqueued" list and the per-host scheduling state.
redis_client = redis.Redis(parameters['redis_server']['host'])

# Default used by get_delay() when robots.txt yields no usable delay.
crawl_delay = int(parameters['crawl_delay'])
# User agent string the robots.txt rules are evaluated against.
user_agent = parameters['crawler_headers']['user-agent']
def is_polite(url, user_agent):
    """Return whether robots.txt allows ``user_agent`` to fetch ``url``.

    The decision is delegated to the module-level ``robots`` cache. If the
    lookup raises, the URL is treated as not allowed (fail closed) and the
    error is reported on stderr so a failed lookup can be told apart from a
    genuine disallow rule.

    Args:
        url: URL about to be scheduled for crawling.
        user_agent: User agent string to evaluate the rules for.

    Returns:
        The result of ``robots.allowed(url, user_agent)``, or ``False`` when
        the lookup fails.
    """
    try:
        return robots.allowed(url, user_agent)
    except Exception as exc:
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so the worker can still be stopped while a
        # robots.txt lookup is in progress.
        sys.stderr.write(
            "robots.txt check failed for %r: %r\n" % (url, exc)
        )
        return False
def get_delay(url, user_agent):
    """Return the crawl delay to apply for ``url``.

    Asks the module-level ``robots`` cache for the delay that applies to
    ``user_agent``. When the lookup raises, or yields a falsy value (``None``
    or ``0``), the configured module-level ``crawl_delay`` is used instead.

    Args:
        url: URL about to be scheduled for crawling.
        user_agent: User agent string to evaluate the rules for.

    Returns:
        The delay reported by ``robots.delay`` or the configured default.
        NOTE(review): the unit is whatever ``robots.delay`` reports; confirm
        it matches how main() uses the value.
    """
    try:
        delay = robots.delay(url, user_agent)
    except Exception as exc:
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate; other failures fall back to the default.
        sys.stderr.write(
            "robots.txt delay lookup failed for %r: %r\n" % (url, exc)
        )
        delay = None
    # A missing or zero delay falls back to the configured default.
    return delay or crawl_delay
def _next_slot(host, now_minute, limit):
    """Account for one more URL in a host's schedule and return the state.

    ``host`` is a dict with ``'ts'`` (the minute bucket currently being
    filled) and ``'n'`` (how many URLs have been placed in that bucket).
    The dict is updated in place and returned; after the call ``host['ts']``
    is the bucket assigned to the URL being scheduled.

    Args:
        host: Mutable scheduling state for one host.
        now_minute: Current time expressed in whole minutes since the epoch.
        limit: Maximum number of URLs placed in a single minute bucket.
    """
    if host['ts'] < now_minute:
        # The stored bucket is in the past: restart at the current minute.
        # The URL being scheduled occupies the first place in the bucket, so
        # the count starts at 1 (starting at 0 admitted one URL too many).
        host['ts'] = now_minute
        host['n'] = 1
    elif host['n'] >= limit:
        # Current bucket is full: spill into the following minute.
        host['ts'] = host['ts'] + 1
        host['n'] = 1
    else:
        host['n'] = host['n'] + 1
    return host


def main(argv):
    """Run the enqueue worker loop forever.

    Pops URLs from the Redis list ``urls:enqueued``. URLs that robots.txt
    does not allow are dropped. For every other URL, the per-host state
    stored in Redis under the host name is advanced and the URL is inserted
    into the RethinkDB URL queue table together with its assigned minute
    bucket (``ts``).

    Args:
        argv: Command-line arguments; currently unused.
    """
    print("I'm enqueue worker")
    url_queue_table = parameters['rethinkdb_server']['tables']['url_queue']
    while True:
        url = redis_client.lpop("urls:enqueued")
        if not url:
            # Queue is empty: wait briefly before polling again.
            time.sleep(0.3)
            continue
        if not is_polite(url, user_agent):
            print('Not polite to crawl... back off')
            time.sleep(0.3)
            continue

        delay = get_delay(url, user_agent)
        # Scheduling works on whole-minute buckets.
        now_minute = int(time.time() / 60)
        host_name = urlparse.urlparse(url).netloc

        host_raw = redis_client.get(host_name)
        if host_raw:
            host = json.loads(host_raw)
        else:
            # First URL seen for this host: start an empty bucket now.
            host = {'n': 0, 'ts': now_minute}

        # The delay value is used as the per-bucket URL limit.
        host = _next_slot(host, now_minute, delay)
        redis_client.set(host_name, json.dumps(host))
        r.table(url_queue_table).insert(
            {'url': url, 'ts': host['ts']}
        ).run(rethink)
# Start the worker loop only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main(sys.argv)