#! /usr/bin/env python
# -*- coding: utf-8 -*-
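# NOTE: this is a Python 2 script (urllib2 and print statements); run it with a
# Python 2 interpreter and the legacy pymongo/requests packages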
from sys import argv
import urllib2
import json
import time
import requests
import pymongo
# connect to mydb and access the tour collection
from pymongo import MongoClient
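# MongoClient() with no arguments connects to a local mongod on the default port (27017)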
client = MongoClient()
db = client['mydb']
tourcoll = db.tourdom
# crawl travel info from start cities st_c[sa:sb] to end cities ed_c[ea:eb]
# index of start city A in the st_c list
sa = int(argv[1])
# index of start city B in the st_c list (exclusive upper bound)
sb = int(argv[2])
# index of end city A in the ed_c list
ea = int(argv[3])
# index of end city B in the ed_c list (exclusive upper bound)
eb = int(argv[4])
# count of crawled result lists
list_num = 0
st_c = [u'北京',u'上海',u'广州',u'深圳',u'香港',u'长沙',u'宁波',u'沈阳',u'重庆',u'乌鲁木齐',u'石家庄',u'郑州',u'天津',u'昆明',u'厦门',u'太原',u'无锡',u'兰州',u'苏州',u'常州',u'武汉',u'青岛',u'大连',u'南京',u'张家界',u'贵阳',u'杭州',u'桂林',u'温州',u'南宁',u'三亚',u'南昌',u'成都',u'福州',u'哈尔滨',u'西安',u'济南',u'合肥']
ed_c = [u'三亚',u'海口',u'蜈支洲岛',u'阳朔',u'漓江',u'桂林',u'北海',u'涠洲岛',u'广州',u'深圳',u'上海',u'杭州',u'苏州',u'厦门',u'鼓浪屿',u'青岛',u'武夷山',u'张家界',u'凤凰',u'乌镇',u'绍兴',u'黄山',u'普陀山',u'三清山',u'婺源',u'山东',u'泰山',u'安徽',u'江苏',u'浙江',u'江西',u'九寨沟',u'峨眉山',u'成都',u'重庆',u'昆明',u'丽江',u'西双版纳',u'迪庆',u'大理',u'束河古镇',u'林芝',u'拉萨',u'四川',u'云南',u'西藏',u'西安',u'华山',u'延安',u'敦煌莫高窟',u'青海湖',u'喀纳斯',u'乌鲁木齐',u'银川',u'陕西',u'甘肃',u'宁夏',u'新疆',u'青海',u'北京',u'天津',u'呼伦贝尔',u'五台山',u'平遥',u'太原',u'壶口瀑布',u'大同',u'山西',u'云台山',u'洛阳',u'开封',u'衡山',u'河南',u'武当山',u'三峡大坝旅游区',u'宜昌',u'湖北',u'黑龙江',u'吉林',u'辽宁',u'哈尔滨',u'亚布力',u'太阳岛',u'镜泊湖',u'沈阳',u'大连',u'长白山']
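# example invocation (the index values are illustrative):
#   python crawlerQunar.py 0 2 0 3
# crawls travel info from st_c[0:2] (Beijing, Shanghai) to ed_c[0:3] (Sanya, Haikou, Wuzhizhou Island)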
for s in range(sa, sb):
    for e in range(ea, eb):
        print "Fetching travel info between start city %s (number %d in st_c list) and end city %s (number %d in ed_c list)..." % (st_c[s], s, ed_c[e], e)
        # percent-encode the city names for use in the request URLs
        sc = urllib2.quote(st_c[s].encode("utf8"))
        ec = urllib2.quote(ed_c[e].encode("utf8"))
        # construct the http headers
        http_refer = 'http://dujia.qunar.com/pdq/list_' + sc + '_' + ec + '?tm=gn02'
        req_header = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/25.0.1364.172 Safari/537.22',
                      'Accept': 'application/json, text/javascript, */*',
                      'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                      # urllib2 does not decompress gzip responses, so ask for an uncompressed body
                      'Accept-Encoding': 'identity',
                      'Host': 'dujia.qunar.com',
                      'Connection': 'close',
                      'Referer': http_refer,
                      'X-Requested-With': 'XMLHttpRequest'
                      }
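        # the Referer and X-Requested-With fields mimic the site's own AJAX calls;
        # the working assumption is that the list API rejects requests that do not
        # look like XHR from its own pages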
        req_timeout = 10
        # get the total page number; skip the last page, which has fewer than 20 records
        sp_a = 'http://dujia.qunar.com/p/listapi?action=filter&obpop=desc&tm=gn02&dep='
        sp_b = '&query='
        sp = sp_a + sc + sp_b + ec
        req_p = urllib2.Request(sp, None, req_header)
        sp_json = urllib2.urlopen(req_p, None, req_timeout)
        sp_json_c = sp_json.read()
        sp_json_j = json.loads(sp_json_c)
        total_page = sp_json_j['data']['types']['all'] / 20
        print "There are %d pages in total\n" % total_page
        # limit total pages to no more than 10; comment out this limit to get all the data on qunar.com
        page_limit = 10
        if total_page >= page_limit:
            total_page = page_limit
            print "limited to %d pages.\n" % page_limit
        list_num = list_num + total_page
        # fetch total_page pages of records for this city pair
        for page_num in range(total_page):
            se_a = 'http://dujia.qunar.com/p/listapi?action=routeResult&tm=gn02&obpop=desc&toprecommend=0&dep='
            se_b = '&query='
            se_c = '&pageNo='
            page_num_s = str(page_num + 1)
            se = se_a + sc + se_b + ec + se_c + page_num_s
            r = requests.post(se, stream=True)
            r_get = ''
            for line in r.iter_lines():
                if line:
                    r_get = r_get + line
            # save the data into MongoDB, tagged with the start and end city
            r_get_dic = json.loads(r_get)
            r_get_dic['startcity'] = st_c[s]
            r_get_dic['endcity'] = ed_c[e]
            # insert() is the legacy pymongo API (pymongo >= 3.0 uses insert_one())
            tourlistId = tourcoll.insert(r_get_dic)
            if tourlistId:
                print '>'
            else:
                print page_num
            time.sleep(2)
print "%d lists added in mydb.tourdom\n" % list_num