-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSpiderWiki.py
29 lines (27 loc) · 1.08 KB
/
SpiderWiki.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
#coding:utf-8
from Dataoutput import DataOutput
from HtmlDownloader import HtmlDownloader
from HtmlParser import HtmlParser
from URLManager import UrlManager
class SpiderWiki(object):
    """Crawl scheduler tying together URL manager, downloader, parser and output.

    The four collaborators are created from the project-local classes imported
    at the top of this file; their exact behavior is defined in those modules.
    """

    def __init__(self):
        """Create one instance of each collaborator used by ``crawl``."""
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url, max_pages=100):
        """Crawl starting at ``root_url`` and write the collected data out.

        Each iteration takes one URL from the manager, downloads it, parses
        the result into ``(new_urls, data)``, queues ``new_urls`` and stores
        ``data``.  A failure on one page is reported and the loop moves on.
        Once the loop ends, ``self.output.output_csv()`` is called exactly once.

        Args:
            root_url: URL used to seed the manager.
            max_pages: The loop stops once ``self.manager.old_url_size()``
                reaches this value (presumably the number of URLs already
                handed out -- confirm against UrlManager).  Defaults to 100,
                the limit that was previously hard-coded.
        """
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < max_pages):
            # Reset per iteration so the failure message never reports the
            # URL of a previous page when get_new_url() itself raises.
            new_url = None
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                # Progress message ("already crawled N links").  The
                # single-argument parenthesized form prints identically on
                # Python 2 and Python 3.
                print("已经抓取%s个链接" % self.manager.old_url_size())
            except Exception as e:
                # Deliberately best-effort: one bad page must not abort the
                # whole crawl, but say which URL failed and why instead of
                # discarding the exception.
                print("crawl failed: %s (%r)" % (new_url, e))
        self.output.output_csv()
if __name__ == "__main__":
    # Script entry point: seed the crawl with the Wikipedia article on
    # genetic disorders and run it to completion.
    start_url = "https://en.wikipedia.org/wiki/Genetic_disorder"
    spider_wiki = SpiderWiki()
    spider_wiki.crawl(start_url)