"""Crawler.py: a simple recursive web crawler.

Assumptions:
    Web pages are served with Content-Type text/html
    Links are contained in anchor, frame and iframe tags
"""
#System Libraries
import sys
import gc
import signal
import logging
from datetime import datetime
#3rd Party Libraries
import requests
from BeautifulSoup import BeautifulSoup
#Standard Library Parsing Helpers
from optparse import OptionParser
from urlparse import urlparse, urljoin
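# Note: this script targets Python 2 (print statements, urlparse, optparse) and the
# legacy BeautifulSoup 3 package, imported simply as BeautifulSoup.
# GetPage returns the page text on success, or a negative code on failure:
#   -1 connection error, -2 timeout, -3 HTTP error, -4 other request exception,
#   -5 non-HTML content type, -6 unexpected exception (details go to info.log).
# Any other integer is the HTTP status code of a non-200 response.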
def GetPage(url):
#Getting Requested Page
try:
REQ = requests.get(url, timeout = 2)
if REQ.status_code==200:
contentType = REQ.headers['Content-Type']
if contentType.startswith('text/html'):
return REQ.text
else:
return -5
elif type(1)==type(REQ.status_code):
return REQ.status_code
else:
logging.error('Date - %s Page Issue - %s',datetime.now(),REQ.status_code)
return -6
except requests.ConnectionError:
return -1
except requests.Timeout:
return -2
except requests.HTTPError:
return -3
except requests.RequestException:
return -4
except Exception as e:
logging.error('Date - %s Exception - %s',datetime.now(),e)
return -6
return 0
def GetTag(html, tag='a'):
    #Getting Content Residing Under Given Tags
    try:
        parsedHtml = BeautifulSoup(html)
        VALUES = parsedHtml.findAll(tag)
        return VALUES
    except Exception as e:
        logging.error('Date - %s Exception - %s\nContent Type - %s\nContent - %s', datetime.now(), e, type(html), html)
        return None
def GetProperLinks(tag,currentUrl,option='href'):
#Extracting Links From Tags
LINKS = []
for link in tag:
href = link.get(option)
if CheckLinkProtocol(href)==1:
LINKS.append(CheckUrl(href,currentUrl))
return LINKS
def GetAllLinks(html,url):
#Merging Links List Fetched From Anchor,IFrame,Frame Tags
tagLinkSource = [['a','href'],['iframe','src'],['frame','src']]
LINKS = []
for source in tagLinkSource:
LinkTags = GetTag(html,source[0])
if LinkTags != None:
hyperlinks = GetProperLinks(LinkTags,url,source[1])
LINKS += hyperlinks
return LINKS
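# LinkCall is applied only when the -e option is set to 1; it keeps the links that
# share the input URL as a prefix and drops everything else as external.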
def LinkCall(internalUrl,hrefs):
#Removing External Links
#By Base Path - Input URL
LINKS = []
for internalLinks in hrefs:
if internalLinks.startswith(internalUrl):
LINKS.append(internalLinks)
return LINKS
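# RequestIssue translates GetPage's negative failure codes into display strings;
# a plain integer falls through and is shown as the raw HTTP status code.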
def RequestIssue(issue):
#Return Issue Type
if issue == -1:
return 'Network Issue'
elif issue == -2:
return 'TimeOut'
elif issue == -3:
return 'Bad/Invalid HTTP Response'
elif issue == -4:
return 'Something Weird'
elif issue == -5:
return 'Bad Content Type'
elif issue == -6:
return 'Unknown Issue - Check Log'
elif type(1)==type(issue):
return issue
return 'Fetched'
def CheckLinkProtocol(link):
    #Checking For UnCrawlable Protocols
    #Skip bare fragments and links whose scheme cannot be crawled over HTTP
    if link is not None:
        if link.startswith('#'):
            return 0
        protocols = ['ftp', 'file', 'gopher', 'hdl', 'imap', 'mailto', 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu', 'sftp', 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet', 'wais']
        for protocol in protocols:
            #Match the scheme exactly (protocol plus ':') so relative paths such as
            #'files/page.html' are not rejected for merely starting with 'file'
            if link.startswith(protocol + ':'):
                return 0
        return 1
    return 0
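# Worked example (illustrative URLs): with baseurl 'http://example.com/' a relative
# link 'about/team.html' resolves to 'http://example.com/about/team.html' and a
# protocol-relative link '//cdn.example.com/x.html' becomes 'http://cdn.example.com/x.html'.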
def CheckUrl(URL, baseurl):
    #Checking URL
    #Converting Relative URL to Absolute URL
    urlCheck = urlparse(URL)
    if not urlCheck.scheme.startswith('http'):
        #urljoin resolves relative and protocol-relative (//host/...) links against
        #the base URL instead of naive string concatenation
        URL = urljoin(baseurl, URL)
    return URL
def View(repo, maxcount):
    #Displaying Crawled URLS
    count = 1
    print 'SRNO.\tURL\t\t\t\tSTATUS'
    for elem in repo:
        if maxcount >= count:
            print "%d.\t%s\t\t\t\t%s" % (count, elem[0], RequestIssue(elem[1]))
            count += 1
        else:
            break
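# Crawle drives the crawl: StartCrawling fetches a page, DoPage extracts its links,
# and every link not already in the repo is crawled recursively until the -c limit
# is reached, at which point CleanUp prints the results and exits. Because the
# traversal is recursive, very large sites can run into Python's recursion limit.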
class Crawle:
    def __init__(self, values):
        #Setting Options Values
        self.url = values.url
        self.count = values.count
        self.ext = values.ext
        self.repo = []
    def CheckList(self, item):
        #Checking Current Crawled URL's Against Repo To Avoid Duplicates
        for value in self.repo:
            if value[0] == item:
                return 0
        return 1
    def GettingResponse(self, URL):
        #Trying To Get URL Content
        RESPONSE = GetPage(URL)
        if RESPONSE == -2:
            #Retry once on timeout
            RESPONSE = GetPage(URL)
        self.AddToRepo(URL, RESPONSE)
        return RESPONSE
    def AddToRepo(self, URL, response):
        #Adding Crawled URL To Repo
        value = [URL, response]
        self.repo.append(value)
    def StartCrawling(self, nextUrl=None):
        #Fetch a page and, if it returned HTML rather than an error code, scan it
        if nextUrl is None:
            nextUrl = self.url
        response = self.GettingResponse(nextUrl)
        if type(response) != type(1):
            self.DoPage(response, nextUrl)
    def Stop(self):
        #Decides When To Stop Crawling
        if self.count == -1:
            return 1
        elif len(self.repo) <= self.count - 1:
            return 1
        else:
            self.CleanUp()
    def DoPage(self, html, currentUrl):
        #Scan Page For Links
        #Start Crawling The Link
        if self.Stop() == 1:
            links = GetAllLinks(html, self.url)
            if self.ext == 1:
                links = LinkCall(self.url, links)
            for link in links:
                if self.CheckList(link) == 1:
                    self.StartCrawling(link)
    def CleanUp(self, signum=None, frame=None):
        #Stops The Script
        #When no limit was set (-c -1) show everything crawled so far
        if self.count == -1:
            View(self.repo, len(self.repo))
        else:
            View(self.repo, self.count)
        sys.exit()
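# Example invocation (values are illustrative):
#   python Crawler.py -u example.com -c 25 -e 1
# -u gives the start URL, -c the maximum number of URLs to crawl (-1 for no limit),
# and -e 1 restricts the crawl to internal links (0 also follows external links).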
if __name__ == '__main__':
    parser = OptionParser(usage=__doc__, version="Crawle")
    parser.add_option('-u', action='store', dest='url', type='string', default='', help='URL to start crawling from')
    parser.add_option('-c', action='store', dest='count', type='int', default=-1, help='maximum number of URLs to crawl (-1 for no limit)')
    parser.add_option('-e', action='store', dest='ext', type='int', default=0, help='1 crawls internal links only, 0 also follows external links')
    params, args = parser.parse_args(sys.argv)
    #Checking Value
    if params.url != '':
        if params.count >= -1 and params.count != 0 and params.ext >= 0 and params.ext < 2:
            if not params.url.startswith('http'):
                params.url = 'http://' + params.url
            gc.enable()
            CrawleOb = Crawle(params)
            logging.basicConfig(format='%(levelname)s:%(message)s', filename='info.log', level=logging.ERROR)
            #Catching Kill/Stop Script Signals
            signal.signal(signal.SIGINT, CrawleOb.CleanUp)
            signal.signal(signal.SIGTERM, CrawleOb.CleanUp)
            CrawleOb.StartCrawling()
            #Crawl finished before hitting the -c limit; show everything collected
            View(CrawleOb.repo, len(CrawleOb.repo))
        else:
            print 'Please Check Your Input: -c must be -1 or a positive integer and -e must be 0 or 1'
    else:
        print 'No URL Given (use -u to supply a start URL)'