#!/usr/bin/python3.6
""" =================================== ~ Python3.6 Webcrawler, 02.08.2018 | 11:48 ~ =================================== """
""" [main.py]: Imports ================================================================================================================ """
import requests as req
from bs4 import BeautifulSoup
import urllib.parse
import json
from colorama import init, Fore
init()
""" [main.py]: Main Code ============================================================================================================== """
class Set(set):
    def get(self, index):
        if index >= len(self):
            return None
        seti = iter(self)
        for _ in range(index):
            next(seti)
        return next(seti)
def meta_info_tag_descriptor(tag):
    return tag.get('name') in ('description', 'keywords', 'author') or tag.name == 'title'
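# crawl(): walks the URL set; for every reachable page the title/description/
# keywords/author meta text is collected into json_dict and every <a href>
# found on the page is added back into the URL set.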
def crawl(urls, json_dict):
    i = 0
    try:
        while urls.get(i) is not None:
            curl = urls.get(i)
            print(Fore.LIGHTYELLOW_EX, "[%05d/%05d] Current URL: " % (i + 1, len(urls)), curl, Fore.RESET)
            try:
                html_doc = req.get(curl, timeout=10).text
            except Exception:
                print(Fore.LIGHTRED_EX + " [-] '" + curl + "' didn't respond ...")
                i += 1
                continue
            soup = BeautifulSoup(html_doc, "html.parser")
            # Collect the <title> text and the content of the description/
            # keywords/author <meta> tags for this page.
            descr_str = ''
            for descriptor in soup.find_all(meta_info_tag_descriptor):
                if len(descriptor.contents) > 0:
                    if descriptor.contents[0]:
                        descr_str += ' ' + descriptor.contents[0]
                elif descriptor.get('content'):
                    descr_str += ' ' + descriptor.get('content')
            json_dict[curl] = descr_str
            # Queue every link on the page; links without a scheme are resolved
            # against the current URL, absolute links are added as-is.
            for a in soup.find_all('a'):
                url = a.get('href')
                if not url:
                    continue
                if '://' not in url:
                    urls.add(urllib.parse.urljoin(curl, url))
                else:
                    urls.add(url)
            print(Fore.LIGHTGREEN_EX + "\t[+] Found " + str(len(soup.find_all('a'))) + " URLs ... " + Fore.RESET)
            i += 1
    except KeyboardInterrupt:
        pass
    return urls
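# Entry point: seed the crawler, then dump the URL -> meta-text mapping to
# urls.json and the raw URL set to urls.dmp.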
def main():
    urls = Set(['http://links.cncwebsite.com/'])
    json_dict = {}
    crawl(urls, json_dict)
    with open("urls.json", "w") as f:
        json.dump(json_dict, f)
    with open("urls.dmp", "w") as f:
        f.write(str(urls))
""" [main.py]: Not imported =========================================================================================================== """
if __name__ == '__main__':
    main()
""" =================================================================================================================================== """