main.py
from bs4 import BeautifulSoup
import requests
import os
import re
import sys
import time
import urllib.parse

# Verbose output is enabled by setting the DEBUG_MODE environment variable
DEBUG_MODE = "DEBUG_MODE" in os.environ
debugprint = print if DEBUG_MODE else lambda *a, **k: None
# Rough email pattern: local part, "@", then one or more dot-separated labels
email_regex = r"[A-Za-z0-9]+[.\-_]?[A-Za-z0-9]+@\w+([.]\w{2,8})+"
# Default/placeholder addresses to filter out go in this list; with the list
# empty, fall back to a never-matching pattern, since searching for "" would
# match every string and discard every email
bad_emails = "|".join([
]) or r"(?!x)x"
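# Example with a hypothetical address: re.search(email_regex, "[email protected]")
# matches, while a bare string like "no-at-sign.org" does not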
# Print the CSV header row
print("Name, Email")
# Spoof a regular browser User-Agent so SU sites serve the full page
headers = requests.utils.default_headers()
headers.update(
    {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'})
# Cookies can be used if sites are behind an auth wall:
# import http.cookiejar
# cookies = http.cookiejar.MozillaCookieJar('cookies.txt')
# cookies.load()
def get_domain(url):
    """Return the scheme and hostname of a URL, e.g. "https://host.tld"."""
    parts = urllib.parse.urlsplit(url)
    return str(parts.scheme) + "://" + str(parts.hostname)
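# e.g. get_domain("https://studentsunionucl.org/clubs-societies") returns
# "https://studentsunionucl.org" (illustrative path on a domain used below)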
def get_urls(root):
    """Return a list of links to all society pages.

    Parameters:
        root (str): Root URL, the society directory homepage.

    Returns:
        urls (list): URLs of all society pages found in the directory.
    """
    urls = []
    # Container classes used by the various SU website platforms
    classes = "|".join(["msl_organisation_list", "view-uclu-societies-directory",
                        "atoz-container", "listsocieties", "block-og-menu"])
    req = requests.get(root, headers=headers)  # , cookies=cookies)
    soup = BeautifulSoup(req.content, 'html.parser')
    main = soup.find(['div', 'ul', 'section'], class_=re.compile(classes))
    for a in main.find_all('a', href=True):
        url = a['href']
        if url.startswith("/"):
            # Relative link: prepend the domain parsed from the root URL
            urls.append(domain + url)
        elif url.startswith("https://society.tedu.edu"):
            urls.append(url)
    # Deduplicate while preserving order
    urls = list(dict.fromkeys(urls))
    return urls
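# e.g. get_urls("https://www.cusu.co.uk/societies") might return a list like
# ["https://www.cusu.co.uk/societies/chess", ...] (illustrative URLs only)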
try:
    # Get the directory URL from the command line arguments
    root = sys.argv[1].strip().strip("\"")
    domain = get_domain(root)
except IndexError:
    # Without a URL nothing below can run, so report and bail out
    print("error in unis.yml file")
    sys.exit(1)
# Handle edge case for UCL's updated, paginated website
if "studentsunionucl" in root:
    urls = []
    for i in range(16):
        urls += get_urls(root + "?page=" + str(i))
        time.sleep(0.3)
    urls = list(dict.fromkeys(urls))
else:
    urls = get_urls(root)
if DEBUG_MODE:
    # Only scrape the first few pages while debugging
    urls = urls[:10]
debugprint(urls)
for url in urls:
    req = requests.get(url, headers=headers)  # , cookies=cookies)
    soup = BeautifulSoup(req.content, 'html.parser')
    try:
        if "cusu.co.uk" in root:
            # Coventry SU name edge case handling
            name = soup.find('h2').find('a').text.strip().lower()
        else:
            # Get the society name from the page title
            name = soup.find('title').text.strip().lower()
        try:
            # Try to find the email address using known link classes
            email = soup.find('a',
                              class_=re.compile("msl_email|socemail")
                              )['href'][7:]
            if "[email protected]" in email:
                # Throw error to leave try block
                raise ValueError("_")
        except (TypeError, KeyError, ValueError):
            # Fall back to scanning the page text for an address
            email = soup.find(string=lambda s:
                              re.search(email_regex, s) and not
                              re.search(bad_emails, s)  # skip default emails
                              )
            debugprint(email)
            # Extract just the address from the surrounding text
            reg = re.compile("(" + email_regex + ")")
            email = str(reg.findall(email)[0][0])
            debugprint(email)
        # Clean up the society name
        name = name.replace("&", " and ")
        name = name.replace(",", "")
        name = name.replace("  ", " ")  # collapse double spaces
        name = name.replace("  ", " ")
        name = re.sub(r" \|.*", '', name)  # drop "| ..." title suffixes
        name = name.strip()
        name = name.title()
        print(name + ", " + email)
    except Exception:  # as e:
        # print(e)
        # Pages without a recoverable name/email are skipped
        pass
    time.sleep(0.1)
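# Usage sketch (assumed invocation; URLs and redirection are illustrative):
#   python main.py "https://www.cusu.co.uk/societies" > societies.csv
#   DEBUG_MODE=1 python main.py "https://studentsunionucl.org/clubs-societies"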