# Additional info: https://stackoverflow.com/questions/8049520/web-scraping-javascript-page-with-python
from bs4 import BeautifulSoup as bs
import requests
import argparse
from tqdm import tqdm
import json
from random import randint
from re import sub
from re import search
import lxml      # noqa: F401 -- not referenced directly; ensures the 'lxml' parser used below is installed
import cchardet  # noqa: F401 -- not referenced directly; BeautifulSoup picks it up for faster encoding detection
parser = argparse.ArgumentParser()
parser.add_argument("--verbose", "-v", help="Enable verbose logging to console",
                    action='store_true', required=False, default=False)
parser.add_argument("--proxy", "-p",
                    help="Use a random proxy for each scraping call. Much slower, but it "
                         "obfuscates your IP address. Also forgoes HTTPS, since not all "
                         "proxies support it.",
                    action='store_true', required=False, default=False)
args = parser.parse_args()
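# Example invocations:
#   python scraper.py                # scrape directly
#   python scraper.py -v -p          # verbose logging, routed through random proxies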
baseURL = "https://emojipedia.org/"
endURLS = [
    "people/",
    "nature/",
    "food-drink/",
    "activity/",
    "travel-places/",
    "objects/",
    "symbols/",
    "flags/"
]
urls = ["{}{}".format(baseURL, endURL) for endURL in endURLS]
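# e.g. urls == ["https://emojipedia.org/people/",
#               "https://emojipedia.org/nature/", ...]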
def priceFromListItem(item):
    # Strip every non-digit character from the item's contents and parse what is
    # left as an integer. (Currently unused by the functions below.)
    return int(sub(r"\D", "", str(item.contents)))
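# Minimal sketch of what priceFromListItem() expects (hypothetical markup):
#   item = bs("<li>$1,234</li>", "html.parser").li
#   priceFromListItem(item)  # -> 1234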
def getProxyIPs():
    # Scrape a fresh list of free proxies, as "ip:port" strings, from us-proxy.org.
    r = requests.get("https://www.us-proxy.org/")
    soup = bs(r.content, 'html.parser')
    table = soup.find('table', attrs={"id": "proxylisttable"})
    proxies = []
    for row in table.findAll('tr'):
        cells = row.findAll('td')
        if len(cells) < 2:
            continue  # the header row holds <th> cells, not <td>, so skip it
        proxies.append(u"{}:{}".format(cells[0].contents[0], cells[1].contents[0]))
    proxies.pop()  # the table's last row is a footer, not a real proxy
    return proxies
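# Each returned entry is a bare "ip:port" string, which is the form the
# `proxies` mappings below pass to requests, e.g. {"http": "12.34.56.78:8080"}
# (example address).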
def fetchUnicodeEmojis(withSynonyms=False):
    if withSynonyms:
        url = "https://unicode.org/emoji/charts/emoji-list.html"  # emoji with synonyms
    else:
        url = "https://unicode.org/emoji/charts/full-emoji-list.html"  # emoji without synonyms
    allEmojis = []
    if args.proxy:
        proxies = getProxyIPs()
        i = randint(0, len(proxies) - 1)
        r = requests.get(url, proxies={
            "http": proxies[i]
        })
    else:
        r = requests.get(url)
    if r.status_code == 404:
        print("Invalid page, ending!")
        return
    soup = bs(r.content, 'lxml')
    emojiRows = soup.findAll('tr')
    headerTitle = ""
    print("Found {} rows".format(len(emojiRows)))
    for row in tqdm(emojiRows):
        cols = row.findAll('td')
        numCols = len(cols)
        if numCols == 0:
            # Section-header rows carry a single (colspan) <th> instead of <td>
            # cells; remember that title so each emoji can be tagged with it.
            ths = row.findAll('th')
            if len(ths) == 1 and ths[0].get_text(strip=True):
                headerTitle = ths[0].get_text(strip=True)
        elif numCols > 1:
            if withSynonyms:
                img = cols[2].find('img', alt=True)
                emoji = img['alt']
                name = cols[3].contents[0]
                tags = [s.strip() for s in cols[-1].contents[0].split("|")]
            else:
                emoji = cols[2].contents[0]
                name = cols[-1].contents[0]
                tags = name.split(" ")
            entry = {
                'name': name,
                'emoji': emoji,
                'tags': tags,
                'category': headerTitle,
            }
            allEmojis.append(entry)
    with open('AllEmojis.txt', 'w') as outfile:
        json.dump(allEmojis, outfile)
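# The dump is a JSON array; a single entry looks roughly like this
# (illustrative values only):
#   {"name": "grinning face", "emoji": "😀",
#    "tags": ["face", "grin"], "category": "Smileys & Emotion"}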
def fetchEmojis():
    allEmojis = []
    # One shared session keeps connections to Emojipedia pooled across categories.
    requests_session = requests.Session()
    for url in tqdm(urls):
        if args.proxy:
            proxies = getProxyIPs()
            i = randint(0, len(proxies) - 1)
            r = requests_session.get(url, proxies={
                "http": proxies[i]
            })
        else:
            r = requests_session.get(url)
        if r.status_code == 404:
            print("Invalid page, ending!")
            break
        soup = bs(r.content, 'html.parser')
        emojiList = soup.find('ul', attrs={"class": "emoji-list"})
        # The URL's last path segment (e.g. "people") doubles as the category name.
        # search() is used rather than match(), which would anchor at the start of
        # the URL and never reach the final segment.
        categoryMatch = search(r"[^/]+(?=/$|$)", url)
        category = categoryMatch.group(0) if categoryMatch else ""
        for row in emojiList.findAll('li'):
            span, name = row.findChild('a').contents
            entry = {
                'name': name,
                'emoji': span.contents[0],
                'category': category,
            }
            allEmojis.append(entry)
    if args.verbose:
        print(allEmojis)
    with open('AllEmojis.txt', 'w') as outfile:
        json.dump(allEmojis, outfile)
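# Note: fetchEmojis() and fetchUnicodeEmojis() both write AllEmojis.txt, so the
# most recently run function's dump is what ends up on disk.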
if __name__ == "__main__":
    fetchUnicodeEmojis(withSynonyms=True)
    # fetchEmojis()
    # with open('AllEmojis.txt', 'r+') as outfile:
    #     data = json.load(outfile)
    #     print(data)