"""Scrape company names from Wikipedia list pages and save them as Python list files."""
import io
import json
import re

import requests
from lxml import html

from stop_words import stop_words
# The three lists below are parallel: entry i of each one describes the same
# page (where to fetch it, what to call its output file, how to extract names).

# Pages to scrape.
urls = [
    "https://en.m.wikipedia.org/wiki/List_of_S%26P_500_companies",
    "https://en.m.wikipedia.org/wiki/Dow_Jones_Industrial_Average",
    "https://en.m.wikipedia.org/wiki/Nikkei_225",
    "https://en.m.wikipedia.org/wiki/List_of_largest_Internet_companies",
    "https://en.wikipedia.org/wiki/List_of_most_popular_websites",
]

# Base name of the output file generated for each page.
fileName = [
    "sp500",
    "dow",
    "nikei",
    "internet",
    "websites",
]

# XPath expression that selects the company-name text nodes on each page.
urlXpath = [
    '//table[@id="constituents"]/tbody/tr/td[position()=1]/a/text()',
    '//table[@id="constituents"]/tbody/tr/td[position()=1]/a/text()',
    '//div[contains(@class, "mf-section-3")]/div/ul/li/a[1]/text()',
    '//div[contains(@class, "mf-section-2")]/table/tbody/tr/td[2]/a/text()',
    '//div[@id="mw-content-text"]/div/table/tbody/tr/td[1]/a/text()',
]
def getCompanies(timeout=30):
    """Scrape every configured page and return all company names found.

    For each (url, xpath, file name) triple taken from the module-level
    ``urls``, ``urlXpath`` and ``fileName`` lists, the page is downloaded,
    the XPath is evaluated against it, and the matches are written out twice
    through ``compileOutput``: once with stop words removed and once without.

    Args:
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list with the matches from all pages concatenated in page order.
        Duplicates across pages are not removed here.

    Raises:
        requests.RequestException: If a page cannot be fetched (connection
            failure, timeout, or an HTTP error status).
    """
    print("Getting companies")
    allCompanies = []
    for url, xpath, baseName in zip(urls, urlXpath, fileName):
        # Without a timeout requests waits forever on an unresponsive server.
        page = requests.get(url, timeout=timeout)
        # Fail before writing anything: parsing an error page would yield no
        # matches and overwrite the existing list file with an empty list.
        page.raise_for_status()
        tree = html.fromstring(page.content)
        companies = tree.xpath(xpath)
        allCompanies += companies
        compileOutput(companies, baseName, 'true')
        compileOutput(companies, baseName, 'false')
    return allCompanies
def compileOutput(companies, fileName, removeSw):
    """Render company names as a Python list assignment and write it to disk.

    The generated text has the form::

        companies = [
        "first name",
        "second name"
        ]

    Args:
        companies: Sequence of company-name strings.
        fileName: Base name of the output file; ``'_no_stop_words'`` is
            appended when stop words are removed.
        removeSw: The string ``'true'`` to run each name through
            ``removeStopWords``; any other value only lower-cases the names.
            Note that this is a string flag, not a boolean.
    """
    print("Compiling output")
    stripStopWords = removeSw == 'true'

    entries = []
    for name in companies:
        cleaned = removeStopWords(name) if stripStopWords else name.lower()
        # json.dumps emits a double-quoted literal and escapes quotes,
        # backslashes and control characters, so an unusual name cannot break
        # the generated file. ensure_ascii=False keeps non-ASCII text as-is.
        entries.append(json.dumps(cleaned, ensure_ascii=False))

    # Each entry goes on its own line; a comma follows all but the last one.
    output = 'companies = [' + ','.join('\n' + entry for entry in entries) + '\n]'

    if stripStopWords:
        fileName += '_no_stop_words'
    writeCompaniesToFile(output, fileName)
def removeStopWords(word):
    """Return ``word`` lower-cased with every stop word removed.

    Args:
        word: The company name to clean.

    Returns:
        The cleaned name, with whitespace runs collapsed to single spaces and
        no leading or trailing whitespace.
    """
    word = word.lower()
    for stop_word in stop_words:
        # First alternative: the stop word as a whole word. Second: a '.' with
        # no word character on either side, e.g. the dot left dangling once
        # the abbreviation before it has been removed.
        # NOTE(review): stop words are interpolated unescaped, so any regex
        # metacharacters they contain are interpreted as regex syntax.
        word = re.sub(r"\b%s\b|\B\.\B" % stop_word, '', word)
    # Removing a word from the middle of a name leaves two adjacent spaces;
    # split/join collapses those runs and trims both ends in one step.
    return ' '.join(word.split())
def writeCompaniesToFile(output, fileName):
    """Write ``output`` to ``<fileName>_list.py`` as UTF-8 text.

    An existing file with that name is overwritten.

    Args:
        output: The full text to write.
        fileName: Path prefix of the target file, without the ``_list.py``
            suffix.

    Returns:
        None.
    """
    print("Writing companies to file")
    tmpFileName = fileName + '_list.py'
    # The with-block closes the handle even when write() raises.
    with open(tmpFileName, 'w', encoding='utf-8') as outFile:
        outFile.write(output)
def main():
    """Scrape all pages, then write the combined, de-duplicated company list.

    The combined list is written twice through ``compileOutput`` under the
    base name ``'companies'``: once lower-cased only and once with stop words
    removed.
    """
    # A bare set iterates in an order that changes between runs because string
    # hashing is randomised; sorting keeps the generated files reproducible.
    companies = sorted(set(getCompanies()))
    compileOutput(companies, 'companies', 'false')
    compileOutput(companies, 'companies', 'true')


if __name__ == '__main__':
    main()