Homophone+Scraper.py
# coding: utf-8
# In[1]:
import sys
import string
import requests
import json
import time
from bs4 import BeautifulSoup
from selenium import webdriver
# In[2]:
def getMaxPageNumber(bsSource):
    # The pagination list keeps track of the page numbers
    pagination = bsSource.find('ul', {"class": "pagination"})
    # No pagination element means the results fit on a single page
    if not pagination:
        return 1
    paginationList = pagination.findAll('li')
    # Take the second-to-last entry because the last one is the "Next" link
    return int(paginationList[-2].text)
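# Sanity-check sketch for the rule above (the markup here is made up, not copied
# from homophone.com): with pagination such as
#   <ul class="pagination"><li>1</li><li>2</li><li>3</li><li>Next</li></ul>
# the last <li> is the "Next" control, so the second-to-last <li> carries the
# highest page number:
#   sample = BeautifulSoup('<ul class="pagination"><li>1</li><li>2</li><li>3</li><li>Next</li></ul>', "lxml")
#   getMaxPageNumber(sample)  # -> 3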
# In[3]:
import nltk  # imported in the notebook export but not used below
# In[4]:
def getAllHomophonesFromLetter(browser, letter):
    homophonesPerLetter = []
    # Every letter has at least one results page, so start at page 1
    pageNumber = 1
    # Keep going until all pages for this letter have been scraped
    while True:
        url = "http://www.homophone.com/search?page=" + str(pageNumber) + "&q=" + str(letter)
        browser.get(url)
        # Parse the rendered page source with lxml
        bs = BeautifulSoup(browser.page_source, "lxml")
        # On the first page, work out how many pages there are in total
        if pageNumber == 1:
            maxPageNumber = getMaxPageNumber(bs)
        # Stop once we have gone past the last page
        if pageNumber == maxPageNumber + 1:
            break
        # Each card holds one group of homophones
        for card in bs.findAll('div', {"class": "card"}):
            cardWords = []
            # Collect every word in the card
            for word in card.findAll('a', {"class": "btn"}):
                sys.stdout.write(word.text + ',')
                cardWords.append(word.text)
            # Only keep non-empty groups
            if len(cardWords) != 0:
                homophonesPerLetter.append(cardWords)
            sys.stdout.write('\n')
        # Move on to the next page
        pageNumber = pageNumber + 1
        time.sleep(1)
    return homophonesPerLetter
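# Illustrative (made-up) shape of the return value for a letter such as 'a':
#   [['ate', 'eight'], ['aisle', 'isle'], ...]
# i.e. one inner list per result card, each holding one group of homophones.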
# Create a browser and query the site once for each letter of the alphabet.
# Returns a flat list of homophone groups collected across all letters.
def getHomophonesFromWebpage():
    homophones = []
    path = r'C:/Users/Aravind/Downloads/chromedriver.exe'
    browser = webdriver.Chrome(executable_path=path)
    # Go through every letter of the alphabet
    for letter in string.ascii_lowercase:
        homophones.extend(getAllHomophonesFromLetter(browser, letter))
        # Delay so we are not hammering their servers
        time.sleep(1)
    browser.quit()
    return homophones
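# Design note: a single chromedriver session is reused for all 26 letters rather
# than launching a fresh browser per letter, which keeps Selenium start-up cost
# to one instance for the whole run.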
# Print out the result in JSON form
def main():
    print(json.dumps(getHomophonesFromWebpage()))
    return 0

# Call main
if __name__ == "__main__":
    sys.exit(main())
# In[ ]:
# In[ ]:
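# Usage sketch (assumes chromedriver is available at the hard-coded path above and
# that a matching Chrome build is installed):
#   python Homophone+Scraper.py > homophones.json
# Note that the per-word progress output and the final JSON dump both go to stdout,
# so you may want to silence the sys.stdout.write calls before redirecting.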