QuizletScraper.py
import re
import requests
from time import sleep
from bs4 import BeautifulSoup
from termcolor import colored
from random import choice

# Set to True for verbose debug output.
printing = False
def get_valid_url():
    """Prompt for a Quizlet set id or url and return a normalized set url."""
    print("\nWhat is the id number or url of the quizlet course?")
    qset = input()
    url_regex = re.compile(r"^(https://quizlet\.com/)?(?P<id>\d{4,15})(/[\w-]{1,1000})?/?$")
    match = url_regex.search(qset)
    if match:
        if printing: print("match:", match.group(0), "\n\n")
        url = "https://quizlet.com/" + match.group("id")
        if printing: print("returning ", url, "\t in get_valid_url()")
        return url
    else:
        if printing: print("match: ", match)
        print("That is not a valid id or url. Please enter the string of numbers following \"quizlet.com\" in the url")
        sleep(0.3)
        print("For example: \"4071437\"")
        sleep(0.2)
        # Re-prompt until the user supplies a valid id or url, and propagate the result.
        return get_valid_url()
def scrape_data(url):
    """
    Returns a list of dicts, each containing a term and a definition value.
    """
    html = requests.get(url).text
    soup = BeautifulSoup(html, "html.parser")
    # msg_404 = "There is no set with the ID you specified"
    msg_404 = "Page Unavailable"
    if msg_404 in str(soup.body):
        # Check for a 404 / "Page Unavailable" error page.
        print("sorry m8, that isn't a valid quizlet course")
        return []
    else:
        if printing: print("YaY!!! real web page. Scraping data....")
        # Real page! Now extract the terms and definitions.
        data = []  # list of dicts with "term" and "definition" keys
        content = soup.select("div.SetPage-terms")[0]
        pairs = content.select("div.SetPageTerm-content")
        for pair in pairs:
            term = pair.select("span.TermText")[0].get_text()
            defn = pair.select("span.TermText")[1].get_text()
            data.append({
                "term": term,
                "definition": defn
            })
            if printing:
                print(colored("term: ", "blue"), term)
                print(colored("definition: ", "green"), defn)
        if printing: print("\nDONE SCRAPING DATA\n\n")
        return data
def prompt_user(data):
    """Show a random definition and check the user's answer against its term."""
    pair = choice(data)
    if printing: print("pair is ", pair)
    t = pair["term"]
    d = pair["definition"]
    print(d)
    ans = input()
    if ans.lower() == t.lower():
        print(colored("Correct!", "green"))
    else:
        print(colored("Incorrect.", "red"), " The answer was ", t)
def main():
    url = get_valid_url()
    if printing: print("Getting data from\t", url)
    data = scrape_data(url)
    if not data:
        # Nothing to quiz on (invalid or empty set), so bail out.
        return
    run = True
    while run:
        prompt_user(data)
        print("press q to quit or enter to continue")
        if input() == "q": run = False

if __name__ == "__main__":
    main()