-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfetch_course.py
52 lines (41 loc) · 1.56 KB
/
fetch_course.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/usr/bin/env python
import os
import ssl
import time
from requests_html import HTMLSession
# WARNING: this swaps the standard library's default HTTPS context factory
# for the unverified one, so HTTPS requests made through the stdlib
# (e.g. urllib.request.urlopen below) skip certificate verification for the
# whole process.
# NOTE(review): presumably a workaround for local certificate errors --
# confirm it is still required before keeping it.
ssl._create_default_https_context = ssl._create_unverified_context
# Root URL of the crash course. Not referenced by any code visible in this file.
base_url = 'https://developers.google.com/machine-learning/crash-course/'
def course_info(course_url):
    """Load a course page and find the link to the next course step.

    Args:
        course_url: URL of the course page to fetch.

    Returns:
        A tuple ``(data_video_url, data_captions_url, next_url)``.
        The two media URLs are always empty strings for now because their
        extraction is disabled. ``next_url`` is the ``href`` of the page's
        "next step" link, or ``None`` when the page has no such link
        (for example on the last page of the course).
    """
    # Video/caption extraction is currently disabled. It used to read the
    # 'data-video-url' and 'data-captions-url' attributes of the first
    # '.devsite-vplus' element on the page.
    data_video_url = ''
    data_captions_url = ''

    # The session is a context manager; closing it releases its connections.
    with HTMLSession() as session:
        response = session.get(course_url)
        next_link = response.html.find(
            'div.devsite-steps-next > a.devsite-steps-link', first=True)

    # find(..., first=True) yields None when nothing matches, so report the
    # end of the course with None instead of failing on None.attrs.
    next_url = None
    if next_link is not None:
        next_url = next_link.attrs.get('href')

    # The captions URL goes in the second slot (the original returned the
    # video URL twice).
    return (data_video_url, data_captions_url, next_url)
import urllib.request
def getHtml(url):
    """Download ``url`` and return the raw response body.

    Args:
        url: The URL to open with ``urllib.request.urlopen``.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
    """
    # Close the response deterministically instead of leaking the connection
    # until garbage collection.
    with urllib.request.urlopen(url) as response:
        return response.read()
def saveHtml(file_name, file_content):
    """Write ``file_content`` to ``course_html/<file_name>.html``.

    Any ``/`` in ``file_name`` is replaced by ``_`` so the name cannot point
    into a sub-directory. The output directory is created if it is missing.

    Args:
        file_name: Base name for the output file, without extension.
        file_content: The bytes to write (the file is opened in binary mode).
    """
    out_dir = 'course_html/'
    # Without this, the first write on a fresh checkout fails with
    # FileNotFoundError because the directory does not exist yet.
    os.makedirs(out_dir, exist_ok=True)
    file_name = file_name.replace('/', '_') + '.html'
    path = os.path.join(out_dir, file_name)
    with open(path, 'wb') as f:
        f.write(file_content)
if __name__ == '__main__':
    # Walk the course by following each page's "next" link, saving every page
    # that is reached. The starting page itself is not saved, only the pages
    # after it.

    # Stop after this many consecutive failures on one page, so a permanent
    # error cannot keep the script looping forever.
    max_attempts = 5
    current_url = 'https://developers.google.com/machine-learning/crash-course/framing/check-your-understanding'
    attempts = 0
    while current_url:
        try:
            (_, _, next_url) = course_info(current_url)
            # A falsy next_url means there is no further page to download.
            if next_url:
                filename = os.path.basename(next_url)
                html = getHtml(next_url)
                saveHtml(filename, html)
                print(next_url)
        except Exception as exc:
            # Catching Exception rather than using a bare except keeps
            # Ctrl-C (KeyboardInterrupt) able to stop the script.
            attempts += 1
            print("Failed on %s: %r" % (current_url, exc))
            if attempts >= max_attempts:
                print("Giving up after %d failed attempts." % attempts)
                break
            time.sleep(5)
            print("Was a nice sleep, now let me continue...")
            # current_url is unchanged, so the same page is retried and not
            # silently skipped.
            continue
        # Advance only after the page was fetched and saved successfully.
        attempts = 0
        current_url = next_url