scrape.py
#!/usr/bin/env python3
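"""Scrape a Frontrow channel's full episodes into a SQLite database.

Fetches the channel's video list from the Frontrow GraphQL API, loads each
video's HLS master playlist, and stores every media rendition and variant
playlist in chosen_links.db.

Usage:
    python3 scrape.py [--viewer_token TOKEN]
"""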
import argparse
import os
import re
import sqlite3
import time
import urllib.error

import m3u8
import requests
# Start the timer
start_time = time.time()
# Parse command-line arguments
parser = argparse.ArgumentParser(description='Scrape episode playlists into a SQLite database.')
parser.add_argument('--viewer_token', type=str, help='Optional viewer token', default=None)
args = parser.parse_args()
# Database file path
db_path = 'chosen_links.db'
# Check if the database exists and delete it
if os.path.exists(db_path):
    os.remove(db_path)
# Connect to the SQLite database (this will create a new database if it doesn't exist)
conn = sqlite3.connect(db_path)
c = conn.cursor()
# Create table
c.execute('''
    CREATE TABLE IF NOT EXISTS links (
        url text PRIMARY KEY,
        season integer NOT NULL,
        episode integer NOT NULL,
        duration integer NOT NULL,
        title text NOT NULL,
        type text NOT NULL,
        language text,
        resolution text,
        bandwidth integer,
        average_bandwidth integer
    )
''')
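# Background: an HLS master playlist exposes two kinds of entries, EXT-X-MEDIA
# renditions (alternate audio/subtitle tracks) and variant playlists (the video
# stream at different resolutions/bitrates). parse_m3u8 below flattens both
# into rows of the `links` table, keyed by URL.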
def parse_m3u8(url, season, episode, duration, title):
    print(f"Loading URL {url} for season {season} episode {episode}...")
    try:
        m3u8_obj = m3u8.load(url)
    except urllib.error.HTTPError as e:
        if e.code == 500 and args.viewer_token:
            # Retry with viewer_token if available
            tokenized_url = f"{url}?viewerToken={args.viewer_token}"
            print(f"Retrying with viewer token: {tokenized_url}")
            try:
                m3u8_obj = m3u8.load(tokenized_url)
            except urllib.error.HTTPError as e_retry:
                print(f"Failed to load URL {tokenized_url}: HTTP Error {e_retry.code} {e_retry.reason}")
                return  # Early return to skip processing this URL
        else:
            print(f"Failed to load URL {url}: HTTP Error {e.code} {e.reason}")
            return  # Early return to skip processing this URL
    # Store media renditions (alternate audio/subtitle tracks)
    for media in m3u8_obj.media:
        c.execute("INSERT OR IGNORE INTO links VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (
            media.uri,
            season,
            episode,
            duration,
            title,
            media.type.lower(),
            media.language,
            None,
            None,
            None,
        ))
    # Store variant playlists (video streams at different resolutions/bitrates)
    for playlist in m3u8_obj.playlists:
        c.execute("INSERT OR IGNORE INTO links VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (
            playlist.uri,
            season,
            episode,
            duration,
            title,
            'video',
            None,
            f"{playlist.stream_info.resolution[1]}p",
            playlist.stream_info.bandwidth,
            playlist.stream_info.average_bandwidth,
        ))
query = """
query Videos($ChannelID: ID!, $OrderByField: VideoOrderField!, $OrderByDirection: OrderDirection!) {
videos(
ChannelID: $ChannelID
OrderBy: {field: $OrderByField, direction: $OrderByDirection}
) {
edges {
node {
... on Video {
id
title
description
duration
url
tags {
edges {
node {
id
value
}
}
}
}
}
}
}
}
"""
# Fetch the channel's videos, oldest first, from the Frontrow GraphQL API
response = requests.post(
    url='https://api.frontrow.cc/graphql',
    json={
        "query": query,
        "operationName": "Videos",
        "variables": {
            "ChannelID": "12884901895",
            "OrderByField": "CREATED_AT",
            "OrderByDirection": "ASC",
        },
    },
)
videos = response.json()['data']['videos']['edges']
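# Note: this reads a single page of the connection; if the channel ever exceeds
# the API's default page size, cursor-based pagination would be needed here.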
def filter_video_for_full_episodes(video):
    tags = video['node']['tags']['edges']
    return any(tag['node']['value'] == 'Full Episode' for tag in tags)
full_episodes = sorted(filter(filter_video_for_full_episodes, videos), key=lambda video: video['node']['title'])
# Initialize the counter for independent videos
independent_video_episode = 1
for video in full_episodes:
    #print(f"GraphQL response: ID({video['node']['id']}) TITLE({video['node']['title']}) URL({video['node']['url']}) OBJ({video})")
    # Fall back to the HLS endpoint derived from the video ID when no URL is given
    url = video['node']['url'] or f"https://api.frontrow.cc/channels/12884901895/VIDEO/{video['node']['id']}/hls.m3u8"
    match = re.search(r'Season ([0-9]+) Episode ([0-9]+)', video['node']['title'])
    if match is not None:
        season, episode = map(int, match.groups())
        title = re.sub(r'Season ([0-9]+) Episode ([0-9]+): ', '', video['node']['title'])
    else:
        # Videos without a "Season X Episode Y" marker go into season 0,
        # numbered sequentially in the order they appear
        season, episode = 0, independent_video_episode
        title = video['node']['title']
        independent_video_episode += 1
    parse_m3u8(url, season, episode, video['node']['duration'], title)
# Commit the changes and close the connection
conn.commit()
conn.close()
# End the timer and print the total runtime
end_time = time.time()
print(f"Execution time: {round(end_time - start_time, 3)} seconds")