From 784c94e439bf9c1726c59afd22a031a177ceca24 Mon Sep 17 00:00:00 2001 From: ifsvivek Date: Sat, 26 Aug 2023 21:46:26 +0530 Subject: [PATCH] Fixed incompatibility --- plugin/manifest.json | 2 +- plugin/yt-dlp/yt_dlp/YoutubeDL.py | 187 +- plugin/yt-dlp/yt_dlp/__init__.py | 32 +- plugin/yt-dlp/yt_dlp/casefold.py | 5 + .../yt-dlp/yt_dlp/compat/urllib/__init__.py | 7 + plugin/yt-dlp/yt_dlp/compat/urllib/request.py | 40 + plugin/yt-dlp/yt_dlp/cookies.py | 339 +- plugin/yt-dlp/yt_dlp/downloader/__init__.py | 3 +- plugin/yt-dlp/yt_dlp/downloader/common.py | 35 +- plugin/yt-dlp/yt_dlp/downloader/external.py | 4 +- plugin/yt-dlp/yt_dlp/downloader/fragment.py | 40 +- plugin/yt-dlp/yt_dlp/downloader/http.py | 7 +- plugin/yt-dlp/yt_dlp/downloader/niconico.py | 101 +- plugin/yt-dlp/yt_dlp/extractor/_extractors.py | 83 +- plugin/yt-dlp/yt_dlp/extractor/acast.py | 34 +- plugin/yt-dlp/yt_dlp/extractor/aenetworks.py | 15 +- plugin/yt-dlp/yt_dlp/extractor/afreecatv.py | 103 +- plugin/yt-dlp/yt_dlp/extractor/amp.py | 9 +- plugin/yt-dlp/yt_dlp/extractor/anvato.py | 10 +- plugin/yt-dlp/yt_dlp/extractor/ard.py | 44 +- plugin/yt-dlp/yt_dlp/extractor/bibeltv.py | 202 +- plugin/yt-dlp/yt_dlp/extractor/bilibili.py | 57 +- plugin/yt-dlp/yt_dlp/extractor/booyah.py | 86 - plugin/yt-dlp/yt_dlp/extractor/bravotv.py | 8 +- plugin/yt-dlp/yt_dlp/extractor/camfm.py | 85 + plugin/yt-dlp/yt_dlp/extractor/canvas.py | 383 - plugin/yt-dlp/yt_dlp/extractor/cbc.py | 6 +- plugin/yt-dlp/yt_dlp/extractor/cbslocal.py | 116 - plugin/yt-dlp/yt_dlp/extractor/cbsnews.py | 380 +- plugin/yt-dlp/yt_dlp/extractor/ciscowebex.py | 4 +- .../yt-dlp/yt_dlp/extractor/comedycentral.py | 5 +- plugin/yt-dlp/yt_dlp/extractor/common.py | 71 +- plugin/yt-dlp/yt_dlp/extractor/crtvg.py | 34 + plugin/yt-dlp/yt_dlp/extractor/crunchyroll.py | 662 +- plugin/yt-dlp/yt_dlp/extractor/dacast.py | 158 + plugin/yt-dlp/yt_dlp/extractor/daftsex.py | 27 +- .../yt_dlp/extractor/digitalconcerthall.py | 27 +- 
plugin/yt-dlp/yt_dlp/extractor/discogs.py | 35 + plugin/yt-dlp/yt_dlp/extractor/dropout.py | 54 +- plugin/yt-dlp/yt_dlp/extractor/dumpert.py | 49 +- .../yt-dlp/yt_dlp/extractor/elevensports.py | 59 + plugin/yt-dlp/yt_dlp/extractor/ettutv.py | 60 + plugin/yt-dlp/yt_dlp/extractor/europa.py | 80 +- plugin/yt-dlp/yt_dlp/extractor/eurosport.py | 28 +- plugin/yt-dlp/yt_dlp/extractor/foxnews.py | 77 +- plugin/yt-dlp/yt_dlp/extractor/funker530.py | 79 + plugin/yt-dlp/yt_dlp/extractor/hotstar.py | 40 +- plugin/yt-dlp/yt_dlp/extractor/idolplus.py | 115 + plugin/yt-dlp/yt_dlp/extractor/iwara.py | 149 +- plugin/yt-dlp/yt_dlp/extractor/jstream.py | 73 + plugin/yt-dlp/yt_dlp/extractor/ketnet.py | 70 - plugin/yt-dlp/yt_dlp/extractor/lbry.py | 129 +- plugin/yt-dlp/yt_dlp/extractor/litv.py | 4 +- plugin/yt-dlp/yt_dlp/extractor/livestream.py | 92 +- plugin/yt-dlp/yt_dlp/extractor/mgtv.py | 65 +- plugin/yt-dlp/yt_dlp/extractor/mzaalo.py | 92 + plugin/yt-dlp/yt_dlp/extractor/naver.py | 2 +- plugin/yt-dlp/yt_dlp/extractor/nebula.py | 36 +- plugin/yt-dlp/yt_dlp/extractor/nekohacker.py | 217 + plugin/yt-dlp/yt_dlp/extractor/nhk.py | 111 +- plugin/yt-dlp/yt_dlp/extractor/niconico.py | 163 + .../yt-dlp/yt_dlp/extractor/odnoklassniki.py | 54 +- plugin/yt-dlp/yt_dlp/extractor/owncloud.py | 80 + plugin/yt-dlp/yt_dlp/extractor/piksel.py | 16 +- plugin/yt-dlp/yt_dlp/extractor/playsuisse.py | 88 +- .../yt-dlp/yt_dlp/extractor/polskieradio.py | 208 +- plugin/yt-dlp/yt_dlp/extractor/rai.py | 565 +- plugin/yt-dlp/yt_dlp/extractor/recurbate.py | 43 + .../yt-dlp/yt_dlp/extractor/rottentomatoes.py | 80 +- plugin/yt-dlp/yt_dlp/extractor/rozhlas.py | 164 +- plugin/yt-dlp/yt_dlp/extractor/rumble.py | 2 +- plugin/yt-dlp/yt_dlp/extractor/shemaroome.py | 5 +- plugin/yt-dlp/yt_dlp/extractor/sonyliv.py | 6 +- plugin/yt-dlp/yt_dlp/extractor/stripchat.py | 3 +- plugin/yt-dlp/yt_dlp/extractor/substack.py | 8 +- .../yt-dlp/yt_dlp/extractor/sverigesradio.py | 62 +- plugin/yt-dlp/yt_dlp/extractor/tagesschau.py | 58 
+- plugin/yt-dlp/yt_dlp/extractor/tencent.py | 14 +- plugin/yt-dlp/yt_dlp/extractor/tiktok.py | 38 +- plugin/yt-dlp/yt_dlp/extractor/tv4.py | 77 +- plugin/yt-dlp/yt_dlp/extractor/tvp.py | 4 +- plugin/yt-dlp/yt_dlp/extractor/tvplay.py | 213 +- plugin/yt-dlp/yt_dlp/extractor/twitch.py | 17 +- plugin/yt-dlp/yt_dlp/extractor/twitter.py | 216 +- plugin/yt-dlp/yt_dlp/extractor/unsupported.py | 5 +- plugin/yt-dlp/yt_dlp/extractor/urplay.py | 25 +- plugin/yt-dlp/yt_dlp/extractor/vidio.py | 2 +- plugin/yt-dlp/yt_dlp/extractor/voot.py | 183 +- plugin/yt-dlp/yt_dlp/extractor/vrt.py | 413 +- plugin/yt-dlp/yt_dlp/extractor/weverse.py | 607 + plugin/yt-dlp/yt_dlp/extractor/weyyak.py | 86 + .../yt_dlp/extractor/wrestleuniverse.py | 2 +- plugin/yt-dlp/yt_dlp/extractor/wykop.py | 268 + plugin/yt-dlp/yt_dlp/extractor/ximalaya.py | 2 +- plugin/yt-dlp/yt_dlp/extractor/yappy.py | 30 +- plugin/yt-dlp/yt_dlp/extractor/youtube.py | 370 +- plugin/yt-dlp/yt_dlp/extractor/zaiko.py | 130 + plugin/yt-dlp/yt_dlp/extractor/zdf.py | 5 +- plugin/yt-dlp/yt_dlp/extractor/zee5.py | 26 +- plugin/yt-dlp/yt_dlp/extractor/zingmp3.py | 101 +- plugin/yt-dlp/yt_dlp/jsinterp.py | 21 +- plugin/yt-dlp/yt_dlp/options.py | 60 +- plugin/yt-dlp/yt_dlp/postprocessor/common.py | 2 +- plugin/yt-dlp/yt_dlp/update.py | 49 +- plugin/yt-dlp/yt_dlp/utils/__init__.py | 14 + plugin/yt-dlp/yt_dlp/utils/_deprecated.py | 30 + plugin/yt-dlp/yt_dlp/utils/_legacy.py | 180 + .../yt_dlp/{utils.py => utils/_utils.py} | 12499 ++++++++-------- plugin/yt-dlp/yt_dlp/utils/traversal.py | 254 + plugin/yt-dlp/yt_dlp/version.py | 4 +- 110 files changed, 13556 insertions(+), 9123 deletions(-) create mode 100644 plugin/yt-dlp/yt_dlp/casefold.py create mode 100644 plugin/yt-dlp/yt_dlp/compat/urllib/__init__.py create mode 100644 plugin/yt-dlp/yt_dlp/compat/urllib/request.py delete mode 100644 plugin/yt-dlp/yt_dlp/extractor/booyah.py create mode 100644 plugin/yt-dlp/yt_dlp/extractor/camfm.py delete mode 100644 
plugin/yt-dlp/yt_dlp/extractor/canvas.py delete mode 100644 plugin/yt-dlp/yt_dlp/extractor/cbslocal.py create mode 100644 plugin/yt-dlp/yt_dlp/extractor/crtvg.py create mode 100644 plugin/yt-dlp/yt_dlp/extractor/dacast.py create mode 100644 plugin/yt-dlp/yt_dlp/extractor/discogs.py create mode 100644 plugin/yt-dlp/yt_dlp/extractor/elevensports.py create mode 100644 plugin/yt-dlp/yt_dlp/extractor/ettutv.py create mode 100644 plugin/yt-dlp/yt_dlp/extractor/funker530.py create mode 100644 plugin/yt-dlp/yt_dlp/extractor/idolplus.py create mode 100644 plugin/yt-dlp/yt_dlp/extractor/jstream.py delete mode 100644 plugin/yt-dlp/yt_dlp/extractor/ketnet.py create mode 100644 plugin/yt-dlp/yt_dlp/extractor/mzaalo.py create mode 100644 plugin/yt-dlp/yt_dlp/extractor/nekohacker.py create mode 100644 plugin/yt-dlp/yt_dlp/extractor/owncloud.py create mode 100644 plugin/yt-dlp/yt_dlp/extractor/recurbate.py create mode 100644 plugin/yt-dlp/yt_dlp/extractor/weverse.py create mode 100644 plugin/yt-dlp/yt_dlp/extractor/weyyak.py create mode 100644 plugin/yt-dlp/yt_dlp/extractor/wykop.py create mode 100644 plugin/yt-dlp/yt_dlp/extractor/zaiko.py create mode 100644 plugin/yt-dlp/yt_dlp/utils/__init__.py create mode 100644 plugin/yt-dlp/yt_dlp/utils/_deprecated.py create mode 100644 plugin/yt-dlp/yt_dlp/utils/_legacy.py rename plugin/yt-dlp/yt_dlp/{utils.py => utils/_utils.py} (85%) create mode 100644 plugin/yt-dlp/yt_dlp/utils/traversal.py diff --git a/plugin/manifest.json b/plugin/manifest.json index 48bef98..706655a 100644 --- a/plugin/manifest.json +++ b/plugin/manifest.json @@ -2,7 +2,7 @@ "uuid": "videofetch", "name": "VideoFetch", "description": "A plugin for Free Download Manager to easily download YouTube videos.", - "version": "1.0.1", + "version": "1.0.2", "icon": "icon.svg", "mediaParser": true, "mediaListParser": true, diff --git a/plugin/yt-dlp/yt_dlp/YoutubeDL.py b/plugin/yt-dlp/yt_dlp/YoutubeDL.py index 5b39cfd..a6b57df 100644 --- a/plugin/yt-dlp/yt_dlp/YoutubeDL.py +++ 
b/plugin/yt-dlp/yt_dlp/YoutubeDL.py @@ -13,6 +13,7 @@ import random import re import shutil +import string import subprocess import sys import tempfile @@ -20,10 +21,9 @@ import tokenize import traceback import unicodedata -import urllib.request -from string import Formatter, ascii_letters from .cache import Cache +from .compat import urllib # isort: split from .compat import compat_os_name, compat_shlex_quote from .cookies import load_cookies from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name @@ -124,7 +124,6 @@ parse_filesize, preferredencoding, prepend_extension, - register_socks_protocols, remove_terminal_sequences, render_table, replace_extension, @@ -190,6 +189,8 @@ class YoutubeDL: ap_username: Multiple-system operator account username. ap_password: Multiple-system operator account password. usenetrc: Use netrc for authentication instead. + netrc_location: Location of the netrc file. Defaults to ~/.netrc. + netrc_cmd: Use a shell command to get credentials verbose: Print additional info to stdout. quiet: Do not print messages to stdout. no_warnings: Do not print out anything for warnings. @@ -258,7 +259,7 @@ class YoutubeDL: consoletitle: Display progress in console window's titlebar. writedescription: Write the video description to a .description file writeinfojson: Write the video description to a .info.json file - clean_infojson: Remove private fields from the infojson + clean_infojson: Remove internal metadata from the infojson getcomments: Extract video comments. This will not be written to disk unless writeinfojson is also given writeannotations: Write the video annotations to a .annotations.xml file @@ -280,7 +281,7 @@ class YoutubeDL: subtitles. The language can be prefixed with a "-" to exclude it from the requested languages, e.g. ['all', '-live_chat'] keepvideo: Keep the video file after post-processing - daterange: A DateRange object, download only if the upload_date is in the range. 
+ daterange: A utils.DateRange object, download only if the upload_date is in the range. skip_download: Skip the actual download of the video file cachedir: Location of the cache files in the filesystem. False to disable filesystem cache. @@ -329,13 +330,13 @@ class YoutubeDL: 'auto' for elaborate guessing encoding: Use this encoding instead of the system-specified. extract_flat: Whether to resolve and process url_results further - * False: Always process (default) + * False: Always process. Default for API * True: Never process * 'in_playlist': Do not process inside playlist/multi_video * 'discard': Always process, but don't return the result from inside playlist/multi_video * 'discard_in_playlist': Same as "discard", but only for - playlists (not multi_video) + playlists (not multi_video). Default for CLI wait_for_video: If given, wait for scheduled streams to become available. The value should be a tuple containing the range (min_secs, max_secs) to wait between retries @@ -415,7 +416,12 @@ class YoutubeDL: - Raise utils.DownloadCancelled(msg) to abort remaining downloads when a video is rejected. match_filter_func in utils.py is one example for this. - no_color: Do not emit color codes in output. + color: A Dictionary with output stream names as keys + and their respective color policy as values. + Can also just be a single color policy, + in which case it applies to all outputs. + Valid stream names are 'stdout' and 'stderr'. + Valid color policies are one of 'always', 'auto', 'no_color' or 'never'. 
geo_bypass: Bypass geographic restriction via faking X-Forwarded-For HTTP header geo_bypass_country: @@ -472,7 +478,7 @@ class YoutubeDL: can also be used The following options are used by the extractors: - extractor_retries: Number of times to retry for known errors + extractor_retries: Number of times to retry for known errors (default: 3) dynamic_mpd: Whether to process dynamic DASH manifests (default: True) hls_split_discontinuity: Split HLS playlists to different formats at discontinuities such as ad breaks (default: False) @@ -537,6 +543,7 @@ class YoutubeDL: data will be downloaded and processed by extractor. You can reduce network I/O by disabling it if you don't care about HLS. (only for youtube) + no_color: Same as `color='no_color'` """ _NUMERIC_FIELDS = { @@ -603,9 +610,24 @@ def __init__(self, params=None, auto_init=True): except Exception as e: self.write_debug(f'Failed to enable VT mode: {e}') + if self.params.get('no_color'): + if self.params.get('color') is not None: + self.report_warning('Overwriting params from "color" with "no_color"') + self.params['color'] = 'no_color' + + term_allow_color = os.environ.get('TERM', '').lower() != 'dumb' + + def process_color_policy(stream): + stream_name = {sys.stdout: 'stdout', sys.stderr: 'stderr'}[stream] + policy = traverse_obj(self.params, ('color', (stream_name, None), {str}), get_all=False) + if policy in ('auto', None): + return term_allow_color and supports_terminal_sequences(stream) + assert policy in ('always', 'never', 'no_color') + return {'always': True, 'never': False}.get(policy, policy) + self._allow_colors = Namespace(**{ - type_: not self.params.get('no_color') and supports_terminal_sequences(stream) - for type_, stream in self._out_files.items_ if type_ != 'console' + name: process_color_policy(stream) + for name, stream in self._out_files.items_ if name != 'console' }) # The code is left like this to be reused for future deprecations @@ -738,7 +760,6 @@ def check_deprecated(param, option, 
suggestion): when=when) self._setup_opener() - register_socks_protocols() def preload_download_archive(fn): """Preload the archive, if any is specified""" @@ -975,7 +996,7 @@ def _format_text(self, handle, allow_colors, text, f, fallback=None, *, test_enc text = text.encode(encoding, 'ignore').decode(encoding) if fallback is not None and text != original_text: text = fallback - return format_text(text, f) if allow_colors else text if fallback is None else fallback + return format_text(text, f) if allow_colors is True else text if fallback is None else fallback def _format_out(self, *args, **kwargs): return self._format_text(self._out_files.out, self._allow_colors.out, *args, **kwargs) @@ -1078,7 +1099,7 @@ def _outtmpl_expandpath(outtmpl): # correspondingly that is not what we want since we need to keep # '%%' intact for template dict substitution step. Working around # with boundary-alike separator hack. - sep = ''.join(random.choices(ascii_letters, k=32)) + sep = ''.join(random.choices(string.ascii_letters, k=32)) outtmpl = outtmpl.replace('%%', f'%{sep}%').replace('$$', f'${sep}$') # outtmpl should be expand_path'ed before template dict substitution @@ -1237,7 +1258,7 @@ def _dumpjson_default(obj): return list(obj) return repr(obj) - class _ReplacementFormatter(Formatter): + class _ReplacementFormatter(string.Formatter): def get_field(self, field_name, args, kwargs): if field_name.isdigit(): return args[0], -1 @@ -1266,17 +1287,17 @@ def create_key(outer_mobj): if fmt == 's' and value is not None and key in field_size_compat_map.keys(): fmt = f'0{field_size_compat_map[key]:d}d' - if value is None: - value = default - elif replacement is not None: + if None not in (value, replacement): try: value = replacement_formatter.format(replacement, value) except ValueError: - value = na + value, default = None, na flags = outer_mobj.group('conversion') or '' str_fmt = f'{fmt[:-1]}s' - if fmt[-1] == 'l': # list + if value is None: + value, fmt = default, 's' + elif fmt[-1] 
== 'l': # list delim = '\n' if '#' in flags else ', ' value, fmt = delim.join(map(str, variadic(value, allowed_types=(str, bytes)))), str_fmt elif fmt[-1] == 'j': # json @@ -1307,17 +1328,19 @@ def create_key(outer_mobj): value = str(value)[0] else: fmt = str_fmt - elif fmt[-1] not in 'rs': # numeric + elif fmt[-1] not in 'rsa': # numeric value = float_or_none(value) if value is None: value, fmt = default, 's' if sanitize: + # If value is an object, sanitize might convert it to a string + # So we convert it to repr first if fmt[-1] == 'r': - # If value is an object, sanitize might convert it to a string - # So we convert it to repr first value, fmt = repr(value), str_fmt - if fmt[-1] in 'csr': + elif fmt[-1] == 'a': + value, fmt = ascii(value), str_fmt + if fmt[-1] in 'csra': value = sanitizer(initial_field, value) key = '%s\0%s' % (key.replace('%', '%\0'), outer_mobj.group('format')) @@ -1383,7 +1406,7 @@ def prepare_filename(self, info_dict, dir_type='', *, outtmpl=None, warn=False): def _match_entry(self, info_dict, incomplete=False, silent=False): """Returns None if the file should be downloaded""" - _type = info_dict.get('_type', 'video') + _type = 'video' if 'playlist-match-filter' in self.params['compat_opts'] else info_dict.get('_type', 'video') assert incomplete or _type == 'video', 'Only video result can be considered complete' video_title = info_dict.get('title', info_dict.get('id', 'entry')) @@ -1881,7 +1904,7 @@ def __process_playlist(self, ie_result, download): continue entry['__x_forwarded_for_ip'] = ie_result.get('__x_forwarded_for_ip') - if not lazy and 'playlist-index' in self.params.get('compat_opts', []): + if not lazy and 'playlist-index' in self.params['compat_opts']: playlist_index = ie_result['requested_entries'][i] entry_copy = collections.ChainMap(entry, { @@ -2067,86 +2090,86 @@ def syntax_error(note, start): def _parse_filter(tokens): filter_parts = [] - for type, string, start, _, _ in tokens: - if type == tokenize.OP and string == ']': 
+ for type, string_, start, _, _ in tokens: + if type == tokenize.OP and string_ == ']': return ''.join(filter_parts) else: - filter_parts.append(string) + filter_parts.append(string_) def _remove_unused_ops(tokens): # Remove operators that we don't use and join them with the surrounding strings. # E.g. 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9' ALLOWED_OPS = ('/', '+', ',', '(', ')') last_string, last_start, last_end, last_line = None, None, None, None - for type, string, start, end, line in tokens: - if type == tokenize.OP and string == '[': + for type, string_, start, end, line in tokens: + if type == tokenize.OP and string_ == '[': if last_string: yield tokenize.NAME, last_string, last_start, last_end, last_line last_string = None - yield type, string, start, end, line + yield type, string_, start, end, line # everything inside brackets will be handled by _parse_filter - for type, string, start, end, line in tokens: - yield type, string, start, end, line - if type == tokenize.OP and string == ']': + for type, string_, start, end, line in tokens: + yield type, string_, start, end, line + if type == tokenize.OP and string_ == ']': break - elif type == tokenize.OP and string in ALLOWED_OPS: + elif type == tokenize.OP and string_ in ALLOWED_OPS: if last_string: yield tokenize.NAME, last_string, last_start, last_end, last_line last_string = None - yield type, string, start, end, line + yield type, string_, start, end, line elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]: if not last_string: - last_string = string + last_string = string_ last_start = start last_end = end else: - last_string += string + last_string += string_ if last_string: yield tokenize.NAME, last_string, last_start, last_end, last_line def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False): selectors = [] current_selector = None - for type, string, start, _, _ in tokens: + for type, string_, start, _, _ in tokens: # 
ENCODING is only defined in python 3.x if type == getattr(tokenize, 'ENCODING', None): continue elif type in [tokenize.NAME, tokenize.NUMBER]: - current_selector = FormatSelector(SINGLE, string, []) + current_selector = FormatSelector(SINGLE, string_, []) elif type == tokenize.OP: - if string == ')': + if string_ == ')': if not inside_group: # ')' will be handled by the parentheses group tokens.restore_last_token() break - elif inside_merge and string in ['/', ',']: + elif inside_merge and string_ in ['/', ',']: tokens.restore_last_token() break - elif inside_choice and string == ',': + elif inside_choice and string_ == ',': tokens.restore_last_token() break - elif string == ',': + elif string_ == ',': if not current_selector: raise syntax_error('"," must follow a format selector', start) selectors.append(current_selector) current_selector = None - elif string == '/': + elif string_ == '/': if not current_selector: raise syntax_error('"/" must follow a format selector', start) first_choice = current_selector second_choice = _parse_format_selection(tokens, inside_choice=True) current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), []) - elif string == '[': + elif string_ == '[': if not current_selector: current_selector = FormatSelector(SINGLE, 'best', []) format_filter = _parse_filter(tokens) current_selector.filters.append(format_filter) - elif string == '(': + elif string_ == '(': if current_selector: raise syntax_error('Unexpected "("', start) group = _parse_format_selection(tokens, inside_group=True) current_selector = FormatSelector(GROUP, group, []) - elif string == '+': + elif string_ == '+': if not current_selector: raise syntax_error('Unexpected "+"', start) selector_1 = current_selector @@ -2155,7 +2178,7 @@ def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, ins raise syntax_error('Expected a selector', start) current_selector = FormatSelector(MERGE, (selector_1, selector_2), []) else: - raise 
syntax_error(f'Operator not recognized: "{string}"', start) + raise syntax_error(f'Operator not recognized: "{string_}"', start) elif type == tokenize.ENDMARKER: break if current_selector: @@ -2381,8 +2404,10 @@ def restore_last_token(self): def _calc_headers(self, info_dict): res = merge_headers(self.params['http_headers'], info_dict.get('http_headers') or {}) - - cookies = self._calc_cookies(info_dict['url']) + if 'Youtubedl-No-Compression' in res: # deprecated + res.pop('Youtubedl-No-Compression', None) + res['Accept-Encoding'] = 'identity' + cookies = self.cookiejar.get_cookie_header(info_dict['url']) if cookies: res['Cookie'] = cookies @@ -2394,9 +2419,8 @@ def _calc_headers(self, info_dict): return res def _calc_cookies(self, url): - pr = sanitized_Request(url) - self.cookiejar.add_cookie_header(pr) - return pr.get_header('Cookie') + self.deprecation_warning('"YoutubeDL._calc_cookies" is deprecated and may be removed in a future version') + return self.cookiejar.get_cookie_header(url) def _sort_thumbnails(self, thumbnails): thumbnails.sort(key=lambda t: ( @@ -2723,21 +2747,22 @@ def is_wellformed(f): return info_dict format_selector = self.format_selector - if format_selector is None: - req_format = self._default_format_spec(info_dict, download=download) - self.write_debug('Default format spec: %s' % req_format) - format_selector = self.build_format_selector(req_format) - while True: if interactive_format_selection: - req_format = input( - self._format_screen('\nEnter format selector: ', self.Styles.EMPHASIS)) + req_format = input(self._format_screen('\nEnter format selector ', self.Styles.EMPHASIS) + + '(Press ENTER for default, or Ctrl+C to quit)' + + self._format_screen(': ', self.Styles.EMPHASIS)) try: - format_selector = self.build_format_selector(req_format) + format_selector = self.build_format_selector(req_format) if req_format else None except SyntaxError as err: self.report_error(err, tb=False, is_error=False) continue + if format_selector is None: 
+ req_format = self._default_format_spec(info_dict, download=download) + self.write_debug(f'Default format spec: {req_format}') + format_selector = self.build_format_selector(req_format) + formats_to_download = list(format_selector({ 'formats': formats, 'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats), @@ -2897,7 +2922,7 @@ def format_tmpl(tmpl): fmt = '%({})s' if tmpl.startswith('{'): - tmpl = f'.{tmpl}' + tmpl, fmt = f'.{tmpl}', '%({})j' if tmpl.endswith('='): tmpl, fmt = tmpl[:-1], '{0} = %({0})#j' return '\n'.join(map(fmt.format, [tmpl] if mobj.group('dict') else tmpl.split(','))) @@ -2936,7 +2961,7 @@ def print_field(field, actual_field=None, optional=False): print_field('url', 'urls') print_field('thumbnail', optional=True) print_field('description', optional=True) - print_field('filename', optional=True) + print_field('filename') if self.params.get('forceduration') and info_copy.get('duration') is not None: self.to_stdout(formatSeconds(info_copy['duration'])) print_field('format') @@ -3161,7 +3186,6 @@ def existing_video_file(*filepaths): return if info_dict.get('requested_formats') is not None: - requested_formats = info_dict['requested_formats'] old_ext = info_dict['ext'] if self.params.get('merge_output_format') is None: if (info_dict['ext'] == 'webm' @@ -3188,19 +3212,22 @@ def correct_ext(filename, ext=new_ext): full_filename = correct_ext(full_filename) temp_filename = correct_ext(temp_filename) dl_filename = existing_video_file(full_filename, temp_filename) + info_dict['__real_download'] = False + # NOTE: Copy so that original format dicts are not modified + info_dict['requested_formats'] = list(map(dict, info_dict['requested_formats'])) merger = FFmpegMergerPP(self) downloaded = [] if dl_filename is not None: self.report_file_already_downloaded(dl_filename) elif fd: - for f in requested_formats if fd != FFmpegFD else []: + for f in info_dict['requested_formats'] if fd != FFmpegFD else []: f['filepath'] = 
fname = prepend_extension( correct_ext(temp_filename, info_dict['ext']), 'f%s' % f['format_id'], info_dict['ext']) downloaded.append(fname) - info_dict['url'] = '\n'.join(f['url'] for f in requested_formats) + info_dict['url'] = '\n'.join(f['url'] for f in info_dict['requested_formats']) success, real_download = self.dl(temp_filename, info_dict) info_dict['__real_download'] = real_download else: @@ -3224,7 +3251,7 @@ def correct_ext(filename, ext=new_ext): f'You have requested downloading multiple formats to stdout {reason}. ' 'The formats will be streamed one after the other') fname = temp_filename - for f in requested_formats: + for f in info_dict['requested_formats']: new_info = dict(info_dict) del new_info['requested_formats'] new_info.update(f) @@ -3418,8 +3445,8 @@ def sanitize_info(info_dict, remove_private_keys=False): if remove_private_keys: reject = lambda k, v: v is None or k.startswith('__') or k in { 'requested_downloads', 'requested_formats', 'requested_subtitles', 'requested_entries', - 'entries', 'filepath', '_filename', 'infojson_filename', 'original_url', 'playlist_autonumber', - '_format_sort_fields', + 'entries', 'filepath', '_filename', 'filename', 'infojson_filename', 'original_url', + 'playlist_autonumber', '_format_sort_fields', } else: reject = lambda k, v: False @@ -3488,7 +3515,7 @@ def run_pp(self, pp, infodict): *files_to_delete, info=infodict, msg='Deleting original file %s (pass -k to keep)') return infodict - def run_all_pps(self, key, info, *, additional_pps=None, fatal=True): + def run_all_pps(self, key, info, *, additional_pps=None): if key != 'video': self._forceprint(key, info) for pp in (additional_pps or []) + self._pps[key]: @@ -3767,9 +3794,14 @@ def print_debug_header(self): def get_encoding(stream): ret = str(getattr(stream, 'encoding', 'missing (%s)' % type(stream).__name__)) + additional_info = [] + if os.environ.get('TERM', '').lower() == 'dumb': + additional_info.append('dumb') if not 
supports_terminal_sequences(stream): from .utils import WINDOWS_VT_MODE # Must be imported locally - ret += ' (No VT)' if WINDOWS_VT_MODE is False else ' (No ANSI)' + additional_info.append('No VT' if WINDOWS_VT_MODE is False else 'No ANSI') + if additional_info: + ret = f'{ret} ({",".join(additional_info)})' return ret encoding_str = 'Encodings: locale %s, fs %s, pref %s, %s' % ( @@ -3994,7 +4026,7 @@ def _write_subtitles(self, info_dict, filename): # that way it will silently go on when used with unsupporting IE return ret elif not subtitles: - self.to_screen('[info] There\'s no subtitles for the requested languages') + self.to_screen('[info] There are no subtitles for the requested languages') return ret sub_filename_base = self.prepare_filename(info_dict, 'subtitle') if not sub_filename_base: @@ -4048,7 +4080,7 @@ def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None if write_all or self.params.get('writethumbnail', False): thumbnails = info_dict.get('thumbnails') or [] if not thumbnails: - self.to_screen(f'[info] There\'s no {label} thumbnails to download') + self.to_screen(f'[info] There are no {label} thumbnails to download') return ret multiple = write_all and len(thumbnails) > 1 @@ -4080,8 +4112,11 @@ def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None ret.append((thumb_filename, thumb_filename_final)) t['filepath'] = thumb_filename except network_exceptions as err: + if isinstance(err, urllib.error.HTTPError) and err.code == 404: + self.to_screen(f'[info] {thumb_display_id.title()} does not exist') + else: + self.report_warning(f'Unable to download {thumb_display_id}: {err}') thumbnails.pop(idx) - self.report_warning(f'Unable to download {thumb_display_id}: {err}') if ret and not write_all: break return ret diff --git a/plugin/yt-dlp/yt_dlp/__init__.py b/plugin/yt-dlp/yt_dlp/__init__.py index 345d774..9f4cf62 100644 --- a/plugin/yt-dlp/yt_dlp/__init__.py +++ b/plugin/yt-dlp/yt_dlp/__init__.py @@ 
-13,6 +13,7 @@ import os import re import sys +import traceback from .compat import compat_shlex_quote from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS @@ -187,8 +188,8 @@ def validate_minmax(min_val, max_val, min_name, max_name=None): raise ValueError(f'{max_name} "{max_val}" must be must be greater than or equal to {min_name} "{min_val}"') # Usernames and passwords - validate(not opts.usenetrc or (opts.username is None and opts.password is None), - '.netrc', msg='using {name} conflicts with giving username/password') + validate(sum(map(bool, (opts.usenetrc, opts.netrc_cmd, opts.username))) <= 1, '.netrc', + msg='{name}, netrc command and username/password are mutually exclusive options') validate(opts.password is None or opts.username is not None, 'account username', msg='{name} missing') validate(opts.ap_password is None or opts.ap_username is not None, 'TV Provider account username', msg='{name} missing') @@ -435,6 +436,10 @@ def metadataparser_actions(f): elif ed and proto == 'default': default_downloader = ed.get_basename() + for policy in opts.color.values(): + if policy not in ('always', 'auto', 'no_color', 'never'): + raise ValueError(f'"{policy}" is not a valid color policy') + warnings, deprecation_warnings = [], [] # Common mistake: -f best @@ -736,6 +741,7 @@ def parse_options(argv=None): return ParsedOptions(parser, opts, urls, { 'usenetrc': opts.usenetrc, 'netrc_location': opts.netrc_location, + 'netrc_cmd': opts.netrc_cmd, 'username': opts.username, 'password': opts.password, 'twofactor': opts.twofactor, @@ -893,7 +899,7 @@ def parse_options(argv=None): 'playlist_items': opts.playlist_items, 'xattr_set_filesize': opts.xattr_set_filesize, 'match_filter': opts.match_filter, - 'no_color': opts.no_color, + 'color': opts.color, 'ffmpeg_location': opts.ffmpeg_location, 'hls_prefer_native': opts.hls_prefer_native, 'hls_use_mpegts': opts.hls_use_mpegts, @@ -937,14 +943,18 @@ def _real_main(argv=None): if opts.rm_cachedir: ydl.cache.remove() - 
updater = Updater(ydl, opts.update_self if isinstance(opts.update_self, str) else None) - if opts.update_self and updater.update() and actual_use: - if updater.cmd: - return updater.restart() - # This code is reachable only for zip variant in py < 3.10 - # It makes sense to exit here, but the old behavior is to continue - ydl.report_warning('Restart yt-dlp to use the updated version') - # return 100, 'ERROR: The program must exit for the update to complete' + try: + updater = Updater(ydl, opts.update_self) + if opts.update_self and updater.update() and actual_use: + if updater.cmd: + return updater.restart() + # This code is reachable only for zip variant in py < 3.10 + # It makes sense to exit here, but the old behavior is to continue + ydl.report_warning('Restart yt-dlp to use the updated version') + # return 100, 'ERROR: The program must exit for the update to complete' + except Exception: + traceback.print_exc() + ydl._download_retcode = 100 if not actual_use: if pre_process: diff --git a/plugin/yt-dlp/yt_dlp/casefold.py b/plugin/yt-dlp/yt_dlp/casefold.py new file mode 100644 index 0000000..41a53e5 --- /dev/null +++ b/plugin/yt-dlp/yt_dlp/casefold.py @@ -0,0 +1,5 @@ +import warnings + +warnings.warn(DeprecationWarning(f'{__name__} is deprecated')) + +casefold = str.casefold diff --git a/plugin/yt-dlp/yt_dlp/compat/urllib/__init__.py b/plugin/yt-dlp/yt_dlp/compat/urllib/__init__.py new file mode 100644 index 0000000..6b6b8e1 --- /dev/null +++ b/plugin/yt-dlp/yt_dlp/compat/urllib/__init__.py @@ -0,0 +1,7 @@ +# flake8: noqa: F405 +from urllib import * # noqa: F403 + +from ..compat_utils import passthrough_module + +passthrough_module(__name__, 'urllib') +del passthrough_module diff --git a/plugin/yt-dlp/yt_dlp/compat/urllib/request.py b/plugin/yt-dlp/yt_dlp/compat/urllib/request.py new file mode 100644 index 0000000..ff63b2f --- /dev/null +++ b/plugin/yt-dlp/yt_dlp/compat/urllib/request.py @@ -0,0 +1,40 @@ +# flake8: noqa: F405 +from urllib.request import * # 
noqa: F403 + +from ..compat_utils import passthrough_module + +passthrough_module(__name__, 'urllib.request') +del passthrough_module + + +from .. import compat_os_name + +if compat_os_name == 'nt': + # On older python versions, proxies are extracted from Windows registry erroneously. [1] + # If the https proxy in the registry does not have a scheme, urllib will incorrectly add https:// to it. [2] + # It is unlikely that the user has actually set it to be https, so we should be fine to safely downgrade + # it to http on these older python versions to avoid issues + # This also applies for ftp proxy type, as ftp:// proxy scheme is not supported. + # 1: https://github.com/python/cpython/issues/86793 + # 2: https://github.com/python/cpython/blob/51f1ae5ceb0673316c4e4b0175384e892e33cc6e/Lib/urllib/request.py#L2683-L2698 + import sys + from urllib.request import getproxies_environment, getproxies_registry + + def getproxies_registry_patched(): + proxies = getproxies_registry() + if ( + sys.version_info >= (3, 10, 5) # https://docs.python.org/3.10/whatsnew/changelog.html#python-3-10-5-final + or (3, 9, 13) <= sys.version_info < (3, 10) # https://docs.python.org/3.9/whatsnew/changelog.html#python-3-9-13-final + ): + return proxies + + for scheme in ('https', 'ftp'): + if scheme in proxies and proxies[scheme].startswith(f'{scheme}://'): + proxies[scheme] = 'http' + proxies[scheme][len(scheme):] + + return proxies + + def getproxies(): + return getproxies_environment() or getproxies_registry_patched() + +del compat_os_name diff --git a/plugin/yt-dlp/yt_dlp/cookies.py b/plugin/yt-dlp/yt_dlp/cookies.py index 756cda9..a9daa2a 100644 --- a/plugin/yt-dlp/yt_dlp/cookies.py +++ b/plugin/yt-dlp/yt_dlp/cookies.py @@ -1,7 +1,9 @@ import base64 +import collections import contextlib import http.cookiejar import http.cookies +import io import json import os import re @@ -11,6 +13,7 @@ import sys import tempfile import time +import urllib.request from datetime import datetime, timedelta, 
timezone from enum import Enum, auto from hashlib import pbkdf2_hmac @@ -29,11 +32,14 @@ from .minicurses import MultilinePrinter, QuietMultilinePrinter from .utils import ( Popen, - YoutubeDLCookieJar, error_to_str, + escape_url, expand_path, is_path_like, + sanitize_url, + str_or_none, try_call, + write_string, ) CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi'} @@ -347,7 +353,9 @@ class ChromeCookieDecryptor: Linux: - cookies are either v10 or v11 - v10: AES-CBC encrypted with a fixed key + - also attempts empty password if decryption fails - v11: AES-CBC encrypted with an OS protected key (keyring) + - also attempts empty password if decryption fails - v11 keys can be stored in various places depending on the activate desktop environment [2] Mac: @@ -362,7 +370,7 @@ class ChromeCookieDecryptor: Sources: - [1] https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/ - - [2] https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/key_storage_linux.cc + - [2] https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/key_storage_linux.cc - KeyStorageLinux::CreateService """ @@ -384,6 +392,7 @@ class LinuxChromeCookieDecryptor(ChromeCookieDecryptor): def __init__(self, browser_keyring_name, logger, *, keyring=None): self._logger = logger self._v10_key = self.derive_key(b'peanuts') + self._empty_key = self.derive_key(b'') self._cookie_counts = {'v10': 0, 'v11': 0, 'other': 0} self._browser_keyring_name = browser_keyring_name self._keyring = keyring @@ -396,25 +405,36 @@ def _v11_key(self): @staticmethod def derive_key(password): # values from - # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_linux.cc + # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_linux.cc return pbkdf2_sha1(password, salt=b'saltysalt', iterations=1, key_length=16) def 
decrypt(self, encrypted_value): + """ + + following the same approach as the fix in [1]: if cookies fail to decrypt then attempt to decrypt + with an empty password. The failure detection is not the same as what chromium uses so the + results won't be perfect + + References: + - [1] https://chromium.googlesource.com/chromium/src/+/bbd54702284caca1f92d656fdcadf2ccca6f4165%5E%21/ + - a bugfix to try an empty password as a fallback + """ version = encrypted_value[:3] ciphertext = encrypted_value[3:] if version == b'v10': self._cookie_counts['v10'] += 1 - return _decrypt_aes_cbc(ciphertext, self._v10_key, self._logger) + return _decrypt_aes_cbc_multi(ciphertext, (self._v10_key, self._empty_key), self._logger) elif version == b'v11': self._cookie_counts['v11'] += 1 if self._v11_key is None: self._logger.warning('cannot decrypt v11 cookies: no key found', only_once=True) return None - return _decrypt_aes_cbc(ciphertext, self._v11_key, self._logger) + return _decrypt_aes_cbc_multi(ciphertext, (self._v11_key, self._empty_key), self._logger) else: + self._logger.warning(f'unknown cookie version: "{version}"', only_once=True) self._cookie_counts['other'] += 1 return None @@ -429,7 +449,7 @@ def __init__(self, browser_keyring_name, logger): @staticmethod def derive_key(password): # values from - # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_mac.mm + # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_mac.mm return pbkdf2_sha1(password, salt=b'saltysalt', iterations=1003, key_length=16) def decrypt(self, encrypted_value): @@ -442,12 +462,12 @@ def decrypt(self, encrypted_value): self._logger.warning('cannot decrypt v10 cookies: no key found', only_once=True) return None - return _decrypt_aes_cbc(ciphertext, self._v10_key, self._logger) + return _decrypt_aes_cbc_multi(ciphertext, (self._v10_key,), self._logger) else: self._cookie_counts['other'] += 1 # other prefixes are 
considered 'old data' which were stored as plaintext - # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_mac.mm + # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_mac.mm return encrypted_value @@ -467,7 +487,7 @@ def decrypt(self, encrypted_value): self._logger.warning('cannot decrypt v10 cookies: no key found', only_once=True) return None - # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_win.cc + # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_win.cc # kNonceLength nonce_length = 96 // 8 # boringssl @@ -484,23 +504,27 @@ def decrypt(self, encrypted_value): else: self._cookie_counts['other'] += 1 # any other prefix means the data is DPAPI encrypted - # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_win.cc + # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_win.cc return _decrypt_windows_dpapi(encrypted_value, self._logger).decode() def _extract_safari_cookies(profile, logger): - if profile is not None: - logger.error('safari does not support profiles') if sys.platform != 'darwin': raise ValueError(f'unsupported platform: {sys.platform}') - cookies_path = os.path.expanduser('~/Library/Cookies/Cookies.binarycookies') + if profile: + cookies_path = os.path.expanduser(profile) + if not os.path.isfile(cookies_path): + raise FileNotFoundError('custom safari cookies database not found') + + else: + cookies_path = os.path.expanduser('~/Library/Cookies/Cookies.binarycookies') - if not os.path.isfile(cookies_path): - logger.debug('Trying secondary cookie location') - cookies_path = os.path.expanduser('~/Library/Containers/com.apple.Safari/Data/Library/Cookies/Cookies.binarycookies') if not os.path.isfile(cookies_path): - raise FileNotFoundError('could not find safari cookies 
database') + logger.debug('Trying secondary cookie location') + cookies_path = os.path.expanduser('~/Library/Containers/com.apple.Safari/Data/Library/Cookies/Cookies.binarycookies') + if not os.path.isfile(cookies_path): + raise FileNotFoundError('could not find safari cookies database') with open(cookies_path, 'rb') as f: cookies_data = f.read() @@ -663,19 +687,27 @@ class _LinuxDesktopEnvironment(Enum): """ OTHER = auto() CINNAMON = auto() + DEEPIN = auto() GNOME = auto() - KDE = auto() + KDE3 = auto() + KDE4 = auto() + KDE5 = auto() + KDE6 = auto() PANTHEON = auto() + UKUI = auto() UNITY = auto() XFCE = auto() + LXQT = auto() class _LinuxKeyring(Enum): """ - https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/key_storage_util_linux.h + https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/key_storage_util_linux.h SelectedLinuxBackend """ - KWALLET = auto() + KWALLET = auto() # KDE4 + KWALLET5 = auto() + KWALLET6 = auto() GNOMEKEYRING = auto() BASICTEXT = auto() @@ -683,7 +715,7 @@ class _LinuxKeyring(Enum): SUPPORTED_KEYRINGS = _LinuxKeyring.__members__.keys() -def _get_linux_desktop_environment(env): +def _get_linux_desktop_environment(env, logger): """ https://chromium.googlesource.com/chromium/src/+/refs/heads/main/base/nix/xdg_util.cc GetDesktopEnvironment @@ -698,51 +730,97 @@ def _get_linux_desktop_environment(env): return _LinuxDesktopEnvironment.GNOME else: return _LinuxDesktopEnvironment.UNITY + elif xdg_current_desktop == 'Deepin': + return _LinuxDesktopEnvironment.DEEPIN elif xdg_current_desktop == 'GNOME': return _LinuxDesktopEnvironment.GNOME elif xdg_current_desktop == 'X-Cinnamon': return _LinuxDesktopEnvironment.CINNAMON elif xdg_current_desktop == 'KDE': - return _LinuxDesktopEnvironment.KDE + kde_version = env.get('KDE_SESSION_VERSION', None) + if kde_version == '5': + return _LinuxDesktopEnvironment.KDE5 + elif kde_version == '6': + return _LinuxDesktopEnvironment.KDE6 + elif 
kde_version == '4': + return _LinuxDesktopEnvironment.KDE4 + else: + logger.info(f'unknown KDE version: "{kde_version}". Assuming KDE4') + return _LinuxDesktopEnvironment.KDE4 elif xdg_current_desktop == 'Pantheon': return _LinuxDesktopEnvironment.PANTHEON elif xdg_current_desktop == 'XFCE': return _LinuxDesktopEnvironment.XFCE + elif xdg_current_desktop == 'UKUI': + return _LinuxDesktopEnvironment.UKUI + elif xdg_current_desktop == 'LXQt': + return _LinuxDesktopEnvironment.LXQT + else: + logger.info(f'XDG_CURRENT_DESKTOP is set to an unknown value: "{xdg_current_desktop}"') + elif desktop_session is not None: - if desktop_session in ('mate', 'gnome'): + if desktop_session == 'deepin': + return _LinuxDesktopEnvironment.DEEPIN + elif desktop_session in ('mate', 'gnome'): return _LinuxDesktopEnvironment.GNOME - elif 'kde' in desktop_session: - return _LinuxDesktopEnvironment.KDE - elif 'xfce' in desktop_session: + elif desktop_session in ('kde4', 'kde-plasma'): + return _LinuxDesktopEnvironment.KDE4 + elif desktop_session == 'kde': + if 'KDE_SESSION_VERSION' in env: + return _LinuxDesktopEnvironment.KDE4 + else: + return _LinuxDesktopEnvironment.KDE3 + elif 'xfce' in desktop_session or desktop_session == 'xubuntu': return _LinuxDesktopEnvironment.XFCE + elif desktop_session == 'ukui': + return _LinuxDesktopEnvironment.UKUI + else: + logger.info(f'DESKTOP_SESSION is set to an unknown value: "{desktop_session}"') + else: if 'GNOME_DESKTOP_SESSION_ID' in env: return _LinuxDesktopEnvironment.GNOME elif 'KDE_FULL_SESSION' in env: - return _LinuxDesktopEnvironment.KDE + if 'KDE_SESSION_VERSION' in env: + return _LinuxDesktopEnvironment.KDE4 + else: + return _LinuxDesktopEnvironment.KDE3 return _LinuxDesktopEnvironment.OTHER def _choose_linux_keyring(logger): """ - https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/key_storage_util_linux.cc - SelectBackend + SelectBackend in [1] + + There is currently support for forcing chromium to use 
BASIC_TEXT by creating a file called + `Disable Local Encryption` [1] in the user data dir. The function to write this file (`WriteBackendUse()` [1]) + does not appear to be called anywhere other than in tests, so the user would have to create this file manually + and so would be aware enough to tell yt-dlp to use the BASIC_TEXT keyring. + + References: + - [1] https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/key_storage_util_linux.cc """ - desktop_environment = _get_linux_desktop_environment(os.environ) + desktop_environment = _get_linux_desktop_environment(os.environ, logger) logger.debug(f'detected desktop environment: {desktop_environment.name}') - if desktop_environment == _LinuxDesktopEnvironment.KDE: + if desktop_environment == _LinuxDesktopEnvironment.KDE4: linux_keyring = _LinuxKeyring.KWALLET - elif desktop_environment == _LinuxDesktopEnvironment.OTHER: + elif desktop_environment == _LinuxDesktopEnvironment.KDE5: + linux_keyring = _LinuxKeyring.KWALLET5 + elif desktop_environment == _LinuxDesktopEnvironment.KDE6: + linux_keyring = _LinuxKeyring.KWALLET6 + elif desktop_environment in ( + _LinuxDesktopEnvironment.KDE3, _LinuxDesktopEnvironment.LXQT, _LinuxDesktopEnvironment.OTHER + ): linux_keyring = _LinuxKeyring.BASICTEXT else: linux_keyring = _LinuxKeyring.GNOMEKEYRING return linux_keyring -def _get_kwallet_network_wallet(logger): +def _get_kwallet_network_wallet(keyring, logger): """ The name of the wallet used to store network passwords. 
- https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/kwallet_dbus.cc + https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/kwallet_dbus.cc KWalletDBus::NetworkWallet which does a dbus call to the following function: https://api.kde.org/frameworks/kwallet/html/classKWallet_1_1Wallet.html @@ -750,10 +828,22 @@ def _get_kwallet_network_wallet(logger): """ default_wallet = 'kdewallet' try: + if keyring == _LinuxKeyring.KWALLET: + service_name = 'org.kde.kwalletd' + wallet_path = '/modules/kwalletd' + elif keyring == _LinuxKeyring.KWALLET5: + service_name = 'org.kde.kwalletd5' + wallet_path = '/modules/kwalletd5' + elif keyring == _LinuxKeyring.KWALLET6: + service_name = 'org.kde.kwalletd6' + wallet_path = '/modules/kwalletd6' + else: + raise ValueError(keyring) + stdout, _, returncode = Popen.run([ 'dbus-send', '--session', '--print-reply=literal', - '--dest=org.kde.kwalletd5', - '/modules/kwalletd5', + f'--dest={service_name}', + wallet_path, 'org.kde.KWallet.networkWallet' ], text=True, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) @@ -768,8 +858,8 @@ def _get_kwallet_network_wallet(logger): return default_wallet -def _get_kwallet_password(browser_keyring_name, logger): - logger.debug('using kwallet-query to obtain password from kwallet') +def _get_kwallet_password(browser_keyring_name, keyring, logger): + logger.debug(f'using kwallet-query to obtain password from {keyring.name}') if shutil.which('kwallet-query') is None: logger.error('kwallet-query command not found. KWallet and kwallet-query ' @@ -777,7 +867,7 @@ def _get_kwallet_password(browser_keyring_name, logger): 'included in the kwallet package for your distribution') return b'' - network_wallet = _get_kwallet_network_wallet(logger) + network_wallet = _get_kwallet_network_wallet(keyring, logger) try: stdout, _, returncode = Popen.run([ @@ -799,8 +889,9 @@ def _get_kwallet_password(browser_keyring_name, logger): # checks hasEntry. 
To verify this: # dbus-monitor "interface='org.kde.KWallet'" "type=method_return" # while starting chrome. - # this may be a bug as the intended behaviour is to generate a random password and store - # it, but that doesn't matter here. + # this was identified as a bug later and fixed in + # https://chromium.googlesource.com/chromium/src/+/bbd54702284caca1f92d656fdcadf2ccca6f4165%5E%21/#F0 + # https://chromium.googlesource.com/chromium/src/+/5463af3c39d7f5b6d11db7fbd51e38cc1974d764 return b'' else: logger.debug('password found') @@ -838,8 +929,8 @@ def _get_linux_keyring_password(browser_keyring_name, keyring, logger): keyring = _LinuxKeyring[keyring] if keyring else _choose_linux_keyring(logger) logger.debug(f'Chosen keyring: {keyring.name}') - if keyring == _LinuxKeyring.KWALLET: - return _get_kwallet_password(browser_keyring_name, logger) + if keyring in (_LinuxKeyring.KWALLET, _LinuxKeyring.KWALLET5, _LinuxKeyring.KWALLET6): + return _get_kwallet_password(browser_keyring_name, keyring, logger) elif keyring == _LinuxKeyring.GNOMEKEYRING: return _get_gnome_keyring_password(browser_keyring_name, logger) elif keyring == _LinuxKeyring.BASICTEXT: @@ -867,6 +958,10 @@ def _get_mac_keyring_password(browser_keyring_name, logger): def _get_windows_v10_key(browser_root, logger): + """ + References: + - [1] https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_win.cc + """ path = _find_most_recently_used_file(browser_root, 'Local State', logger) if path is None: logger.error('could not find local state file') @@ -875,11 +970,13 @@ def _get_windows_v10_key(browser_root, logger): with open(path, encoding='utf8') as f: data = json.load(f) try: + # kOsCryptEncryptedKeyPrefName in [1] base64_key = data['os_crypt']['encrypted_key'] except KeyError: logger.error('no encrypted key in Local State') return None encrypted_key = base64.b64decode(base64_key) + # kDPAPIKeyPrefix in [1] prefix = b'DPAPI' if not encrypted_key.startswith(prefix): 
logger.error('invalid key') @@ -891,13 +988,15 @@ def pbkdf2_sha1(password, salt, iterations, key_length): return pbkdf2_hmac('sha1', password, salt, iterations, key_length) -def _decrypt_aes_cbc(ciphertext, key, logger, initialization_vector=b' ' * 16): - plaintext = unpad_pkcs7(aes_cbc_decrypt_bytes(ciphertext, key, initialization_vector)) - try: - return plaintext.decode() - except UnicodeDecodeError: - logger.warning('failed to decrypt cookie (AES-CBC) because UTF-8 decoding failed. Possibly the key is wrong?', only_once=True) - return None +def _decrypt_aes_cbc_multi(ciphertext, keys, logger, initialization_vector=b' ' * 16): + for key in keys: + plaintext = unpad_pkcs7(aes_cbc_decrypt_bytes(ciphertext, key, initialization_vector)) + try: + return plaintext.decode() + except UnicodeDecodeError: + pass + logger.warning('failed to decrypt cookie (AES-CBC) because UTF-8 decoding failed. Possibly the key is wrong?', only_once=True) + return None def _decrypt_aes_gcm(ciphertext, key, nonce, authentication_tag, logger): @@ -1091,3 +1190,143 @@ def load(self, data): else: morsel = None + + +class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar): + """ + See [1] for cookie file format. + + 1. https://curl.haxx.se/docs/http-cookies.html + """ + _HTTPONLY_PREFIX = '#HttpOnly_' + _ENTRY_LEN = 7 + _HEADER = '''# Netscape HTTP Cookie File +# This file is generated by yt-dlp. Do not edit. 
+ +''' + _CookieFileEntry = collections.namedtuple( + 'CookieFileEntry', + ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value')) + + def __init__(self, filename=None, *args, **kwargs): + super().__init__(None, *args, **kwargs) + if is_path_like(filename): + filename = os.fspath(filename) + self.filename = filename + + @staticmethod + def _true_or_false(cndn): + return 'TRUE' if cndn else 'FALSE' + + @contextlib.contextmanager + def open(self, file, *, write=False): + if is_path_like(file): + with open(file, 'w' if write else 'r', encoding='utf-8') as f: + yield f + else: + if write: + file.truncate(0) + yield file + + def _really_save(self, f, ignore_discard=False, ignore_expires=False): + now = time.time() + for cookie in self: + if (not ignore_discard and cookie.discard + or not ignore_expires and cookie.is_expired(now)): + continue + name, value = cookie.name, cookie.value + if value is None: + # cookies.txt regards 'Set-Cookie: foo' as a cookie + # with no name, whereas http.cookiejar regards it as a + # cookie with no value. + name, value = '', name + f.write('%s\n' % '\t'.join(( + cookie.domain, + self._true_or_false(cookie.domain.startswith('.')), + cookie.path, + self._true_or_false(cookie.secure), + str_or_none(cookie.expires, default=''), + name, value + ))) + + def save(self, filename=None, *args, **kwargs): + """ + Save cookies to a file. 
+ Code is taken from CPython 3.6 + https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """ + + if filename is None: + if self.filename is not None: + filename = self.filename + else: + raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT) + + # Store session cookies with `expires` set to 0 instead of an empty string + for cookie in self: + if cookie.expires is None: + cookie.expires = 0 + + with self.open(filename, write=True) as f: + f.write(self._HEADER) + self._really_save(f, *args, **kwargs) + + def load(self, filename=None, ignore_discard=False, ignore_expires=False): + """Load cookies from a file.""" + if filename is None: + if self.filename is not None: + filename = self.filename + else: + raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT) + + def prepare_line(line): + if line.startswith(self._HTTPONLY_PREFIX): + line = line[len(self._HTTPONLY_PREFIX):] + # comments and empty lines are fine + if line.startswith('#') or not line.strip(): + return line + cookie_list = line.split('\t') + if len(cookie_list) != self._ENTRY_LEN: + raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list)) + cookie = self._CookieFileEntry(*cookie_list) + if cookie.expires_at and not cookie.expires_at.isdigit(): + raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at) + return line + + cf = io.StringIO() + with self.open(filename) as f: + for line in f: + try: + cf.write(prepare_line(line)) + except http.cookiejar.LoadError as e: + if f'{line.strip()} '[0] in '[{"': + raise http.cookiejar.LoadError( + 'Cookies file must be Netscape formatted, not JSON. 
See ' + 'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp') + write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n') + continue + cf.seek(0) + self._really_load(cf, filename, ignore_discard, ignore_expires) + # Session cookies are denoted by either `expires` field set to + # an empty string or 0. MozillaCookieJar only recognizes the former + # (see [1]). So we need force the latter to be recognized as session + # cookies on our own. + # Session cookies may be important for cookies-based authentication, + # e.g. usually, when user does not check 'Remember me' check box while + # logging in on a site, some important cookies are stored as session + # cookies so that not recognizing them will result in failed login. + # 1. https://bugs.python.org/issue17164 + for cookie in self: + # Treat `expires=0` cookies as session cookies + if cookie.expires == 0: + cookie.expires = None + cookie.discard = True + + def get_cookie_header(self, url): + """Generate a Cookie HTTP header for a given url""" + cookie_req = urllib.request.Request(escape_url(sanitize_url(url))) + self.add_cookie_header(cookie_req) + return cookie_req.get_header('Cookie') + + def clear(self, *args, **kwargs): + with contextlib.suppress(KeyError): + return super().clear(*args, **kwargs) diff --git a/plugin/yt-dlp/yt_dlp/downloader/__init__.py b/plugin/yt-dlp/yt_dlp/downloader/__init__.py index f12b920..18ac43d 100644 --- a/plugin/yt-dlp/yt_dlp/downloader/__init__.py +++ b/plugin/yt-dlp/yt_dlp/downloader/__init__.py @@ -30,7 +30,7 @@ def get_suitable_downloader(info_dict, params={}, default=NO_DEFAULT, protocol=N from .http import HttpFD from .ism import IsmFD from .mhtml import MhtmlFD -from .niconico import NiconicoDmcFD +from .niconico import NiconicoDmcFD, NiconicoLiveFD from .rtmp import RtmpFD from .rtsp import RtspFD from .websocket import WebSocketFragmentFD @@ -50,6 +50,7 @@ def get_suitable_downloader(info_dict, params={}, default=NO_DEFAULT, protocol=N 
'ism': IsmFD, 'mhtml': MhtmlFD, 'niconico_dmc': NiconicoDmcFD, + 'niconico_live': NiconicoLiveFD, 'fc2_live': FC2LiveFD, 'websocket_frag': WebSocketFragmentFD, 'youtube_live_chat': YoutubeLiveChatFD, diff --git a/plugin/yt-dlp/yt_dlp/downloader/common.py b/plugin/yt-dlp/yt_dlp/downloader/common.py index 38fba10..773b08a 100644 --- a/plugin/yt-dlp/yt_dlp/downloader/common.py +++ b/plugin/yt-dlp/yt_dlp/downloader/common.py @@ -49,10 +49,10 @@ class FileDownloader: verbose: Print additional info to stdout. quiet: Do not print messages to stdout. ratelimit: Download speed limit, in bytes/sec. - continuedl: Attempt to continue downloads if possible throttledratelimit: Assume the download is being throttled below this speed (bytes/sec) - retries: Number of times to retry for HTTP error 5xx - file_access_retries: Number of times to retry on file access error + retries: Number of times to retry for expected network errors. + Default is 0 for API, but 10 for CLI + file_access_retries: Number of times to retry on file access error (default: 3) buffersize: Size of download buffer in bytes. noresizebuffer: Do not automatically resize the download buffer. continuedl: Try to continue downloads if possible. 
@@ -138,17 +138,21 @@ def calc_percent(byte_counter, data_len): def format_percent(percent): return ' N/A%' if percent is None else f'{percent:>5.1f}%' - @staticmethod - def calc_eta(start, now, total, current): + @classmethod + def calc_eta(cls, start_or_rate, now_or_remaining, total=NO_DEFAULT, current=NO_DEFAULT): + if total is NO_DEFAULT: + rate, remaining = start_or_rate, now_or_remaining + if None in (rate, remaining): + return None + return int(float(remaining) / rate) + + start, now = start_or_rate, now_or_remaining if total is None: return None if now is None: now = time.time() - dif = now - start - if current == 0 or dif < 0.001: # One millisecond - return None - rate = float(current) / dif - return int((float(total) - float(current)) / rate) + rate = cls.calc_speed(start, now, current) + return rate and int((float(total) - float(current)) / rate) @staticmethod def calc_speed(start, now, bytes): @@ -165,6 +169,12 @@ def format_speed(speed): def format_retries(retries): return 'inf' if retries == float('inf') else int(retries) + @staticmethod + def filesize_or_none(unencoded_filename): + if os.path.isfile(unencoded_filename): + return os.path.getsize(unencoded_filename) + return 0 + @staticmethod def best_block_size(elapsed_time, bytes): new_min = max(bytes / 2.0, 1.0) @@ -225,7 +235,7 @@ def error_callback(err, count, retries, *, fd): sleep_func=fd.params.get('retry_sleep_functions', {}).get('file_access')) def wrapper(self, func, *args, **kwargs): - for retry in RetryManager(self.params.get('file_access_retries'), error_callback, fd=self): + for retry in RetryManager(self.params.get('file_access_retries', 3), error_callback, fd=self): try: return func(self, *args, **kwargs) except OSError as err: @@ -285,7 +295,8 @@ def _prepare_multiline_status(self, lines=1): self._multiline = BreaklineStatusPrinter(self.ydl._out_files.out, lines) else: self._multiline = MultilinePrinter(self.ydl._out_files.out, lines, not self.params.get('quiet')) - 
self._multiline.allow_colors = self._multiline._HAVE_FULLCAP and not self.params.get('no_color') + self._multiline.allow_colors = self.ydl._allow_colors.out and self.ydl._allow_colors.out != 'no_color' + self._multiline._HAVE_FULLCAP = self.ydl._allow_colors.out def _finish_multiline_status(self): self._multiline.end() diff --git a/plugin/yt-dlp/yt_dlp/downloader/external.py b/plugin/yt-dlp/yt_dlp/downloader/external.py index ef1e1f5..5ce7af3 100644 --- a/plugin/yt-dlp/yt_dlp/downloader/external.py +++ b/plugin/yt-dlp/yt_dlp/downloader/external.py @@ -23,7 +23,6 @@ encodeArgument, encodeFilename, find_available_port, - handle_youtubedl_headers, remove_end, sanitized_Request, traverse_obj, @@ -529,10 +528,9 @@ def _call_downloader(self, tmpfilename, info_dict): selected_formats = info_dict.get('requested_formats') or [info_dict] for i, fmt in enumerate(selected_formats): if fmt.get('http_headers') and re.match(r'^https?://', fmt['url']): - headers_dict = handle_youtubedl_headers(fmt['http_headers']) # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. - args.extend(['-headers', ''.join(f'{key}: {val}\r\n' for key, val in headers_dict.items())]) + args.extend(['-headers', ''.join(f'{key}: {val}\r\n' for key, val in fmt['http_headers'].items())]) if start_time: args += ['-ss', str(start_time)] diff --git a/plugin/yt-dlp/yt_dlp/downloader/fragment.py b/plugin/yt-dlp/yt_dlp/downloader/fragment.py index 61f1bc8..ec400c5 100644 --- a/plugin/yt-dlp/yt_dlp/downloader/fragment.py +++ b/plugin/yt-dlp/yt_dlp/downloader/fragment.py @@ -34,8 +34,8 @@ class FragmentFD(FileDownloader): Available options: - fragment_retries: Number of times to retry a fragment for HTTP error (DASH - and hlsnative only) + fragment_retries: Number of times to retry a fragment for HTTP error + (DASH and hlsnative only). 
Default is 0 for API, but 10 for CLI skip_unavailable_fragments: Skip unavailable fragments (DASH and hlsnative only) keep_fragments: Keep downloaded fragments on disk after downloading is @@ -121,6 +121,11 @@ def _download_fragment(self, ctx, frag_url, info_dict, headers=None, request_dat 'request_data': request_data, 'ctx_id': ctx.get('ctx_id'), } + frag_resume_len = 0 + if ctx['dl'].params.get('continuedl', True): + frag_resume_len = self.filesize_or_none(self.temp_name(fragment_filename)) + fragment_info_dict['frag_resume_len'] = ctx['frag_resume_len'] = frag_resume_len + success, _ = ctx['dl'].download(fragment_filename, fragment_info_dict) if not success: return False @@ -155,9 +160,7 @@ def _append_fragment(self, ctx, frag_content): del ctx['fragment_filename_sanitized'] def _prepare_frag_download(self, ctx): - if 'live' not in ctx: - ctx['live'] = False - if not ctx['live']: + if not ctx.setdefault('live', False): total_frags_str = '%d' % ctx['total_frags'] ad_frags = ctx.get('ad_frags', 0) if ad_frags: @@ -170,15 +173,17 @@ def _prepare_frag_download(self, ctx): **self.params, 'noprogress': True, 'test': False, + 'sleep_interval': 0, + 'max_sleep_interval': 0, + 'sleep_interval_subtitles': 0, }) tmpfilename = self.temp_name(ctx['filename']) open_mode = 'wb' - resume_len = 0 # Establish possible resume length - if os.path.isfile(encodeFilename(tmpfilename)): + resume_len = self.filesize_or_none(tmpfilename) + if resume_len > 0: open_mode = 'ab' - resume_len = os.path.getsize(encodeFilename(tmpfilename)) # Should be initialized before ytdl file check ctx.update({ @@ -187,7 +192,9 @@ def _prepare_frag_download(self, ctx): }) if self.__do_ytdl_file(ctx): - if os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))): + ytdl_file_exists = os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))) + continuedl = self.params.get('continuedl', True) + if continuedl and ytdl_file_exists: self._read_ytdl_file(ctx) is_corrupt = 
ctx.get('ytdl_corrupt') is True is_inconsistent = ctx['fragment_index'] > 0 and resume_len == 0 @@ -201,7 +208,12 @@ def _prepare_frag_download(self, ctx): if 'ytdl_corrupt' in ctx: del ctx['ytdl_corrupt'] self._write_ytdl_file(ctx) + else: + if not continuedl: + if ytdl_file_exists: + self._read_ytdl_file(ctx) + ctx['fragment_index'] = resume_len = 0 self._write_ytdl_file(ctx) assert ctx['fragment_index'] == 0 @@ -274,12 +286,10 @@ def frag_progress_hook(s): else: frag_downloaded_bytes = s['downloaded_bytes'] state['downloaded_bytes'] += frag_downloaded_bytes - ctx['prev_frag_downloaded_bytes'] - if not ctx['live']: - state['eta'] = self.calc_eta( - start, time_now, estimated_size - resume_len, - state['downloaded_bytes'] - resume_len) ctx['speed'] = state['speed'] = self.calc_speed( - ctx['fragment_started'], time_now, frag_downloaded_bytes) + ctx['fragment_started'], time_now, frag_downloaded_bytes - ctx.get('frag_resume_len', 0)) + if not ctx['live']: + state['eta'] = self.calc_eta(state['speed'], estimated_size - state['downloaded_bytes']) ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes self._hook_progress(state, info_dict) @@ -297,7 +307,7 @@ def _finish_frag_download(self, ctx, info_dict): to_file = ctx['tmpfilename'] != '-' if to_file: - downloaded_bytes = os.path.getsize(encodeFilename(ctx['tmpfilename'])) + downloaded_bytes = self.filesize_or_none(ctx['tmpfilename']) else: downloaded_bytes = ctx['complete_frags_downloaded_bytes'] diff --git a/plugin/yt-dlp/yt_dlp/downloader/http.py b/plugin/yt-dlp/yt_dlp/downloader/http.py index 3cb5825..6fbbb4b 100644 --- a/plugin/yt-dlp/yt_dlp/downloader/http.py +++ b/plugin/yt-dlp/yt_dlp/downloader/http.py @@ -45,8 +45,8 @@ class DownloadContext(dict): ctx.tmpfilename = self.temp_name(filename) ctx.stream = None - # Do not include the Accept-Encoding header - headers = {'Youtubedl-no-compression': 'True'} + # Disable compression + headers = {'Accept-Encoding': 'identity'} add_headers = 
info_dict.get('http_headers') if add_headers: headers.update(add_headers) @@ -150,7 +150,8 @@ def establish_connection(): # Content-Range is either not present or invalid. Assuming remote webserver is # trying to send the whole file, resume is not possible, so wiping the local file # and performing entire redownload - self.report_unable_to_resume() + elif range_start > 0: + self.report_unable_to_resume() ctx.resume_len = 0 ctx.open_mode = 'wb' ctx.data_len = ctx.content_len = int_or_none(ctx.data.info().get('Content-length', None)) diff --git a/plugin/yt-dlp/yt_dlp/downloader/niconico.py b/plugin/yt-dlp/yt_dlp/downloader/niconico.py index ccca404..c945b63 100644 --- a/plugin/yt-dlp/yt_dlp/downloader/niconico.py +++ b/plugin/yt-dlp/yt_dlp/downloader/niconico.py @@ -1,8 +1,17 @@ +import json import threading +import time from . import get_suitable_downloader from .common import FileDownloader -from ..utils import sanitized_Request +from .external import FFmpegFD +from ..utils import ( + DownloadError, + WebSocketsWrapper, + sanitized_Request, + str_or_none, + try_get, +) class NiconicoDmcFD(FileDownloader): @@ -50,3 +59,93 @@ def heartbeat(): timer[0].cancel() download_complete = True return success + + +class NiconicoLiveFD(FileDownloader): + """ Downloads niconico live without being stopped """ + + def real_download(self, filename, info_dict): + video_id = info_dict['video_id'] + ws_url = info_dict['url'] + ws_extractor = info_dict['ws'] + ws_origin_host = info_dict['origin'] + cookies = info_dict.get('cookies') + live_quality = info_dict.get('live_quality', 'high') + live_latency = info_dict.get('live_latency', 'high') + dl = FFmpegFD(self.ydl, self.params or {}) + + new_info_dict = info_dict.copy() + new_info_dict.update({ + 'protocol': 'm3u8', + }) + + def communicate_ws(reconnect): + if reconnect: + ws = WebSocketsWrapper(ws_url, { + 'Cookies': str_or_none(cookies) or '', + 'Origin': f'https://{ws_origin_host}', + 'Accept': '*/*', + 'User-Agent': 
self.params['http_headers']['User-Agent'], + }) + if self.ydl.params.get('verbose', False): + self.to_screen('[debug] Sending startWatching request') + ws.send(json.dumps({ + 'type': 'startWatching', + 'data': { + 'stream': { + 'quality': live_quality, + 'protocol': 'hls+fmp4', + 'latency': live_latency, + 'chasePlay': False + }, + 'room': { + 'protocol': 'webSocket', + 'commentable': True + }, + 'reconnect': True, + } + })) + else: + ws = ws_extractor + with ws: + while True: + recv = ws.recv() + if not recv: + continue + data = json.loads(recv) + if not data or not isinstance(data, dict): + continue + if data.get('type') == 'ping': + # pong back + ws.send(r'{"type":"pong"}') + ws.send(r'{"type":"keepSeat"}') + elif data.get('type') == 'disconnect': + self.write_debug(data) + return True + elif data.get('type') == 'error': + self.write_debug(data) + message = try_get(data, lambda x: x['body']['code'], str) or recv + return DownloadError(message) + elif self.ydl.params.get('verbose', False): + if len(recv) > 100: + recv = recv[:100] + '...' 
+ self.to_screen('[debug] Server said: %s' % recv) + + def ws_main(): + reconnect = False + while True: + try: + ret = communicate_ws(reconnect) + if ret is True: + return + except BaseException as e: + self.to_screen('[%s] %s: Connection error occured, reconnecting after 10 seconds: %s' % ('niconico:live', video_id, str_or_none(e))) + time.sleep(10) + continue + finally: + reconnect = True + + thread = threading.Thread(target=ws_main, daemon=True) + thread.start() + + return dl.download(filename, new_info_dict) diff --git a/plugin/yt-dlp/yt_dlp/extractor/_extractors.py b/plugin/yt-dlp/yt_dlp/extractor/_extractors.py index b054880..c9bd6a4 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/_extractors.py +++ b/plugin/yt-dlp/yt_dlp/extractor/_extractors.py @@ -204,7 +204,11 @@ BFMTVLiveIE, BFMTVArticleIE, ) -from .bibeltv import BibelTVIE +from .bibeltv import ( + BibelTVLiveIE, + BibelTVSeriesIE, + BibelTVVideoIE, +) from .bigflix import BigflixIE from .bigo import BigoIE from .bild import BildIE @@ -247,7 +251,6 @@ from .bostonglobe import BostonGlobeIE from .box import BoxIE from .boxcast import BoxCastVideoIE -from .booyah import BooyahClipsIE from .bpb import BpbIE from .br import ( BRIE, @@ -281,6 +284,10 @@ CamdemyIE, CamdemyFolderIE ) +from .camfm import ( + CamFMEpisodeIE, + CamFMShowIE +) from .cammodels import CamModelsIE from .camsoda import CamsodaIE from .camtasia import CamtasiaEmbedIE @@ -288,12 +295,6 @@ from .canalalpha import CanalAlphaIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE -from .canvas import ( - CanvasIE, - CanvasEenIE, - VrtNUIE, - DagelijkseKostIE, -) from .carambatv import ( CarambaTVIE, CarambaTVPageIE, @@ -310,14 +311,14 @@ CBSIE, ParamountPressExpressIE, ) -from .cbslocal import ( - CBSLocalIE, - CBSLocalArticleIE, -) from .cbsinteractive import CBSInteractiveIE from .cbsnews import ( CBSNewsEmbedIE, CBSNewsIE, + CBSLocalIE, + CBSLocalArticleIE, + CBSLocalLiveIE, + CBSNewsLiveIE, CBSNewsLiveVideoIE, ) from 
.cbssports import ( @@ -404,9 +405,12 @@ CrowdBunkerIE, CrowdBunkerChannelIE, ) +from .crtvg import CrtvgIE from .crunchyroll import ( CrunchyrollBetaIE, CrunchyrollBetaShowIE, + CrunchyrollMusicIE, + CrunchyrollArtistIE, ) from .cspan import CSpanIE, CSpanCongressIE from .ctsnews import CtsNewsIE @@ -423,6 +427,10 @@ CybraryIE, CybraryCourseIE ) +from .dacast import ( + DacastVODIE, + DacastPlaylistIE, +) from .daftsex import DaftsexIE from .dailymail import DailyMailIE from .dailymotion import ( @@ -512,6 +520,7 @@ DeuxMNewsIE ) from .digitalconcerthall import DigitalConcertHallIE +from .discogs import DiscogsReleasePlaylistIE from .discovery import DiscoveryIE from .disney import DisneyIE from .dispeak import DigitallySpeakingIE @@ -536,6 +545,7 @@ from .eighttracks import EightTracksIE from .einthusan import EinthusanIE from .eitb import EitbIE +from .elevensports import ElevenSportsIE from .ellentube import ( EllenTubeIE, EllenTubeVideoIE, @@ -569,6 +579,7 @@ ESPNCricInfoIE, ) from .esri import EsriVideoIE +from .ettutv import EttuTvIE from .europa import EuropaIE, EuroParlWebstreamIE from .europeantour import EuropeanTourIE from .eurosport import EurosportIE @@ -655,6 +666,7 @@ FunimationShowIE, ) from .funk import FunkIE +from .funker530 import Funker530IE from .fusion import FusionIE from .fuyintv import FuyinTVIE from .gab import ( @@ -784,6 +796,7 @@ IchinanaLiveIE, IchinanaLiveClipIE, ) +from .idolplus import IdolPlusIE from .ign import ( IGNIE, IGNVideoIE, @@ -868,6 +881,7 @@ from .jeuxvideo import JeuxVideoIE from .jove import JoveIE from .joj import JojIE +from .jstream import JStreamIE from .jwplatform import JWPlatformIE from .kakao import KakaoIE from .kaltura import KalturaIE @@ -877,7 +891,6 @@ from .karrierevideos import KarriereVideosIE from .keezmovies import KeezMoviesIE from .kelbyone import KelbyOneIE -from .ketnet import KetnetIE from .khanacademy import ( KhanAcademyIE, KhanAcademyUnitIE, @@ -1147,6 +1160,7 @@ ) from .myvideoge import 
MyVideoGeIE from .myvidster import MyVidsterIE +from .mzaalo import MzaaloIE from .n1 import ( N1InfoAssetIE, N1InfoIIE, @@ -1195,6 +1209,7 @@ NebulaSubscriptionsIE, NebulaChannelIE, ) +from .nekohacker import NekoHackerIE from .nerdcubed import NerdCubedFeedIE from .netzkino import NetzkinoIE from .neteasemusic import ( @@ -1245,6 +1260,7 @@ NhkForSchoolProgramListIE, NhkRadioNewsPageIE, NhkRadiruIE, + NhkRadiruLiveIE, ) from .nhl import NHLIE from .nick import ( @@ -1264,6 +1280,7 @@ NicovideoSearchIE, NicovideoSearchURLIE, NicovideoTagURLIE, + NiconicoLiveIE, ) from .ninecninemedia import ( NineCNineMediaIE, @@ -1373,6 +1390,7 @@ ORFIPTVIE, ) from .outsidetv import OutsideTVIE +from .owncloud import OwnCloudIE from .packtpub import ( PacktPubIE, PacktPubCourseIE, @@ -1474,7 +1492,6 @@ PolskieRadioPlayerIE, PolskieRadioPodcastIE, PolskieRadioPodcastListIE, - PolskieRadioRadioKierowcowIE, ) from .popcorntimes import PopcorntimesIE from .popcorntv import PopcornTVIE @@ -1544,6 +1561,8 @@ RadLiveSeasonIE, ) from .rai import ( + RaiIE, + RaiCulturaIE, RaiPlayIE, RaiPlayLiveIE, RaiPlayPlaylistIE, @@ -1552,7 +1571,6 @@ RaiPlaySoundPlaylistIE, RaiNewsIE, RaiSudtirolIE, - RaiIE, ) from .raywenderlich import ( RayWenderlichIE, @@ -1574,6 +1592,7 @@ RCTIPlusTVIE, ) from .rds import RDSIE +from .recurbate import RecurbateIE from .redbee import ParliamentLiveUKIE, RTBFIE from .redbulltv import ( RedBullTVIE, @@ -1610,6 +1629,7 @@ from .rozhlas import ( RozhlasIE, RozhlasVltavaIE, + MujRozhlasIE, ) from .rte import RteIE, RteRadioIE from .rtlnl import ( @@ -2080,7 +2100,6 @@ ) from .tvplay import ( TVPlayIE, - ViafreeIE, TVPlayHomeIE, ) from .tvplayer import TVPlayerIE @@ -2264,7 +2283,12 @@ VoxMediaVolumeIE, VoxMediaIE, ) -from .vrt import VRTIE +from .vrt import ( + VRTIE, + VrtNUIE, + KetnetIE, + DagelijkseKostIE, +) from .vrak import VrakIE from .vrv import ( VRVIE, @@ -2315,7 +2339,16 @@ WeiboMobileIE ) from .weiqitv import WeiqiTVIE +from .weverse import ( + WeverseIE, 
+ WeverseMediaIE, + WeverseMomentIE, + WeverseLiveTabIE, + WeverseMediaTabIE, + WeverseLiveIE, +) from .wevidi import WeVidiIE +from .weyyak import WeyyakIE from .whyp import WhypIE from .wikimedia import WikimediaIE from .willow import WillowIE @@ -2344,6 +2377,12 @@ WSJArticleIE, ) from .wwe import WWEIE +from .wykop import ( + WykopDigIE, + WykopDigCommentIE, + WykopPostIE, + WykopPostCommentIE, +) from .xanimu import XanimuIE from .xbef import XBefIE from .xboxclips import XboxClipsIE @@ -2388,7 +2427,10 @@ ZenYandexChannelIE, ) from .yapfiles import YapFilesIE -from .yappy import YappyIE +from .yappy import ( + YappyIE, + YappyProfileIE, +) from .yesjapan import YesJapanIE from .yinyuetai import YinYueTaiIE from .yle_areena import YleAreenaIE @@ -2406,6 +2448,10 @@ from .youporn import YouPornIE from .yourporn import YourPornIE from .yourupload import YourUploadIE +from .zaiko import ( + ZaikoIE, + ZaikoETicketIE, +) from .zapiks import ZapiksIE from .zattoo import ( BBVTVIE, @@ -2463,6 +2509,7 @@ ZingMp3WeekChartIE, ZingMp3ChartMusicVideoIE, ZingMp3UserIE, + ZingMp3HubIE, ) from .zoom import ZoomIE from .zype import ZypeIE diff --git a/plugin/yt-dlp/yt_dlp/extractor/acast.py b/plugin/yt-dlp/yt_dlp/extractor/acast.py index b88846b..3b877bc 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/acast.py +++ b/plugin/yt-dlp/yt_dlp/extractor/acast.py @@ -40,28 +40,33 @@ def _call_api(self, path, video_id, query=None): class ACastIE(ACastBaseIE): IE_NAME = 'acast' - _VALID_URL = r'''(?x) + _VALID_URL = r'''(?x: https?:// (?: (?:(?:embed|www)\.)?acast\.com/| play\.acast\.com/s/ ) - (?P[^/]+)/(?P[^/#?]+) - ''' + (?P[^/]+)/(?P[^/#?"]+) + )''' + _EMBED_REGEX = [rf'(?x)]+\bsrc=[\'"](?P{_VALID_URL})'] _TESTS = [{ 'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna', - 'md5': 'f5598f3ad1e4776fed12ec1407153e4b', 'info_dict': { 'id': '2a92b283-1a75-4ad8-8396-499c641de0d9', 'ext': 'mp3', 'title': '2. 
Raggarmordet - Röster ur det förflutna', - 'description': 'md5:a992ae67f4d98f1c0141598f7bebbf67', + 'description': 'md5:013959207e05011ad14a222cf22278cc', 'timestamp': 1477346700, 'upload_date': '20161024', 'duration': 2766, - 'creator': 'Anton Berg & Martin Johnson', + 'creator': 'Third Ear Studio', 'series': 'Spår', 'episode': '2. Raggarmordet - Röster ur det förflutna', + 'thumbnail': 'https://assets.pippa.io/shows/616ebe1886d7b1398620b943/616ebe33c7e6e70013cae7da.jpg', + 'episode_number': 2, + 'display_id': '2.raggarmordet-rosterurdetforflutna', + 'season_number': 4, + 'season': 'Season 4', } }, { 'url': 'http://embed.acast.com/adambuxton/ep.12-adam-joeschristmaspodcast2015', @@ -73,6 +78,23 @@ class ACastIE(ACastBaseIE): 'url': 'https://play.acast.com/s/sparpodcast/2a92b283-1a75-4ad8-8396-499c641de0d9', 'only_matching': True, }] + _WEBPAGE_TESTS = [{ + 'url': 'https://ausi.anu.edu.au/news/democracy-sausage-episode-can-labor-be-long-form-government', + 'info_dict': { + 'id': '646c68fb21fbf20011e9c651', + 'ext': 'mp3', + 'creator': 'The Australian National University', + 'display_id': 'can-labor-be-a-long-form-government', + 'duration': 2618, + 'thumbnail': 'https://assets.pippa.io/shows/6113e8578b4903809f16f7e5/1684821529295-515b9520db9ce53275b995eb302f941c.jpeg', + 'title': 'Can Labor be a long-form government?', + 'episode': 'Can Labor be a long-form government?', + 'upload_date': '20230523', + 'series': 'Democracy Sausage with Mark Kenny', + 'timestamp': 1684826362, + 'description': 'md5:feabe1fc5004c78ee59c84a46bf4ba16', + } + }] def _real_extract(self, url): channel, display_id = self._match_valid_url(url).groups() diff --git a/plugin/yt-dlp/yt_dlp/extractor/aenetworks.py b/plugin/yt-dlp/yt_dlp/extractor/aenetworks.py index 50c34b5..ca06ace 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/aenetworks.py +++ b/plugin/yt-dlp/yt_dlp/extractor/aenetworks.py @@ -3,6 +3,8 @@ ExtractorError, GeoRestrictedError, int_or_none, + remove_start, + traverse_obj, 
update_url_query, urlencode_postdata, ) @@ -72,7 +74,14 @@ def _extract_aetn_info(self, domain, filter_key, filter_value, url): requestor_id, brand = self._DOMAIN_MAP[domain] result = self._download_json( 'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand, - filter_value, query={'filter[%s]' % filter_key: filter_value})['results'][0] + filter_value, query={'filter[%s]' % filter_key: filter_value}) + result = traverse_obj( + result, ('results', + lambda k, v: k == 0 and v[filter_key] == filter_value), + get_all=False) + if not result: + raise ExtractorError('Show not found in A&E feed (too new?)', expected=True, + video_id=remove_start(filter_value, '/')) title = result['title'] video_id = result['id'] media_url = result['publicUrl'] @@ -123,7 +132,7 @@ class AENetworksIE(AENetworksBaseIE): 'skip_download': True, }, 'add_ie': ['ThePlatform'], - 'skip': 'This video is only available for users of participating TV providers.', + 'skip': 'Geo-restricted - This content is not available in your location.' 
}, { 'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1', 'info_dict': { @@ -140,6 +149,7 @@ class AENetworksIE(AENetworksBaseIE): 'skip_download': True, }, 'add_ie': ['ThePlatform'], + 'skip': 'This video is only available for users of participating TV providers.', }, { 'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8', 'only_matching': True @@ -303,6 +313,7 @@ def _real_extract(self, url): class HistoryPlayerIE(AENetworksBaseIE): IE_NAME = 'history:player' _VALID_URL = r'https?://(?:www\.)?(?P(?:history|biography)\.com)/player/(?P\d+)' + _TESTS = [] def _real_extract(self, url): domain, video_id = self._match_valid_url(url).groups() diff --git a/plugin/yt-dlp/yt_dlp/extractor/afreecatv.py b/plugin/yt-dlp/yt_dlp/extractor/afreecatv.py index aedbdfc..430bf46 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/afreecatv.py +++ b/plugin/yt-dlp/yt_dlp/extractor/afreecatv.py @@ -76,59 +76,6 @@ class AfreecaTVIE(InfoExtractor): }, }], 'skip': 'Video is gone', - }, { - 'url': 'http://vod.afreecatv.com/PLAYER/STATION/18650793', - 'info_dict': { - 'id': '18650793', - 'ext': 'mp4', - 'title': '오늘은 다르다! 쏘님의 우월한 위아래~ 댄스리액션!', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': '윈아디', - 'uploader_id': 'badkids', - 'duration': 107, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://vod.afreecatv.com/PLAYER/STATION/10481652', - 'info_dict': { - 'id': '10481652', - 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'", - 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', - 'uploader': 'dailyapril', - 'uploader_id': 'dailyapril', - 'duration': 6492, - }, - 'playlist_count': 2, - 'playlist': [{ - 'md5': 'd8b7c174568da61d774ef0203159bf97', - 'info_dict': { - 'id': '20160502_c4c62b9d_174361386_1', - 'ext': 'mp4', - 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!' 
(part 1)", - 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', - 'uploader': 'dailyapril', - 'uploader_id': 'dailyapril', - 'upload_date': '20160502', - 'duration': 3601, - }, - }, { - 'md5': '58f2ce7f6044e34439ab2d50612ab02b', - 'info_dict': { - 'id': '20160502_39e739bb_174361386_2', - 'ext': 'mp4', - 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!' (part 2)", - 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', - 'uploader': 'dailyapril', - 'uploader_id': 'dailyapril', - 'upload_date': '20160502', - 'duration': 2891, - }, - }], - 'params': { - 'skip_download': True, - }, }, { # non standard key 'url': 'http://vod.afreecatv.com/PLAYER/STATION/20515605', @@ -146,8 +93,8 @@ class AfreecaTVIE(InfoExtractor): 'skip_download': True, }, }, { - # PARTIAL_ADULT - 'url': 'http://vod.afreecatv.com/PLAYER/STATION/32028439', + # adult content + 'url': 'https://vod.afreecatv.com/player/97267690', 'info_dict': { 'id': '20180327_27901457_202289533_1', 'ext': 'mp4', @@ -161,16 +108,25 @@ class AfreecaTVIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'expected_warnings': ['adult content'], + 'skip': 'The VOD does not exist', }, { 'url': 'http://www.afreecatv.com/player/Player.swf?szType=szBjId=djleegoon&nStationNo=11273158&nBbsNo=13161095&nTitleNo=36327652', 'only_matching': True, }, { - 'url': 'http://vod.afreecatv.com/PLAYER/STATION/15055030', - 'only_matching': True, - }, { - 'url': 'http://vod.afreecatv.com/player/15055030', - 'only_matching': True, + 'url': 'https://vod.afreecatv.com/player/96753363', + 'info_dict': { + 'id': '20230108_9FF5BEE1_244432674_1', + 'ext': 'mp4', + 'uploader_id': 'rlantnghks', + 'uploader': '페이즈으', + 'duration': 10840, + 'thumbnail': 'http://videoimg.afreecatv.com/php/SnapshotLoad.php?rowKey=20230108_9FF5BEE1_244432674_1_r', + 'upload_date': '20230108', + 'title': '젠지 페이즈', + }, + 'params': { + 'skip_download': True, + }, }] @staticmethod @@ -223,26 +179,21 @@ def _perform_login(self, username, password): def 
_real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - if re.search(r'alert\(["\']This video has been deleted', webpage): - raise ExtractorError( - 'Video %s has been deleted' % video_id, expected=True) - - station_id = self._search_regex( - r'nStationNo\s*=\s*(\d+)', webpage, 'station') - bbs_id = self._search_regex( - r'nBbsNo\s*=\s*(\d+)', webpage, 'bbs') - video_id = self._search_regex( - r'nTitleNo\s*=\s*(\d+)', webpage, 'title', default=video_id) - partial_view = False adult_view = False for _ in range(2): + data = self._download_json( + 'https://api.m.afreecatv.com/station/video/a/view', + video_id, headers={'Referer': url}, data=urlencode_postdata({ + 'nTitleNo': video_id, + 'nApiLevel': 10, + }))['data'] + if traverse_obj(data, ('code', {int})) == -6221: + raise ExtractorError('The VOD does not exist', expected=True) query = { 'nTitleNo': video_id, - 'nStationNo': station_id, - 'nBbsNo': bbs_id, + 'nStationNo': data['station_no'], + 'nBbsNo': data['bbs_no'], } if partial_view: query['partialView'] = 'SKIP_ADULT' diff --git a/plugin/yt-dlp/yt_dlp/extractor/amp.py b/plugin/yt-dlp/yt_dlp/extractor/amp.py index e34c843..b8d1277 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/amp.py +++ b/plugin/yt-dlp/yt_dlp/extractor/amp.py @@ -5,6 +5,7 @@ int_or_none, mimetype2ext, parse_iso8601, + strip_jsonp, unified_timestamp, url_or_none, ) @@ -15,7 +16,7 @@ class AMPIE(InfoExtractor): # XXX: Conventionally, base classes should end with def _extract_feed_info(self, url): feed = self._download_json( url, None, 'Downloading Akamai AMP feed', - 'Unable to download Akamai AMP feed') + 'Unable to download Akamai AMP feed', transform_source=strip_jsonp) item = feed.get('channel', {}).get('item') if not item: raise ExtractorError('%s said: %s' % (self.IE_NAME, feed['error'])) @@ -73,8 +74,10 @@ def get_media_node(name, default=None): media_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', video_id, f4m_id='hds', fatal=False)) 
elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - media_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + fmts, subs = self._extract_m3u8_formats_and_subtitles( + media_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) else: formats.append({ 'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'), diff --git a/plugin/yt-dlp/yt_dlp/extractor/anvato.py b/plugin/yt-dlp/yt_dlp/extractor/anvato.py index 951482f..6c94aa7 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/anvato.py +++ b/plugin/yt-dlp/yt_dlp/extractor/anvato.py @@ -336,7 +336,7 @@ def _get_anvato_videos(self, access_key, video_id, token): elif media_format == 'm3u8-variant' or ext == 'm3u8': # For some videos the initial m3u8 URL returns JSON instead manifest_json = self._download_json( - video_url, video_id, note='Downloading manifest JSON', errnote=False) + video_url, video_id, note='Downloading manifest JSON', fatal=False) if manifest_json: video_url = manifest_json.get('master_m3u8') if not video_url: @@ -392,14 +392,6 @@ def _extract_from_webpage(cls, url, webpage): url = smuggle_url(url, {'token': anvplayer_data['token']}) yield cls.url_result(url, AnvatoIE, video_id) - def _extract_anvato_videos(self, webpage, video_id): - anvplayer_data = self._parse_json( - self._html_search_regex( - self._ANVP_RE, webpage, 'Anvato player data', group='anvp'), - video_id) - return self._get_anvato_videos( - anvplayer_data['accessKey'], anvplayer_data['video'], 'default') # cbslocal token = 'default' - def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) self._initialize_geo_bypass({ diff --git a/plugin/yt-dlp/yt_dlp/extractor/ard.py b/plugin/yt-dlp/yt_dlp/extractor/ard.py index a723517..a326c6b 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/ard.py +++ b/plugin/yt-dlp/yt_dlp/extractor/ard.py @@ -13,6 +13,7 @@ try_get, unified_strdate, unified_timestamp, + update_url, update_url_query, 
url_or_none, xpath_text, @@ -408,6 +409,23 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): (?(playlist)/(?P\d+)?/?(?:[?#]|$))''' _TESTS = [{ + 'url': 'https://www.ardmediathek.de/video/filme-im-mdr/wolfsland-die-traurigen-schwestern/mdr-fernsehen/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy8xZGY0ZGJmZS00ZWQwLTRmMGItYjhhYy0wOGQ4ZmYxNjVhZDI', + 'md5': '3fd5fead7a370a819341129c8d713136', + 'info_dict': { + 'display_id': 'filme-im-mdr/wolfsland-die-traurigen-schwestern/mdr-fernsehen', + 'id': '12172961', + 'title': 'Wolfsland - Die traurigen Schwestern', + 'description': r're:^Als der Polizeiobermeister Raaben', + 'duration': 5241, + 'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:efa186f7b0054957', + 'timestamp': 1670710500, + 'upload_date': '20221210', + 'ext': 'mp4', + 'age_limit': 12, + 'episode': 'Wolfsland - Die traurigen Schwestern', + 'series': 'Filme im MDR' + }, + }, { 'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/', 'md5': 'a1dc75a39c61601b980648f7c9f9f71d', 'info_dict': { @@ -424,7 +442,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): 'skip': 'Error', }, { 'url': 'https://www.ardmediathek.de/video/tagesschau-oder-tagesschau-20-00-uhr/das-erste/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll', - 'md5': 'f1837e563323b8a642a8ddeff0131f51', + 'md5': '1e73ded21cb79bac065117e80c81dc88', 'info_dict': { 'id': '10049223', 'ext': 'mp4', @@ -432,13 +450,11 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): 'timestamp': 1636398000, 'description': 'md5:39578c7b96c9fe50afdf5674ad985e6b', 'upload_date': '20211108', - }, - }, { - 'url': 'https://www.ardmediathek.de/sendung/beforeigners/beforeigners/staffel-1/Y3JpZDovL2Rhc2Vyc3RlLmRlL2JlZm9yZWlnbmVycw/1', - 'playlist_count': 6, - 'info_dict': { - 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL2JlZm9yZWlnbmVycw', - 'title': 'beforeigners/beforeigners/staffel-1', + 'display_id': 
'tagesschau-oder-tagesschau-20-00-uhr/das-erste', + 'duration': 915, + 'episode': 'tagesschau, 20:00 Uhr', + 'series': 'tagesschau', + 'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49', }, }, { 'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', @@ -602,6 +618,9 @@ def _real_extract(self, url): show { title } + image { + src + } synopsis title tracking { @@ -640,6 +659,15 @@ def _real_extract(self, url): 'description': description, 'timestamp': unified_timestamp(player_page.get('broadcastedOn')), 'series': try_get(player_page, lambda x: x['show']['title']), + 'thumbnail': (media_collection.get('_previewImage') + or try_get(player_page, lambda x: update_url(x['image']['src'], query=None, fragment=None)) + or self.get_thumbnail_from_html(display_id, url)), }) info.update(self._ARD_extract_episode_info(info['title'])) return info + + def get_thumbnail_from_html(self, display_id, url): + webpage = self._download_webpage(url, display_id, fatal=False) or '' + return ( + self._og_search_thumbnail(webpage, default=None) + or self._html_search_meta('thumbnailUrl', webpage, default=None)) diff --git a/plugin/yt-dlp/yt_dlp/extractor/bibeltv.py b/plugin/yt-dlp/yt_dlp/extractor/bibeltv.py index bdd7666..f6874e8 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/bibeltv.py +++ b/plugin/yt-dlp/yt_dlp/extractor/bibeltv.py @@ -1,27 +1,197 @@ +from functools import partial + from .common import InfoExtractor +from ..utils import ( + ExtractorError, + clean_html, + determine_ext, + format_field, + int_or_none, + js_to_json, + orderedSet, + parse_iso8601, + traverse_obj, + url_or_none, +) + + +class BibelTVBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['AT', 'CH', 'DE'] + _GEO_BYPASS = False + + API_URL = 'https://www.bibeltv.de/mediathek/api' + AUTH_TOKEN = 'j88bRXY8DsEqJ9xmTdWhrByVi5Hm' + + def _extract_formats_and_subtitles(self, data, crn_id, *, is_live=False): + 
formats = [] + subtitles = {} + for media_url in traverse_obj(data, (..., 'src', {url_or_none})): + media_ext = determine_ext(media_url) + if media_ext == 'm3u8': + m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles( + media_url, crn_id, live=is_live) + formats.extend(m3u8_formats) + subtitles.update(m3u8_subs) + elif media_ext == 'mpd': + mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(media_url, crn_id) + formats.extend(mpd_formats) + subtitles.update(mpd_subs) + elif media_ext == 'mp4': + formats.append({'url': media_url}) + else: + self.report_warning(f'Unknown format {media_ext!r}') + + return formats, subtitles + + @staticmethod + def _extract_base_info(data): + return { + 'id': data['crn'], + **traverse_obj(data, { + 'title': 'title', + 'description': 'description', + 'duration': ('duration', {partial(int_or_none, scale=1000)}), + 'timestamp': ('schedulingStart', {parse_iso8601}), + 'season_number': 'seasonNumber', + 'episode_number': 'episodeNumber', + 'view_count': 'viewCount', + 'like_count': 'likeCount', + }), + 'thumbnails': orderedSet(traverse_obj(data, ('images', ..., { + 'url': ('url', {url_or_none}), + }))), + } + + def _extract_url_info(self, data): + return { + '_type': 'url', + 'url': format_field(data, 'slug', 'https://www.bibeltv.de/mediathek/videos/%s'), + **self._extract_base_info(data), + } + + def _extract_video_info(self, data): + crn_id = data['crn'] + if data.get('drm'): + self.report_drm(crn_id) + + json_data = self._download_json( + format_field(data, 'id', f'{self.API_URL}/video/%s'), crn_id, + headers={'Authorization': self.AUTH_TOKEN}, fatal=False, + errnote='No formats available') or {} + + formats, subtitles = self._extract_formats_and_subtitles( + traverse_obj(json_data, ('video', 'videoUrls', ...)), crn_id) + + return { + '_type': 'video', + **self._extract_base_info(data), + 'formats': formats, + 'subtitles': subtitles, + } + + +class BibelTVVideoIE(BibelTVBaseIE): + IE_DESC = 'BibelTV single 
video' + _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/mediathek/videos/(?P\d+)[\w-]+' + IE_NAME = 'bibeltv:video' -class BibelTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/mediathek/videos/(?:crn/)?(?P\d+)' _TESTS = [{ - 'url': 'https://www.bibeltv.de/mediathek/videos/329703-sprachkurs-in-malaiisch', - 'md5': '252f908192d611de038b8504b08bf97f', + 'url': 'https://www.bibeltv.de/mediathek/videos/344436-alte-wege', + 'md5': 'ec1c07efe54353780512e8a4103b612e', 'info_dict': { - 'id': 'ref:329703', + 'id': '344436', 'ext': 'mp4', - 'title': 'Sprachkurs in Malaiisch', - 'description': 'md5:3e9f197d29ee164714e67351cf737dfe', - 'timestamp': 1608316701, - 'uploader_id': '5840105145001', - 'upload_date': '20201218', - } + 'title': 'Alte Wege', + 'description': 'md5:2f4eb7294c9797a47b8fd13cccca22e9', + 'timestamp': 1677877071, + 'duration': 150.0, + 'upload_date': '20230303', + 'thumbnail': r're:https://bibeltv\.imgix\.net/[\w-]+\.jpg', + 'episode': 'Episode 1', + 'episode_number': 1, + 'view_count': int, + 'like_count': int, + }, + 'params': { + 'format': '6', + }, + }] + + def _real_extract(self, url): + crn_id = self._match_id(url) + video_data = traverse_obj( + self._search_nextjs_data(self._download_webpage(url, crn_id), crn_id), + ('props', 'pageProps', 'videoPageData', 'videos', 0, {dict})) + if not video_data: + raise ExtractorError('Missing video data.') + + return self._extract_video_info(video_data) + + +class BibelTVSeriesIE(BibelTVBaseIE): + IE_DESC = 'BibelTV series playlist' + _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/mediathek/serien/(?P\d+)[\w-]+' + IE_NAME = 'bibeltv:series' + + _TESTS = [{ + 'url': 'https://www.bibeltv.de/mediathek/serien/333485-ein-wunder-fuer-jeden-tag', + 'playlist_mincount': 400, + 'info_dict': { + 'id': '333485', + 'title': 'Ein Wunder für jeden Tag', + 'description': 'Tägliche Kurzandacht mit Déborah Rosenkranz.', + }, + }] + + def _real_extract(self, url): + crn_id = self._match_id(url) + webpage = 
self._download_webpage(url, crn_id) + nextjs_data = self._search_nextjs_data(webpage, crn_id) + series_data = traverse_obj(nextjs_data, ('props', 'pageProps', 'seriePageData', {dict})) + if not series_data: + raise ExtractorError('Missing series data.') + + return self.playlist_result( + traverse_obj(series_data, ('videos', ..., {dict}, {self._extract_url_info})), + crn_id, series_data.get('title'), clean_html(series_data.get('description'))) + + +class BibelTVLiveIE(BibelTVBaseIE): + IE_DESC = 'BibelTV live program' + _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/livestreams/(?P[\w-]+)' + IE_NAME = 'bibeltv:live' + + _TESTS = [{ + 'url': 'https://www.bibeltv.de/livestreams/bibeltv/', + 'info_dict': { + 'id': 'bibeltv', + 'ext': 'mp4', + 'title': 're:Bibel TV', + 'live_status': 'is_live', + 'thumbnail': 'https://streampreview.bibeltv.de/bibeltv.webp', + }, + 'params': {'skip_download': 'm3u8'}, }, { - 'url': 'https://www.bibeltv.de/mediathek/videos/crn/326374', + 'url': 'https://www.bibeltv.de/livestreams/impuls/', 'only_matching': True, }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5840105145001/default_default/index.html?videoId=ref:%s' def _real_extract(self, url): - crn_id = self._match_id(url) - return self.url_result( - self.BRIGHTCOVE_URL_TEMPLATE % crn_id, 'BrightcoveNew') + stream_id = self._match_id(url) + webpage = self._download_webpage(url, stream_id) + stream_data = self._search_json( + r'\\"video\\":', webpage, 'bibeltvData', stream_id, + transform_source=lambda jstring: js_to_json(jstring.replace('\\"', '"'))) + + formats, subtitles = self._extract_formats_and_subtitles( + traverse_obj(stream_data, ('src', ...)), stream_id, is_live=True) + + return { + 'id': stream_id, + 'title': stream_data.get('title'), + 'thumbnail': stream_data.get('poster'), + 'is_live': True, + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/plugin/yt-dlp/yt_dlp/extractor/bilibili.py b/plugin/yt-dlp/yt_dlp/extractor/bilibili.py index 
87951aa..503a9b1 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/bilibili.py +++ b/plugin/yt-dlp/yt_dlp/extractor/bilibili.py @@ -1,7 +1,9 @@ import base64 import functools +import hashlib import itertools import math +import time import urllib.error import urllib.parse @@ -26,6 +28,7 @@ srt_subtitles_timecode, str_or_none, traverse_obj, + try_call, unified_timestamp, unsmuggle_url, url_or_none, @@ -514,19 +517,63 @@ class BilibiliSpaceVideoIE(BilibiliSpaceBaseIE): 'id': '3985676', }, 'playlist_mincount': 178, + }, { + 'url': 'https://space.bilibili.com/313580179/video', + 'info_dict': { + 'id': '313580179', + }, + 'playlist_mincount': 92, }] + def _extract_signature(self, playlist_id): + session_data = self._download_json('https://api.bilibili.com/x/web-interface/nav', playlist_id, fatal=False) + + key_from_url = lambda x: x[x.rfind('/') + 1:].split('.')[0] + img_key = traverse_obj( + session_data, ('data', 'wbi_img', 'img_url', {key_from_url})) or '34478ba821254d9d93542680e3b86100' + sub_key = traverse_obj( + session_data, ('data', 'wbi_img', 'sub_url', {key_from_url})) or '7e16a90d190a4355a78fd00b32a38de6' + + session_key = img_key + sub_key + + signature_values = [] + for position in ( + 46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35, 27, 43, 5, 49, 33, 9, 42, 19, 29, 28, 14, 39, + 12, 38, 41, 13, 37, 48, 7, 16, 24, 55, 40, 61, 26, 17, 0, 1, 60, 51, 30, 4, 22, 25, 54, 21, 56, 59, 6, 63, + 57, 62, 11, 36, 20, 34, 44, 52 + ): + char_at_position = try_call(lambda: session_key[position]) + if char_at_position: + signature_values.append(char_at_position) + + return ''.join(signature_values)[:32] + def _real_extract(self, url): playlist_id, is_video_url = self._match_valid_url(url).group('id', 'video') if not is_video_url: self.to_screen('A channel URL was given. Only the channel\'s videos will be downloaded. 
' 'To download audios, add a "/audio" to the URL') + signature = self._extract_signature(playlist_id) + def fetch_page(page_idx): + query = { + 'keyword': '', + 'mid': playlist_id, + 'order': 'pubdate', + 'order_avoided': 'true', + 'platform': 'web', + 'pn': page_idx + 1, + 'ps': 30, + 'tid': 0, + 'web_location': 1550101, + 'wts': int(time.time()), + } + query['w_rid'] = hashlib.md5(f'{urllib.parse.urlencode(query)}{signature}'.encode()).hexdigest() + try: - response = self._download_json('https://api.bilibili.com/x/space/arc/search', - playlist_id, note=f'Downloading page {page_idx}', - query={'mid': playlist_id, 'pn': page_idx + 1, 'jsonp': 'jsonp'}) + response = self._download_json('https://api.bilibili.com/x/space/wbi/arc/search', + playlist_id, note=f'Downloading page {page_idx}', query=query) except ExtractorError as e: if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 412: raise ExtractorError( @@ -556,9 +603,9 @@ def get_entries(page_data): class BilibiliSpaceAudioIE(BilibiliSpaceBaseIE): _VALID_URL = r'https?://space\.bilibili\.com/(?P\d+)/audio' _TESTS = [{ - 'url': 'https://space.bilibili.com/3985676/audio', + 'url': 'https://space.bilibili.com/313580179/audio', 'info_dict': { - 'id': '3985676', + 'id': '313580179', }, 'playlist_mincount': 1, }] diff --git a/plugin/yt-dlp/yt_dlp/extractor/booyah.py b/plugin/yt-dlp/yt_dlp/extractor/booyah.py deleted file mode 100644 index e2dfff3..0000000 --- a/plugin/yt-dlp/yt_dlp/extractor/booyah.py +++ /dev/null @@ -1,86 +0,0 @@ -from .common import InfoExtractor -from ..utils import int_or_none, str_or_none, traverse_obj - - -class BooyahBaseIE(InfoExtractor): - _BOOYAH_SESSION_KEY = None - - def _real_initialize(self): - BooyahBaseIE._BOOYAH_SESSION_KEY = self._request_webpage( - 'https://booyah.live/api/v3/auths/sessions', None, data=b'').getheader('booyah-session-key') - - def _get_comments(self, video_id): - comment_json = self._download_json( - 
f'https://booyah.live/api/v3/playbacks/{video_id}/comments/tops', video_id, - headers={'Booyah-Session-Key': self._BOOYAH_SESSION_KEY}, fatal=False) or {} - - return [{ - 'id': comment.get('comment_id'), - 'author': comment.get('from_nickname'), - 'author_id': comment.get('from_uid'), - 'author_thumbnail': comment.get('from_thumbnail'), - 'text': comment.get('content'), - 'timestamp': comment.get('create_time'), - 'like_count': comment.get('like_cnt'), - } for comment in comment_json.get('comment_list') or ()] - - -class BooyahClipsIE(BooyahBaseIE): - _VALID_URL = r'https?://booyah.live/clips/(?P\d+)' - _TESTS = [{ - 'url': 'https://booyah.live/clips/13887261322952306617', - 'info_dict': { - 'id': '13887261322952306617', - 'ext': 'mp4', - 'view_count': int, - 'duration': 30, - 'channel_id': 90565760, - 'like_count': int, - 'title': 'Cayendo con estilo 😎', - 'uploader': '♡LɪꜱGΛ​MER​', - 'comment_count': int, - 'uploader_id': '90565760', - 'thumbnail': 'https://resmambet-a.akamaihd.net/mambet-storage/Clip/90565760/90565760-27204374-fba0-409d-9d7b-63a48b5c0e75.jpg', - 'upload_date': '20220617', - 'timestamp': 1655490556, - 'modified_timestamp': 1655490556, - 'modified_date': '20220617', - } - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - json_data = self._download_json( - f'https://booyah.live/api/v3/playbacks/{video_id}', video_id, - headers={'Booyah-Session-key': self._BOOYAH_SESSION_KEY}) - - formats = [] - for video_data in json_data['playback']['endpoint_list']: - formats.extend(({ - 'url': video_data.get('stream_url'), - 'ext': 'mp4', - 'height': video_data.get('resolution'), - }, { - 'url': video_data.get('download_url'), - 'ext': 'mp4', - 'format_note': 'Watermarked', - 'height': video_data.get('resolution'), - 'preference': -10, - })) - - return { - 'id': video_id, - 'title': traverse_obj(json_data, ('playback', 'name')), - 'thumbnail': traverse_obj(json_data, ('playback', 'thumbnail_url')), - 'formats': formats, - 'view_count': 
traverse_obj(json_data, ('playback', 'views')), - 'like_count': traverse_obj(json_data, ('playback', 'likes')), - 'duration': traverse_obj(json_data, ('playback', 'duration')), - 'comment_count': traverse_obj(json_data, ('playback', 'comment_cnt')), - 'channel_id': traverse_obj(json_data, ('playback', 'channel_id')), - 'uploader': traverse_obj(json_data, ('user', 'nickname')), - 'uploader_id': str_or_none(traverse_obj(json_data, ('user', 'uid'))), - 'modified_timestamp': int_or_none(traverse_obj(json_data, ('playback', 'update_time_ms')), 1000), - 'timestamp': int_or_none(traverse_obj(json_data, ('playback', 'create_time_ms')), 1000), - '__post_extractor': self.extract_comments(video_id, self._get_comments(video_id)), - } diff --git a/plugin/yt-dlp/yt_dlp/extractor/bravotv.py b/plugin/yt-dlp/yt_dlp/extractor/bravotv.py index a2a8e90..3e0b15b 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/bravotv.py +++ b/plugin/yt-dlp/yt_dlp/extractor/bravotv.py @@ -1,5 +1,6 @@ from .adobepass import AdobePassIE from ..utils import ( + HEADRequest, extract_attributes, float_or_none, get_element_html_by_class, @@ -153,8 +154,11 @@ def _real_extract(self, url): if len(chapters) == 1 and not traverse_obj(chapters, (0, 'end_time')): chapters = None - formats, subtitles = self._extract_m3u8_formats_and_subtitles( - update_url_query(f'{tp_url}/stream.m3u8', query), video_id, 'mp4', m3u8_id='hls') + m3u8_url = self._request_webpage(HEADRequest( + update_url_query(f'{tp_url}/stream.m3u8', query)), video_id, 'Checking m3u8 URL').geturl() + if 'mpeg_cenc' in m3u8_url: + self.report_drm(video_id) + formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4', m3u8_id='hls') return { 'id': video_id, diff --git a/plugin/yt-dlp/yt_dlp/extractor/camfm.py b/plugin/yt-dlp/yt_dlp/extractor/camfm.py new file mode 100644 index 0000000..a9850f4 --- /dev/null +++ b/plugin/yt-dlp/yt_dlp/extractor/camfm.py @@ -0,0 +1,85 @@ +import re + +from .common import InfoExtractor +from 
..utils import ( + clean_html, + get_element_by_class, + get_elements_by_class, + join_nonempty, + traverse_obj, + unified_timestamp, + urljoin, +) + + +class CamFMShowIE(InfoExtractor): + _VALID_URL = r'https://(?:www\.)?camfm\.co\.uk/shows/(?P[^/]+)' + _TESTS = [{ + 'playlist_mincount': 5, + 'url': 'https://camfm.co.uk/shows/soul-mining/', + 'info_dict': { + 'id': 'soul-mining', + 'thumbnail': 'md5:6a873091f92c936f23bdcce80f75e66a', + 'title': 'Soul Mining', + 'description': 'Telling the stories of jazz, funk and soul from all corners of the world.', + }, + }] + + def _real_extract(self, url): + show_id = self._match_id(url) + page = self._download_webpage(url, show_id) + + return { + '_type': 'playlist', + 'id': show_id, + 'entries': [self.url_result(urljoin('https://camfm.co.uk', i), CamFMEpisodeIE) + for i in re.findall(r"javascript:popup\('(/player/[^']+)', 'listen'", page)], + 'thumbnail': urljoin('https://camfm.co.uk', self._search_regex( + r']+class="thumb-expand"[^>]+src="([^"]+)"', page, 'thumbnail', fatal=False)), + 'title': self._html_search_regex('

([^<]+)

', page, 'title', fatal=False), + 'description': clean_html(get_element_by_class('small-12 medium-8 cell', page)) + } + + +class CamFMEpisodeIE(InfoExtractor): + _VALID_URL = r'https://(?:www\.)?camfm\.co\.uk/player/(?P[^/]+)' + _TESTS = [{ + 'url': 'https://camfm.co.uk/player/43336', + 'skip': 'Episode will expire - don\'t actually know when, but it will go eventually', + 'info_dict': { + 'id': '43336', + 'title': 'AITAA: Am I the Agony Aunt? - 19:00 Tue 16/05/2023', + 'ext': 'mp3', + 'upload_date': '20230516', + 'description': 'md5:f165144f94927c0f1bfa2ee6e6ab7bbf', + 'timestamp': 1684263600, + 'series': 'AITAA: Am I the Agony Aunt?', + 'thumbnail': 'md5:5980a831360d0744c3764551be3d09c1', + 'categories': ['Entertainment'], + } + }] + + def _real_extract(self, url): + episode_id = self._match_id(url) + page = self._download_webpage(url, episode_id) + audios = self._parse_html5_media_entries('https://audio.camfm.co.uk', page, episode_id) + + caption = get_element_by_class('caption', page) + series = clean_html(re.sub(r'', '', caption)) + + card_section = get_element_by_class('card-section', page) + date = self._html_search_regex('>Aired at ([^<]+)<', card_section, 'air date', fatal=False) + + return { + 'id': episode_id, + 'title': join_nonempty(series, date, delim=' - '), + 'formats': traverse_obj(audios, (..., 'formats', ...)), + 'timestamp': unified_timestamp(date), # XXX: Does not account for UK's daylight savings + 'series': series, + 'description': clean_html(re.sub(r'[^<]+]+/>', '', card_section)), + 'thumbnail': urljoin('https://camfm.co.uk', self._search_regex( + r']+class="cover-art"[^>]+style="[^"]+url\(\'([^\']+)', + page, 'thumbnail', fatal=False)), + 'categories': get_elements_by_class('label', caption), + 'was_live': True, + } diff --git a/plugin/yt-dlp/yt_dlp/extractor/canvas.py b/plugin/yt-dlp/yt_dlp/extractor/canvas.py deleted file mode 100644 index b835444..0000000 --- a/plugin/yt-dlp/yt_dlp/extractor/canvas.py +++ /dev/null @@ -1,383 +0,0 @@ 
-import json - - -from .common import InfoExtractor -from .gigya import GigyaBaseIE -from ..compat import compat_HTTPError -from ..utils import ( - ExtractorError, - clean_html, - extract_attributes, - float_or_none, - get_element_by_class, - int_or_none, - merge_dicts, - str_or_none, - strip_or_none, - url_or_none, - urlencode_postdata -) - - -class CanvasIE(InfoExtractor): - _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?Pcanvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P[^/?#&]+)' - _TESTS = [{ - 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'md5': '37b2b7bb9b3dcaa05b67058dc3a714a9', - 'info_dict': { - 'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'ext': 'mp4', - 'title': 'Nachtwacht: De Greystook', - 'description': 'Nachtwacht: De Greystook', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1468.02, - }, - 'expected_warnings': ['is not a supported codec'], - }, { - 'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', - 'only_matching': True, - }] - _GEO_BYPASS = False - _HLS_ENTRY_PROTOCOLS_MAP = { - 'HLS': 'm3u8_native', - 'HLS_AES': 'm3u8_native', - } - _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v2' - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - site_id, video_id = mobj.group('site_id'), mobj.group('id') - - data = None - if site_id != 'vrtvideo': - # Old API endpoint, serves more formats but may fail for some videos - data = self._download_json( - 'https://mediazone.vrt.be/api/v1/%s/assets/%s' - % (site_id, video_id), video_id, 'Downloading asset JSON', - 'Unable to download asset JSON', fatal=False) - - # New API endpoint - if not data: - vrtnutoken = self._download_json('https://token.vrt.be/refreshtoken', - video_id, note='refreshtoken: Retrieve vrtnutoken', - errnote='refreshtoken 
failed')['vrtnutoken'] - headers = self.geo_verification_headers() - headers.update({'Content-Type': 'application/json; charset=utf-8'}) - vrtPlayerToken = self._download_json( - '%s/tokens' % self._REST_API_BASE, video_id, - 'Downloading token', headers=headers, data=json.dumps({ - 'identityToken': vrtnutoken - }).encode('utf-8'))['vrtPlayerToken'] - data = self._download_json( - '%s/videos/%s' % (self._REST_API_BASE, video_id), - video_id, 'Downloading video JSON', query={ - 'vrtPlayerToken': vrtPlayerToken, - 'client': 'null', - }, expected_status=400) - if 'title' not in data: - code = data.get('code') - if code == 'AUTHENTICATION_REQUIRED': - self.raise_login_required() - elif code == 'INVALID_LOCATION': - self.raise_geo_restricted(countries=['BE']) - raise ExtractorError(data.get('message') or code, expected=True) - - # Note: The title may be an empty string - title = data['title'] or f'{site_id} {video_id}' - description = data.get('description') - - formats = [] - subtitles = {} - for target in data['targetUrls']: - format_url, format_type = url_or_none(target.get('url')), str_or_none(target.get('type')) - if not format_url or not format_type: - continue - format_type = format_type.upper() - if format_type in self._HLS_ENTRY_PROTOCOLS_MAP: - fmts, subs = self._extract_m3u8_formats_and_subtitles( - format_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[format_type], - m3u8_id=format_type, fatal=False) - formats.extend(fmts) - subtitles = self._merge_subtitles(subtitles, subs) - elif format_type == 'HDS': - formats.extend(self._extract_f4m_formats( - format_url, video_id, f4m_id=format_type, fatal=False)) - elif format_type == 'MPEG_DASH': - fmts, subs = self._extract_mpd_formats_and_subtitles( - format_url, video_id, mpd_id=format_type, fatal=False) - formats.extend(fmts) - subtitles = self._merge_subtitles(subtitles, subs) - elif format_type == 'HSS': - fmts, subs = self._extract_ism_formats_and_subtitles( - format_url, video_id, ism_id='mss', 
fatal=False) - formats.extend(fmts) - subtitles = self._merge_subtitles(subtitles, subs) - else: - formats.append({ - 'format_id': format_type, - 'url': format_url, - }) - - subtitle_urls = data.get('subtitleUrls') - if isinstance(subtitle_urls, list): - for subtitle in subtitle_urls: - subtitle_url = subtitle.get('url') - if subtitle_url and subtitle.get('type') == 'CLOSED': - subtitles.setdefault('nl', []).append({'url': subtitle_url}) - - return { - 'id': video_id, - 'display_id': video_id, - 'title': title, - 'description': description, - 'formats': formats, - 'duration': float_or_none(data.get('duration'), 1000), - 'thumbnail': data.get('posterImageUrl'), - 'subtitles': subtitles, - } - - -class CanvasEenIE(InfoExtractor): - IE_DESC = 'canvas.be and een.be' - _VALID_URL = r'https?://(?:www\.)?(?Pcanvas|een)\.be/(?:[^/]+/)*(?P[^/?#&]+)' - _TESTS = [{ - 'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week', - 'md5': 'ed66976748d12350b118455979cca293', - 'info_dict': { - 'id': 'mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', - 'display_id': 'de-afspraak-veilt-voor-de-warmste-week', - 'ext': 'flv', - 'title': 'De afspraak veilt voor de Warmste Week', - 'description': 'md5:24cb860c320dc2be7358e0e5aa317ba6', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 49.02, - }, - 'expected_warnings': ['is not a supported codec'], - }, { - # with subtitles - 'url': 'http://www.canvas.be/video/panorama/2016/pieter-0167', - 'info_dict': { - 'id': 'mz-ast-5240ff21-2d30-4101-bba6-92b5ec67c625', - 'display_id': 'pieter-0167', - 'ext': 'mp4', - 'title': 'Pieter 0167', - 'description': 'md5:943cd30f48a5d29ba02c3a104dc4ec4e', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 2553.08, - 'subtitles': { - 'nl': [{ - 'ext': 'vtt', - }], - }, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Pagina niet gevonden', - }, { - 'url': 'https://www.een.be/thuis/emma-pakt-thilly-aan', - 'info_dict': { - 'id': 
'md-ast-3a24ced2-64d7-44fb-b4ed-ed1aafbf90b8', - 'display_id': 'emma-pakt-thilly-aan', - 'ext': 'mp4', - 'title': 'Emma pakt Thilly aan', - 'description': 'md5:c5c9b572388a99b2690030afa3f3bad7', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 118.24, - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['is not a supported codec'], - }, { - 'url': 'https://www.canvas.be/check-point/najaar-2016/de-politie-uw-vriend', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - site_id, display_id = mobj.group('site_id'), mobj.group('id') - - webpage = self._download_webpage(url, display_id) - - title = strip_or_none(self._search_regex( - r']+class="video__body__header__title"[^>]*>(.+?)', - webpage, 'title', default=None) or self._og_search_title( - webpage, default=None)) - - video_id = self._html_search_regex( - r'data-video=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'video id', - group='id') - - return { - '_type': 'url_transparent', - 'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (site_id, video_id), - 'ie_key': CanvasIE.ie_key(), - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': self._og_search_description(webpage), - } - - -class VrtNUIE(GigyaBaseIE): - IE_DESC = 'VrtNU.be' - _VALID_URL = r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?:[^/]+/){2}(?P[^/?#&]+)' - _TESTS = [{ - # Available via old API endpoint - 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1989/postbus-x-s1989a1/', - 'info_dict': { - 'id': 'pbs-pub-e8713dac-899e-41de-9313-81269f4c04ac$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de', - 'ext': 'mp4', - 'title': 'Postbus X - Aflevering 1 (Seizoen 1989)', - 'description': 'md5:b704f669eb9262da4c55b33d7c6ed4b7', - 'duration': 1457.04, - 'thumbnail': r're:^https?://.*\.jpg$', - 'series': 'Postbus X', - 'season': 'Seizoen 1989', - 'season_number': 1989, - 'episode': 'De zwarte weduwe', - 'episode_number': 1, - 'timestamp': 1595822400, - 'upload_date': '20200727', 
- }, - 'skip': 'This video is only available for registered users', - 'expected_warnings': ['is not a supported codec'], - }, { - # Only available via new API endpoint - 'url': 'https://www.vrt.be/vrtnu/a-z/kamp-waes/1/kamp-waes-s1a5/', - 'info_dict': { - 'id': 'pbs-pub-0763b56c-64fb-4d38-b95b-af60bf433c71$vid-ad36a73c-4735-4f1f-b2c0-a38e6e6aa7e1', - 'ext': 'mp4', - 'title': 'Aflevering 5', - 'description': 'Wie valt door de mand tijdens een missie?', - 'duration': 2967.06, - 'season': 'Season 1', - 'season_number': 1, - 'episode_number': 5, - }, - 'skip': 'This video is only available for registered users', - 'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'], - }] - _NETRC_MACHINE = 'vrtnu' - _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy' - _CONTEXT_ID = 'R3595707040' - - def _perform_login(self, username, password): - auth_info = self._gigya_login({ - 'APIKey': self._APIKEY, - 'targetEnv': 'jssdk', - 'loginID': username, - 'password': password, - 'authMode': 'cookie', - }) - - if auth_info.get('errorDetails'): - raise ExtractorError('Unable to login: VrtNU said: ' + auth_info.get('errorDetails'), expected=True) - - # Sometimes authentication fails for no good reason, retry - login_attempt = 1 - while login_attempt <= 3: - try: - self._request_webpage('https://token.vrt.be/vrtnuinitlogin', - None, note='Requesting XSRF Token', errnote='Could not get XSRF Token', - query={'provider': 'site', 'destination': 'https://www.vrt.be/vrtnu/'}) - - post_data = { - 'UID': auth_info['UID'], - 'UIDSignature': auth_info['UIDSignature'], - 'signatureTimestamp': auth_info['signatureTimestamp'], - '_csrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value, - } - - self._request_webpage( - 'https://login.vrt.be/perform_login', - None, note='Performing login', errnote='perform login failed', - headers={}, query={ - 'client_id': 'vrtnu-site' - }, data=urlencode_postdata(post_data)) - - 
except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - login_attempt += 1 - self.report_warning('Authentication failed') - self._sleep(1, None, msg_template='Waiting for %(timeout)s seconds before trying again') - else: - raise e - else: - break - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - attrs = extract_attributes(self._search_regex( - r'(]+>)', webpage, 'media element')) - video_id = attrs['videoid'] - publication_id = attrs.get('publicationid') - if publication_id: - video_id = publication_id + '$' + video_id - - page = (self._parse_json(self._search_regex( - r'digitalData\s*=\s*({.+?});', webpage, 'digial data', - default='{}'), video_id, fatal=False) or {}).get('page') or {} - - info = self._search_json_ld(webpage, display_id, default={}) - return merge_dicts(info, { - '_type': 'url_transparent', - 'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id, - 'ie_key': CanvasIE.ie_key(), - 'id': video_id, - 'display_id': display_id, - 'season_number': int_or_none(page.get('episode_season')), - }) - - -class DagelijkseKostIE(InfoExtractor): - IE_DESC = 'dagelijksekost.een.be' - _VALID_URL = r'https?://dagelijksekost\.een\.be/gerechten/(?P[^/?#&]+)' - _TEST = { - 'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof', - 'md5': '30bfffc323009a3e5f689bef6efa2365', - 'info_dict': { - 'id': 'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa', - 'display_id': 'hachis-parmentier-met-witloof', - 'ext': 'mp4', - 'title': 'Hachis parmentier met witloof', - 'description': 'md5:9960478392d87f63567b5b117688cdc5', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 283.02, - }, - 'expected_warnings': ['is not a supported codec'], - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - title = strip_or_none(get_element_by_class( - 
'dish-metadata__title', webpage - ) or self._html_search_meta( - 'twitter:title', webpage)) - - description = clean_html(get_element_by_class( - 'dish-description', webpage) - ) or self._html_search_meta( - ('description', 'twitter:description', 'og:description'), - webpage) - - video_id = self._html_search_regex( - r'data-url=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'video id', - group='id') - - return { - '_type': 'url_transparent', - 'url': 'https://mediazone.vrt.be/api/v1/dako/assets/%s' % video_id, - 'ie_key': CanvasIE.ie_key(), - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - } diff --git a/plugin/yt-dlp/yt_dlp/extractor/cbc.py b/plugin/yt-dlp/yt_dlp/extractor/cbc.py index 22161d1..a6a7123 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/cbc.py +++ b/plugin/yt-dlp/yt_dlp/extractor/cbc.py @@ -351,7 +351,9 @@ def _find_secret_formats(self, formats, video_id): def _real_extract(self, url): video_id = self._match_id(url) - video_info = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/assets/' + video_id, video_id) + video_info = self._download_json( + f'https://services.radio-canada.ca/ott/cbc-api/v2/assets/{video_id}', + video_id, expected_status=426) email, password = self._get_login_info() if email and password: @@ -426,7 +428,7 @@ def _real_extract(self, url): match = self._match_valid_url(url) season_id = match.group('id') show = match.group('show') - show_info = self._download_json(self._API_BASE + show, season_id) + show_info = self._download_json(self._API_BASE + show, season_id, expected_status=426) season = int(match.group('season')) season_info = next((s for s in show_info['seasons'] if s.get('season') == season), None) diff --git a/plugin/yt-dlp/yt_dlp/extractor/cbslocal.py b/plugin/yt-dlp/yt_dlp/extractor/cbslocal.py deleted file mode 100644 index 0c904fe..0000000 --- a/plugin/yt-dlp/yt_dlp/extractor/cbslocal.py +++ /dev/null @@ -1,116 +0,0 @@ -from .anvato import AnvatoIE -from .sendtonews 
import SendtoNewsIE -from ..compat import compat_urlparse -from ..utils import ( - parse_iso8601, - unified_timestamp, -) - - -class CBSLocalIE(AnvatoIE): # XXX: Do not subclass from concrete IE - _VALID_URL_BASE = r'https?://[a-z]+\.cbslocal\.com/' - _VALID_URL = _VALID_URL_BASE + r'video/(?P\d+)' - - _TESTS = [{ - 'url': 'http://newyork.cbslocal.com/video/3580809-a-very-blue-anniversary/', - 'info_dict': { - 'id': '3580809', - 'ext': 'mp4', - 'title': 'A Very Blue Anniversary', - 'description': 'CBS2’s Cindy Hsu has more.', - 'thumbnail': 're:^https?://.*', - 'timestamp': int, - 'upload_date': r're:^\d{8}$', - 'uploader': 'CBS', - 'subtitles': { - 'en': 'mincount:5', - }, - 'categories': [ - 'Stations\\Spoken Word\\WCBSTV', - 'Syndication\\AOL', - 'Syndication\\MSN', - 'Syndication\\NDN', - 'Syndication\\Yahoo', - 'Content\\News', - 'Content\\News\\Local News', - ], - 'tags': ['CBS 2 News Weekends', 'Cindy Hsu', 'Blue Man Group'], - }, - 'params': { - 'skip_download': True, - }, - }] - - def _real_extract(self, url): - mcp_id = self._match_id(url) - return self.url_result( - 'anvato:anvato_cbslocal_app_web_prod_547f3e49241ef0e5d30c79b2efbca5d92c698f67:' + mcp_id, 'Anvato', mcp_id) - - -class CBSLocalArticleIE(AnvatoIE): # XXX: Do not subclass from concrete IE - _VALID_URL = CBSLocalIE._VALID_URL_BASE + r'\d+/\d+/\d+/(?P[0-9a-z-]+)' - - _TESTS = [{ - # Anvato backend - 'url': 'http://losangeles.cbslocal.com/2016/05/16/safety-advocates-say-fatal-car-seat-failures-are-public-health-crisis', - 'md5': 'f0ee3081e3843f575fccef901199b212', - 'info_dict': { - 'id': '3401037', - 'ext': 'mp4', - 'title': 'Safety Advocates Say Fatal Car Seat Failures Are \'Public Health Crisis\'', - 'description': 'Collapsing seats have been the focus of scrutiny for decades, though experts say remarkably little has been done to address the issue. 
Randy Paige reports.', - 'thumbnail': 're:^https?://.*', - 'timestamp': 1463440500, - 'upload_date': '20160516', - 'uploader': 'CBS', - 'subtitles': { - 'en': 'mincount:5', - }, - 'categories': [ - 'Stations\\Spoken Word\\KCBSTV', - 'Syndication\\MSN', - 'Syndication\\NDN', - 'Syndication\\AOL', - 'Syndication\\Yahoo', - 'Syndication\\Tribune', - 'Syndication\\Curb.tv', - 'Content\\News' - ], - 'tags': ['CBS 2 News Evening'], - }, - }, { - # SendtoNews embed - 'url': 'http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/', - 'info_dict': { - 'id': 'GxfCe0Zo7D-175909-5588', - }, - 'playlist_count': 9, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - sendtonews_url = SendtoNewsIE._extract_url(webpage) - if sendtonews_url: - return self.url_result( - compat_urlparse.urljoin(url, sendtonews_url), - ie=SendtoNewsIE.ie_key()) - - info_dict = self._extract_anvato_videos(webpage, display_id) - - timestamp = unified_timestamp(self._html_search_regex( - r'class="(?:entry|post)-date"[^>]*>([^<]+)', webpage, - 'released date', default=None)) or parse_iso8601( - self._html_search_meta('uploadDate', webpage)) - - info_dict.update({ - 'display_id': display_id, - 'timestamp': timestamp, - }) - - return info_dict diff --git a/plugin/yt-dlp/yt_dlp/extractor/cbsnews.py b/plugin/yt-dlp/yt_dlp/extractor/cbsnews.py index 847a31d..fc56ccf 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/cbsnews.py +++ b/plugin/yt-dlp/yt_dlp/extractor/cbsnews.py @@ -1,36 +1,153 @@ +import base64 import re +import urllib.error +import urllib.parse import zlib +from .anvato import AnvatoIE from .common import InfoExtractor -from .cbs import CBSIE -from ..compat import ( - compat_b64decode, - compat_urllib_parse_unquote, -) +from .paramountplus import ParamountPlusIE from ..utils import ( + ExtractorError, + 
HEADRequest, + UserNotLive, + determine_ext, + float_or_none, + format_field, + int_or_none, + make_archive_id, + mimetype2ext, parse_duration, + smuggle_url, + traverse_obj, + url_or_none, ) -class CBSNewsEmbedIE(CBSIE): # XXX: Do not subclass from concrete IE +class CBSNewsBaseIE(InfoExtractor): + _LOCALES = { + 'atlanta': None, + 'baltimore': 'BAL', + 'boston': 'BOS', + 'chicago': 'CHI', + 'colorado': 'DEN', + 'detroit': 'DET', + 'losangeles': 'LA', + 'miami': 'MIA', + 'minnesota': 'MIN', + 'newyork': 'NY', + 'philadelphia': 'PHI', + 'pittsburgh': 'PIT', + 'sacramento': 'SAC', + 'sanfrancisco': 'SF', + 'texas': 'DAL', + } + _LOCALE_RE = '|'.join(map(re.escape, _LOCALES)) + _ANVACK = '5VD6Eyd6djewbCmNwBFnsJj17YAvGRwl' + + def _get_item(self, webpage, display_id): + return traverse_obj(self._search_json( + r'CBSNEWS\.defaultPayload\s*=', webpage, 'payload', display_id, + default={}), ('items', 0, {dict})) or {} + + def _get_video_url(self, item): + return traverse_obj(item, 'video', 'video2', expected_type=url_or_none) + + def _extract_playlist(self, webpage, playlist_id): + entries = [self.url_result(embed_url, CBSNewsEmbedIE) for embed_url in re.findall( + r']+data-src="(https?://(?:www\.)?cbsnews\.com/embed/video/[^#]*#[^"]+)"', webpage)] + if entries: + return self.playlist_result( + entries, playlist_id, self._html_search_meta(['og:title', 'twitter:title'], webpage), + self._html_search_meta(['og:description', 'twitter:description', 'description'], webpage)) + + def _extract_video(self, item, video_url, video_id): + if mimetype2ext(item.get('format'), default=determine_ext(video_url)) == 'mp4': + formats = [{'url': video_url, 'ext': 'mp4'}] + + else: + manifest = self._download_webpage(video_url, video_id, note='Downloading m3u8 information') + + anvato_id = self._search_regex(r'anvato-(\d+)', manifest, 'Anvato ID', default=None) + # Prefer Anvato if available; cbsnews.com m3u8 formats are re-encoded from Anvato source + if anvato_id: + return 
self.url_result( + smuggle_url(f'anvato:{self._ANVACK}:{anvato_id}', {'token': 'default'}), + AnvatoIE, url_transparent=True, _old_archive_ids=[make_archive_id(self, anvato_id)]) + + formats, _ = self._parse_m3u8_formats_and_subtitles( + manifest, video_url, 'mp4', m3u8_id='hls', video_id=video_id) + + def get_subtitles(subs_url): + return { + 'en': [{ + 'url': subs_url, + 'ext': 'dfxp', # TTAF1 + }], + } if url_or_none(subs_url) else None + + episode_meta = traverse_obj(item, { + 'season_number': ('season', {int_or_none}), + 'episode_number': ('episode', {int_or_none}), + }) if item.get('isFullEpisode') else {} + + return { + 'id': video_id, + 'formats': formats, + **traverse_obj(item, { + 'title': (None, ('fulltitle', 'title')), + 'description': 'dek', + 'timestamp': ('timestamp', {lambda x: float_or_none(x, 1000)}), + 'duration': ('duration', {float_or_none}), + 'subtitles': ('captions', {get_subtitles}), + 'thumbnail': ('images', ('hd', 'sd'), {url_or_none}), + 'is_live': ('type', {lambda x: x == 'live'}), + }, get_all=False), + **episode_meta, + } + + +class CBSNewsEmbedIE(CBSNewsBaseIE): IE_NAME = 'cbsnews:embed' _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/embed/video[^#]*#(?P.+)' _TESTS = [{ 'url': 
'https://www.cbsnews.com/embed/video/?v=1.c9b5b61492913d6660db0b2f03579ef25e86307a#1Vb7b9s2EP5XBAHbT6Gt98PAMKTJ0se6LVjWYWtdGBR1stlIpEBSTtwi%2F%2FvuJNkNhmHdGxgM2NL57vjd6zt%2B8PngdN%2Fyg79qeGvhzN%2FLGrS%2F%2BuBLB531V28%2B%2BO7Qg7%2Fy97r2z3xZ42NW8yLhDbA0S0KWlHnIijwKWJBHZZnHBa8Cgbpdf%2F89NM9Hi9fXifhpr8sr%2FlP848tn%2BTdXycX25zh4cdX%2FvHl6PmmPqnWQv9w8Ed%2B9GjYRim07bFEqdG%2BZVHuwTm65A7bVRrYtR5lAyMox7pigF6W4k%2By91mjspGsJ%2BwVae4%2BsvdnaO1p73HkXs%2FVisUDTGm7R8IcdnOROeq%2B19qT1amhA1VJtPenoTUgrtfKc9m7Rq8dP7nnjwOB7wg7ADdNt7VX64DWAWlKhPtmDEq22g4GF99x6Dk9E8OSsankHXqPNKDxC%2FdK7MLKTircTDgsI3mmj4OBdSq64dy7fd1x577RU1rt4cvMtOaulFYOd%2FLewRWvDO9lIgXFpZSnkZmjbv5SxKTPoQXClFbpsf%2Fhbbpzs0IB3vb8KkyzJQ%2BywOAgCrMpgRrz%2BKk4fvb7kFbR4XJCu0gAdtNO7woCwZTu%2BBUs9bam%2Fds71drVerpeisgrubLjAB4nnOSkWQnfr5W6o1ku5Xpr1MgrCbL0M0vUyDtfLLK15WiYp47xKWSLyjFVpwVmVJSLIoCjSOFkv3W7oKsVliwZJcB9nwXpZ5GEQQwY8jNKqKCBrgjTLeFxgdCIpazojDgnRtn43J6kG7nZ6cAbxh0EeFFk4%2B1u867cY5u4344n%2FxXjCqAjucdTHgLKojNKmSfO8KRsOFY%2FzKEYCKEJBzv90QA9nfm9gL%2BHulaFqUkz9ULUYxl62B3U%2FRVNLA8IhggaPycOoBuwOCESciDQVSSUgiOMsROB%2FhKfwCKOzEk%2B4k6rWd4uuT%2FwTDz7K7t3d3WLO8ISD95jSPQbayBacthbz86XVgxHwhex5zawzgDOmtp%2F3GPcXn0VXHdSS029%2Fj99UC%2FwJUvyKQ%2FzKyixIEVlYJOn4RxxuaH43Ty9fbJ5OObykHH435XAzJTHeOF4hhEUXD8URe%2FQ%2FBT%2BMpf8d5GN02Ox%2FfiGsl7TA7POu1xZ5%2BbTzcAVKMe48mqcC21hkacVEVScM26liVVBnrKkC4CLKyzAvHu0lhEaTKMFwI3a4SN9MsrfYzdBLq2vkwRD1gVviLT8kY9h2CHH6Y%2Bix6609weFtey4ESp60WtyeWMy%2BsmBuhsoKIyuoT%2Bq2R%2FrW5qi3g%2FvzS2j40DoixDP8%2BKP0yUdpXJ4l6Vla%2Bg9vce%2BC4yM5YlUcbA%2F0jLKdpmTwvsdN5z88nAIe08%2F0HgxeG1iv%2B6Hlhjh7uiW0SDzYNI92L401uha3JKYk268UVRzdOzNQvAaJqoXzAc80dAV440NZ1WVVAAMRYQ2KrGJFmDUsq8saWSnjvIj8t78y%2FRa3JRnbHVfyFpfwoDiGpPgjzekyUiKNlU3OMlwuLMmzgvEojllYVE2Z1HhImvsnk%2BuhusTEoB21PAtSFodeFK3iYhXEH9WOG2%2FkOE833sfeG%2Ff5cfHtEFNXgYes0%2FXj7aGivUgJ9XpusCtoNcNYVVnJVrrDo0OmJAutHCpuZul4W9lLcfy7BnuLPT02%2ByXsCTk%2B9zhzswIN04YueNSK%2BPtM0jS88QdLqSLJDTLsuGZJNolm2yO0PXh3UPnz9Ix5bfIAqxPjvETQsDCEiPG4QbqNyhBZISxybLnZYCrW5H3Axp690%2F0BJdXtDZ5ITuM4xj3f4oU
HGzc5JeJmZKpp%2FjwKh4wMV%2FV1yx3emLoR0MwbG4K%2F%2BZgVep3PnzXGDHZ6a3i%2Fk%2BJrONDN13%2Bnq6tBTYk4o7cLGhBtqCC4KwacGHpEVuoH5JNro%2FE6JfE6d5RydbiR76k%2BW5wioDHBIjw1euhHjUGRB0y5A97KoaPx6MlL%2BwgboUVtUFRI%2FLemgTpdtF59ii7pab08kuPcfWzs0l%2FRI5takWnFpka0zOgWRtYcuf9aIxZMxlwr6IiGpsb6j2DQUXPl%2FimXI599Ev7fWjoPD78A', - 'only_matching': True, + 'info_dict': { + 'id': '6ZP4cXvo9FaX3VLH7MF4CgY30JFpY_GA', + 'ext': 'mp4', + 'title': 'Cops investigate gorilla incident at Cincinnati Zoo', + 'description': 'md5:fee7441ab8aaeb3c693482394738102b', + 'duration': 350, + 'timestamp': 1464719713, + 'upload_date': '20160531', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': {'skip_download': 'm3u8'}, }] def _real_extract(self, url): - item = self._parse_json(zlib.decompress(compat_b64decode( - compat_urllib_parse_unquote(self._match_id(url))), - -zlib.MAX_WBITS).decode('utf-8'), None)['video']['items'][0] - return self._extract_video_info(item['mpxRefId'], 'cbsnews') + item = traverse_obj(self._parse_json(zlib.decompress(base64.b64decode( + urllib.parse.unquote(self._match_id(url))), + -zlib.MAX_WBITS).decode(), None), ('video', 'items', 0, {dict})) or {} + video_id = item['mpxRefId'] + video_url = self._get_video_url(item) + if not video_url: + # Old embeds redirect user to ParamountPlus but most links are 404 + pplus_url = f'https://www.paramountplus.com/shows/video/{video_id}' + try: + self._request_webpage(HEADRequest(pplus_url), video_id) + return self.url_result(pplus_url, ParamountPlusIE) + except ExtractorError: + self.raise_no_formats('This video is no longer available', True, video_id) -class CBSNewsIE(CBSIE): # XXX: Do not subclass from concrete IE + return self._extract_video(item, video_url, video_id) + + +class CBSNewsIE(CBSNewsBaseIE): IE_NAME = 'cbsnews' IE_DESC = 'CBS News' - _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|video)/(?P[\da-z_-]+)' + _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|video)/(?P[\w-]+)' _TESTS = [ { @@ -47,10 +164,7 @@ class 
CBSNewsIE(CBSIE): # XXX: Do not subclass from concrete IE 'timestamp': 1476046464, 'upload_date': '20161009', }, - 'params': { - # rtmp download - 'skip_download': True, - }, + 'skip': 'This video is no longer available', }, { 'url': 'https://www.cbsnews.com/video/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/', @@ -61,48 +175,234 @@ class CBSNewsIE(CBSIE): # XXX: Do not subclass from concrete IE 'description': 'md5:4a6983e480542d8b333a947bfc64ddc7', 'upload_date': '20140404', 'timestamp': 1396650660, - 'uploader': 'CBSI-NEW', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 205, 'subtitles': { 'en': [{ - 'ext': 'ttml', + 'ext': 'dfxp', }], }, }, 'params': { - # m3u8 download - 'skip_download': True, + 'skip_download': 'm3u8', }, }, { # 48 hours 'url': 'http://www.cbsnews.com/news/maria-ridulph-murder-will-the-nations-oldest-cold-case-to-go-to-trial-ever-get-solved/', 'info_dict': { + 'id': 'maria-ridulph-murder-will-the-nations-oldest-cold-case-to-go-to-trial-ever-get-solved', 'title': 'Cold as Ice', 'description': 'Can a childhood memory solve the 1957 murder of 7-year-old Maria Ridulph?', }, 'playlist_mincount': 7, }, + { + 'url': 'https://www.cbsnews.com/video/032823-cbs-evening-news/', + 'info_dict': { + 'id': '_2wuO7hD9LwtyM_TwSnVwnKp6kxlcXgE', + 'ext': 'mp4', + 'title': 'CBS Evening News, March 28, 2023', + 'description': 'md5:db20615aae54adc1d55a1fd69dc75d13', + 'duration': 1189, + 'timestamp': 1680042600, + 'upload_date': '20230328', + 'season': 'Season 2023', + 'season_number': 2023, + 'episode': 'Episode 83', + 'episode_number': 83, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, ] def _real_extract(self, url): display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + playlist = self._extract_playlist(webpage, display_id) + if playlist: + return playlist + item = self._get_item(webpage, display_id) + video_id = item.get('mpxRefId') or display_id + 
video_url = self._get_video_url(item) + if not video_url: + self.raise_no_formats('No video content was found', expected=True, video_id=video_id) + + return self._extract_video(item, video_url, video_id) + + +class CBSLocalBaseIE(CBSNewsBaseIE): + def _real_extract(self, url): + display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - entries = [] - for embed_url in re.findall(r']+data-src="(https?://(?:www\.)?cbsnews\.com/embed/video/[^#]*#[^"]+)"', webpage): - entries.append(self.url_result(embed_url, CBSNewsEmbedIE.ie_key())) - if entries: - return self.playlist_result( - entries, playlist_title=self._html_search_meta(['og:title', 'twitter:title'], webpage), - playlist_description=self._html_search_meta(['og:description', 'twitter:description', 'description'], webpage)) + item = self._get_item(webpage, display_id) + video_id = item.get('mpxRefId') or display_id + anvato_id = None + video_url = self._get_video_url(item) + + if not video_url: + anv_params = self._search_regex( + r']+\bdata-src="https?://w3\.mp\.lura\.live/player/prod/v3/anvload\.html\?key=([^"]+)"', + webpage, 'Anvato URL', default=None) + + if not anv_params: + playlist = self._extract_playlist(webpage, display_id) + if playlist: + return playlist + self.raise_no_formats('No video content was found', expected=True, video_id=video_id) + + anv_data = self._parse_json(base64.urlsafe_b64decode(f'{anv_params}===').decode(), video_id) + anvato_id = anv_data['v'] + return self.url_result( + smuggle_url(f'anvato:{anv_data.get("anvack") or self._ANVACK}:{anvato_id}', { + 'token': anv_data.get('token') or 'default', + }), AnvatoIE, url_transparent=True, _old_archive_ids=[make_archive_id(self, anvato_id)]) + + return self._extract_video(item, video_url, video_id) + - item = self._parse_json(self._html_search_regex( - r'CBSNEWS\.defaultPayload\s*=\s*({.+})', - webpage, 'video JSON info'), display_id)['items'][0] - return self._extract_video_info(item['mpxRefId'], 'cbsnews') +class 
CBSLocalIE(CBSLocalBaseIE): + _VALID_URL = rf'https?://(?:www\.)?cbsnews\.com/(?:{CBSNewsBaseIE._LOCALE_RE})/(?:live/)?video/(?P[\w-]+)' + _TESTS = [{ + # Anvato video via defaultPayload JSON + 'url': 'https://www.cbsnews.com/newyork/video/1st-cannabis-dispensary-opens-in-queens/', + 'info_dict': { + 'id': '6376747', + 'ext': 'mp4', + 'title': '1st cannabis dispensary opens in Queens', + 'description': 'The dispensary is women-owned and located in Jamaica.', + 'uploader': 'CBS', + 'duration': 20, + 'timestamp': 1680193657, + 'upload_date': '20230330', + 'categories': ['Stations\\Spoken Word\\WCBSTV', 'Content\\Google', 'Content\\News', 'Content\\News\\Local News'], + 'tags': 'count:11', + 'thumbnail': 're:^https?://.*', + '_old_archive_ids': ['cbslocal 6376747'], + }, + 'params': {'skip_download': 'm3u8'}, + }, { + # cbsnews.com video via defaultPayload JSON + 'url': 'https://www.cbsnews.com/newyork/live/video/20230330171655-the-city-is-sounding-the-alarm-on-dangerous-social-media-challenges/', + 'info_dict': { + 'id': 'sJqfw7YvgSC6ant2zVmzt3y1jYKoL5J3', + 'ext': 'mp4', + 'title': 'the city is sounding the alarm on dangerous social media challenges', + 'description': 'md5:8eccc9b1b73be5138a52e9c4350d2cd6', + 'thumbnail': 'https://images-cbsn.cbsnews.com/prod/2023/03/30/story_22509622_1680196925.jpg', + 'duration': 41.0, + 'timestamp': 1680196615, + 'upload_date': '20230330', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + +class CBSLocalArticleIE(CBSLocalBaseIE): + _VALID_URL = rf'https?://(?:www\.)?cbsnews\.com/(?:{CBSNewsBaseIE._LOCALE_RE})/news/(?P[\w-]+)' + _TESTS = [{ + # Anvato video via iframe embed + 'url': 'https://www.cbsnews.com/newyork/news/mta-station-agents-leaving-their-booths-to-provide-more-direct-customer-service/', + 'playlist_count': 2, + 'info_dict': { + 'id': 'mta-station-agents-leaving-their-booths-to-provide-more-direct-customer-service', + 'title': 'MTA station agents begin leaving their booths to provide more direct customer service', 
+ 'description': 'The more than 2,200 agents will provide face-to-face customer service to passengers.', + }, + }, { + 'url': 'https://www.cbsnews.com/losangeles/news/safety-advocates-say-fatal-car-seat-failures-are-public-health-crisis/', + 'md5': 'f0ee3081e3843f575fccef901199b212', + 'info_dict': { + 'id': '3401037', + 'ext': 'mp4', + 'title': 'Safety Advocates Say Fatal Car Seat Failures Are \'Public Health Crisis\'', + 'thumbnail': 're:^https?://.*', + 'timestamp': 1463440500, + 'upload_date': '20160516', + }, + 'skip': 'Video has been removed', + }] + + +class CBSNewsLiveBaseIE(CBSNewsBaseIE): + def _get_id(self, url): + raise NotImplementedError('This method must be implemented by subclasses') + + def _real_extract(self, url): + video_id = self._get_id(url) + if not video_id: + raise ExtractorError('Livestream is not available', expected=True) + + data = traverse_obj(self._download_json( + 'https://feeds-cbsn.cbsnews.com/2.0/rundown/', video_id, query={ + 'partner': 'cbsnsite', + 'edition': video_id, + 'type': 'live', + }), ('navigation', 'data', 0, {dict})) + + video_url = traverse_obj(data, (('videoUrlDAI', ('videoUrl', 'base')), {url_or_none}), get_all=False) + if not video_url: + raise UserNotLive(video_id=video_id) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_url, video_id, 'mp4', m3u8_id='hls') + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'is_live': True, + **traverse_obj(data, { + 'title': 'headline', + 'description': 'rundown_slug', + 'thumbnail': ('images', 'thumbnail_url_hd', {url_or_none}), + }), + } + + +class CBSLocalLiveIE(CBSNewsLiveBaseIE): + _VALID_URL = rf'https?://(?:www\.)?cbsnews\.com/(?P{CBSNewsBaseIE._LOCALE_RE})/live/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://www.cbsnews.com/losangeles/live/', + 'info_dict': { + 'id': 'CBSN-LA', + 'ext': 'mp4', + 'title': str, + 'description': r're:KCBS/CBSN_LA.CRISPIN.\w+.RUNDOWN \w+ \w+', + 'thumbnail': r're:^https?://.*\.jpg$', + 
'live_status': 'is_live', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _get_id(self, url): + return format_field(self._LOCALES, self._match_id(url), 'CBSN-%s') + + +class CBSNewsLiveIE(CBSNewsLiveBaseIE): + IE_NAME = 'cbsnews:live' + IE_DESC = 'CBS News Livestream' + _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://www.cbsnews.com/live/', + 'info_dict': { + 'id': 'CBSN-US', + 'ext': 'mp4', + 'title': str, + 'description': r're:\w+ \w+ CRISPIN RUNDOWN', + 'thumbnail': r're:^https?://.*\.jpg$', + 'live_status': 'is_live', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _get_id(self, url): + return 'CBSN-US' class CBSNewsLiveVideoIE(InfoExtractor): @@ -111,7 +411,7 @@ class CBSNewsLiveVideoIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/video/(?P[^/?#]+)' # Live videos get deleted soon. See http://www.cbsnews.com/live/ for the latest examples - _TEST = { + _TESTS = [{ 'url': 'http://www.cbsnews.com/live/video/clinton-sanders-prepare-to-face-off-in-nh/', 'info_dict': { 'id': 'clinton-sanders-prepare-to-face-off-in-nh', @@ -120,7 +420,7 @@ class CBSNewsLiveVideoIE(InfoExtractor): 'duration': 334, }, 'skip': 'Video gone', - } + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -131,13 +431,13 @@ def _real_extract(self, url): 'dvr_slug': display_id, }) - formats = self._extract_akamai_formats(video_info['url'], display_id) - return { 'id': display_id, 'display_id': display_id, - 'title': video_info['headline'], - 'thumbnail': video_info.get('thumbnail_url_hd') or video_info.get('thumbnail_url_sd'), - 'duration': parse_duration(video_info.get('segmentDur')), - 'formats': formats, + 'formats': self._extract_akamai_formats(video_info['url'], display_id), + **traverse_obj(video_info, { + 'title': 'headline', + 'thumbnail': ('thumbnail_url_hd', {url_or_none}), + 'duration': ('segmentDur', {parse_duration}), + }), } diff --git 
a/plugin/yt-dlp/yt_dlp/extractor/ciscowebex.py b/plugin/yt-dlp/yt_dlp/extractor/ciscowebex.py index 14d940c..a4505aa 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/ciscowebex.py +++ b/plugin/yt-dlp/yt_dlp/extractor/ciscowebex.py @@ -49,7 +49,7 @@ def _real_extract(self, url): 'https://%s.webex.com/webappng/api/v1/recordings/%s/stream' % (subdomain, video_id), video_id, headers=headers, query={'siteurl': siteurl}, expected_status=(403, 429)) - if urlh.status == 403: + if urlh.getcode() == 403: if stream['code'] == 53004: self.raise_login_required() if stream['code'] == 53005: @@ -59,7 +59,7 @@ def _real_extract(self, url): 'This video is protected by a password, use the --video-password option', expected=True) raise ExtractorError(f'{self.IE_NAME} said: {stream["code"]} - {stream["message"]}', expected=True) - if urlh.status == 429: + if urlh.getcode() == 429: self.raise_login_required( f'{self.IE_NAME} asks you to solve a CAPTCHA. Solve CAPTCHA in browser and', method='cookies') diff --git a/plugin/yt-dlp/yt_dlp/extractor/comedycentral.py b/plugin/yt-dlp/yt_dlp/extractor/comedycentral.py index 3d7912a..3fde65c 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/comedycentral.py +++ b/plugin/yt-dlp/yt_dlp/extractor/comedycentral.py @@ -2,7 +2,7 @@ class ComedyCentralIE(MTVServicesInfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cc\.com/(?:episodes|video(?:-clips)?|collection-playlist)/(?P[0-9a-z]{6})' + _VALID_URL = r'https?://(?:www\.)?cc\.com/(?:episodes|video(?:-clips)?|collection-playlist|movies)/(?P[0-9a-z]{6})' _FEED_URL = 'http://comedycentral.com/feeds/mrss/' _TESTS = [{ @@ -25,6 +25,9 @@ class ComedyCentralIE(MTVServicesInfoExtractor): }, { 'url': 'https://www.cc.com/collection-playlist/cosnej/stand-up-specials/t6vtjb', 'only_matching': True, + }, { + 'url': 'https://www.cc.com/movies/tkp406/a-cluesterfuenke-christmas', + 'only_matching': True, }] diff --git a/plugin/yt-dlp/yt_dlp/extractor/common.py b/plugin/yt-dlp/yt_dlp/extractor/common.py index 261ecfd..2dab438 
100644 --- a/plugin/yt-dlp/yt_dlp/extractor/common.py +++ b/plugin/yt-dlp/yt_dlp/extractor/common.py @@ -13,9 +13,11 @@ import os import random import re +import subprocess import sys import time import types +import urllib.error import urllib.parse import urllib.request import xml.etree.ElementTree @@ -34,6 +36,7 @@ GeoUtils, HEADRequest, LenientJSONDecoder, + Popen, RegexNotFoundError, RetryManager, UnsupportedError, @@ -56,6 +59,7 @@ join_nonempty, js_to_json, mimetype2ext, + netrc_from_content, network_exceptions, orderedSet, parse_bitrate, @@ -286,6 +290,7 @@ class InfoExtractor: channel_id: Id of the channel. channel_url: Full URL to a channel webpage. channel_follower_count: Number of followers of the channel. + channel_is_verified: Whether the channel is verified on the platform. location: Physical location where the video was filmed. subtitles: The available subtitles as a dictionary in the format {tag: subformats}. "tag" is usually a language code, and @@ -314,6 +319,11 @@ class InfoExtractor: * "author" - human-readable name of the comment author * "author_id" - user ID of the comment author * "author_thumbnail" - The thumbnail of the comment author + * "author_url" - The url to the comment author's page + * "author_is_verified" - Whether the author is verified + on the platform + * "author_is_uploader" - Whether the comment is made by + the video uploader * "id" - Comment ID * "html" - Comment as HTML * "text" - Plain text of the comment @@ -325,8 +335,8 @@ class InfoExtractor: * "dislike_count" - Number of negative ratings of the comment * "is_favorited" - Whether the comment is marked as favorite by the video uploader - * "author_is_uploader" - Whether the comment is made by - the video uploader + * "is_pinned" - Whether the comment is pinned to + the top of the comments age_limit: Age restriction for the video, as an integer (years) webpage_url: The URL to the video webpage, if given to yt-dlp it should allow to get the same result again. 
(It will be set @@ -350,6 +360,10 @@ class InfoExtractor: * "start_time" - The start time of the chapter in seconds * "end_time" - The end time of the chapter in seconds * "title" (optional, string) + heatmap: A list of dictionaries, with the following entries: + * "start_time" - The start time of the data point in seconds + * "end_time" - The end time of the data point in seconds + * "value" - The normalized value of the data point (float between 0 and 1) playable_in_embed: Whether this video is allowed to play in embedded players on other sites. Can be True (=always allowed), False (=never allowed), None (=unknown), or a string @@ -525,7 +539,7 @@ class InfoExtractor: _EMBED_REGEX = [] def _login_hint(self, method=NO_DEFAULT, netrc=None): - password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials' + password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials' return { None: '', 'any': f'Use --cookies, --cookies-from-browser, {password_hint}', @@ -1281,45 +1295,48 @@ def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=Tr return clean_html(res) def _get_netrc_login_info(self, netrc_machine=None): - username = None - password = None netrc_machine = netrc_machine or self._NETRC_MACHINE - if self.get_param('usenetrc', False): - try: - netrc_file = compat_expanduser(self.get_param('netrc_location') or '~') - if os.path.isdir(netrc_file): - netrc_file = os.path.join(netrc_file, '.netrc') - info = netrc.netrc(file=netrc_file).authenticators(netrc_machine) - if info is not None: - username = info[0] - password = info[2] - else: - raise netrc.NetrcParseError( - 'No authenticators for %s' % netrc_machine) - except (OSError, netrc.NetrcParseError) as err: - self.report_warning( - 'parsing .netrc: %s' % error_to_compat_str(err)) + cmd = self.get_param('netrc_cmd') + if cmd: + cmd = cmd.replace('{}', netrc_machine) + 
self.to_screen(f'Executing command: {cmd}') + stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE) + if ret != 0: + raise OSError(f'Command returned error code {ret}') + info = netrc_from_content(stdout).authenticators(netrc_machine) + + elif self.get_param('usenetrc', False): + netrc_file = compat_expanduser(self.get_param('netrc_location') or '~') + if os.path.isdir(netrc_file): + netrc_file = os.path.join(netrc_file, '.netrc') + info = netrc.netrc(netrc_file).authenticators(netrc_machine) - return username, password + else: + return None, None + if not info: + raise netrc.NetrcParseError(f'No authenticators for {netrc_machine}') + return info[0], info[2] def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None): """ Get the login info as (username, password) First look for the manually specified credentials using username_option and password_option as keys in params dictionary. If no such credentials - available look in the netrc file using the netrc_machine or _NETRC_MACHINE - value. + are available try the netrc_cmd if it is defined or look in the + netrc file using the netrc_machine or _NETRC_MACHINE value. 
If there's no info available, return (None, None) """ - # Attempt to use provided username and password or .netrc data username = self.get_param(username_option) if username is not None: password = self.get_param(password_option) else: - username, password = self._get_netrc_login_info(netrc_machine) - + try: + username, password = self._get_netrc_login_info(netrc_machine) + except (OSError, netrc.NetrcParseError) as err: + self.report_warning(f'Failed to parse .netrc: {err}') + return None, None return username, password def _get_tfa_info(self, note='two-factor verification code'): @@ -3440,7 +3457,7 @@ def _set_cookie(self, domain, name, value, expire_time=None, port=None, def _get_cookies(self, url): """ Return a http.cookies.SimpleCookie with the cookies for the url """ - return LenientSimpleCookie(self._downloader._calc_cookies(url)) + return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url)) def _apply_first_set_cookie_header(self, url_handle, cookie): """ diff --git a/plugin/yt-dlp/yt_dlp/extractor/crtvg.py b/plugin/yt-dlp/yt_dlp/extractor/crtvg.py new file mode 100644 index 0000000..1aa8d77 --- /dev/null +++ b/plugin/yt-dlp/yt_dlp/extractor/crtvg.py @@ -0,0 +1,34 @@ +from .common import InfoExtractor +from ..utils import remove_end + + +class CrtvgIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?crtvg\.es/tvg/a-carta/[^/#?]+-(?P\d+)' + _TESTS = [{ + 'url': 'https://www.crtvg.es/tvg/a-carta/os-caimans-do-tea-5839623', + 'md5': 'c0958d9ff90e4503a75544358758921d', + 'info_dict': { + 'id': '5839623', + 'title': 'Os caimáns do Tea', + 'ext': 'mp4', + 'description': 'md5:f71cfba21ae564f0a6f415b31de1f842', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'params': {'skip_download': 'm3u8'} + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_url = self._search_regex(r'var\s+url\s*=\s*["\']([^"\']+)', webpage, 'video url') + formats = 
self._extract_m3u8_formats(video_url + '/playlist.m3u8', video_id, fatal=False) + formats.extend(self._extract_mpd_formats(video_url + '/manifest.mpd', video_id, fatal=False)) + + return { + 'id': video_id, + 'formats': formats, + 'title': remove_end(self._html_search_meta( + ['og:title', 'twitter:title'], webpage, 'title', default=None), ' | CRTVG'), + 'description': self._html_search_meta('description', webpage, 'description', default=None), + 'thumbnail': self._html_search_meta(['og:image', 'twitter:image'], webpage, 'thumbnail', default=None), + } diff --git a/plugin/yt-dlp/yt_dlp/extractor/crunchyroll.py b/plugin/yt-dlp/yt_dlp/extractor/crunchyroll.py index 4a82170..ca3c0e2 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/crunchyroll.py +++ b/plugin/yt-dlp/yt_dlp/extractor/crunchyroll.py @@ -1,28 +1,37 @@ import base64 -import urllib.parse +import urllib.error from .common import InfoExtractor from ..utils import ( ExtractorError, float_or_none, format_field, + int_or_none, join_nonempty, + parse_age_limit, + parse_count, parse_iso8601, qualities, + remove_start, + time_seconds, traverse_obj, - try_get, + url_or_none, + urlencode_postdata, ) class CrunchyrollBaseIE(InfoExtractor): - _LOGIN_URL = 'https://www.crunchyroll.com/welcome/login' + _BASE_URL = 'https://www.crunchyroll.com' _API_BASE = 'https://api.crunchyroll.com' _NETRC_MACHINE = 'crunchyroll' - params = None + _AUTH_HEADERS = None + _API_ENDPOINT = None + _BASIC_AUTH = None + _QUERY = {} @property def is_logged_in(self): - return self._get_cookies(self._LOGIN_URL).get('etp_rt') + return self._get_cookies(self._BASE_URL).get('etp_rt') def _perform_login(self, username, password): if self.is_logged_in: @@ -35,7 +44,7 @@ def _perform_login(self, username, password): 'device_id': 'whatvalueshouldbeforweb', 'device_type': 'com.crunchyroll.static', 'access_token': 'giKq5eY27ny3cqz', - 'referer': self._LOGIN_URL + 'referer': f'{self._BASE_URL}/welcome/login' }) if upsell_response['code'] != 'ok': raise 
ExtractorError('Could not get session id') @@ -43,66 +52,164 @@ def _perform_login(self, username, password): login_response = self._download_json( f'{self._API_BASE}/login.1.json', None, 'Logging in', - data=urllib.parse.urlencode({ + data=urlencode_postdata({ 'account': username, 'password': password, 'session_id': session_id - }).encode('ascii')) + })) if login_response['code'] != 'ok': raise ExtractorError('Login failed. Server message: %s' % login_response['message'], expected=True) if not self.is_logged_in: raise ExtractorError('Login succeeded but did not set etp_rt cookie') - def _get_embedded_json(self, webpage, display_id): - initial_state = self._parse_json(self._search_regex( - r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'initial state'), display_id) - app_config = self._parse_json(self._search_regex( - r'__APP_CONFIG__\s*=\s*({.+?})\s*;', webpage, 'app config'), display_id) - return initial_state, app_config - - def _get_params(self, lang): - if not CrunchyrollBaseIE.params: - if self._get_cookies(f'https://www.crunchyroll.com/{lang}').get('etp_rt'): - grant_type, key = 'etp_rt_cookie', 'accountAuthClientId' + def _update_query(self, lang): + if lang in CrunchyrollBaseIE._QUERY: + return + + webpage = self._download_webpage( + f'{self._BASE_URL}/{lang}', None, note=f'Retrieving main page (lang={lang or None})') + + initial_state = self._search_json(r'__INITIAL_STATE__\s*=', webpage, 'initial state', None) + CrunchyrollBaseIE._QUERY[lang] = traverse_obj(initial_state, { + 'locale': ('localization', 'locale'), + }) or None + + if CrunchyrollBaseIE._BASIC_AUTH: + return + + app_config = self._search_json(r'__APP_CONFIG__\s*=', webpage, 'app config', None) + cx_api_param = app_config['cxApiParams']['accountAuthClientId' if self.is_logged_in else 'anonClientId'] + self.write_debug(f'Using cxApiParam={cx_api_param}') + CrunchyrollBaseIE._BASIC_AUTH = 'Basic ' + base64.b64encode(f'{cx_api_param}:'.encode()).decode() + + def _update_auth(self): + if 
CrunchyrollBaseIE._AUTH_HEADERS and CrunchyrollBaseIE._AUTH_REFRESH > time_seconds(): + return + + assert CrunchyrollBaseIE._BASIC_AUTH, '_update_query needs to be called at least one time beforehand' + grant_type = 'etp_rt_cookie' if self.is_logged_in else 'client_id' + auth_response = self._download_json( + f'{self._BASE_URL}/auth/v1/token', None, note=f'Authenticating with grant_type={grant_type}', + headers={'Authorization': CrunchyrollBaseIE._BASIC_AUTH}, data=f'grant_type={grant_type}'.encode()) + + CrunchyrollBaseIE._AUTH_HEADERS = {'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token']} + CrunchyrollBaseIE._AUTH_REFRESH = time_seconds(seconds=traverse_obj(auth_response, ('expires_in', {float_or_none}), default=300) - 10) + + def _call_base_api(self, endpoint, internal_id, lang, note=None, query={}): + self._update_query(lang) + self._update_auth() + + if not endpoint.startswith('/'): + endpoint = f'/{endpoint}' + + return self._download_json( + f'{self._BASE_URL}{endpoint}', internal_id, note or f'Calling API: {endpoint}', + headers=CrunchyrollBaseIE._AUTH_HEADERS, query={**CrunchyrollBaseIE._QUERY[lang], **query}) + + def _call_api(self, path, internal_id, lang, note='api', query={}): + if not path.startswith(f'/content/v2/{self._API_ENDPOINT}/'): + path = f'/content/v2/{self._API_ENDPOINT}/{path}' + + try: + result = self._call_base_api( + path, internal_id, lang, f'Downloading {note} JSON ({self._API_ENDPOINT})', query=query) + except ExtractorError as error: + if isinstance(error.cause, urllib.error.HTTPError) and error.cause.code == 404: + return None + raise + + if not result: + raise ExtractorError(f'Unexpected response when downloading {note} JSON') + return result + + def _extract_formats(self, stream_response, display_id=None): + requested_formats = self._configuration_arg('format') or ['adaptive_hls'] + available_formats = {} + for stream_type, streams in traverse_obj( + stream_response, (('streams', ('data', 0)), 
{dict.items}, ...)): + if stream_type not in requested_formats: + continue + for stream in traverse_obj(streams, lambda _, v: v['url']): + hardsub_lang = stream.get('hardsub_locale') or '' + format_id = join_nonempty(stream_type, format_field(stream, 'hardsub_locale', 'hardsub-%s')) + available_formats[hardsub_lang] = (stream_type, format_id, hardsub_lang, stream['url']) + + requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])] + if '' in available_formats and 'all' not in requested_hardsubs: + full_format_langs = set(requested_hardsubs) + self.to_screen( + 'To get all formats of a hardsub language, use ' + '"--extractor-args crunchyrollbeta:hardsub=". ' + 'See https://github.com/yt-dlp/yt-dlp#crunchyrollbeta-crunchyroll for more info', + only_once=True) + else: + full_format_langs = set(map(str.lower, available_formats)) + + audio_locale = traverse_obj(stream_response, ((None, 'meta'), 'audio_locale'), get_all=False) + hardsub_preference = qualities(requested_hardsubs[::-1]) + formats = [] + for stream_type, format_id, hardsub_lang, stream_url in available_formats.values(): + if stream_type.endswith('hls'): + if hardsub_lang.lower() in full_format_langs: + adaptive_formats = self._extract_m3u8_formats( + stream_url, display_id, 'mp4', m3u8_id=format_id, + fatal=False, note=f'Downloading {format_id} HLS manifest') + else: + adaptive_formats = (self._m3u8_meta_format(stream_url, ext='mp4', m3u8_id=format_id),) + elif stream_type.endswith('dash'): + adaptive_formats = self._extract_mpd_formats( + stream_url, display_id, mpd_id=format_id, + fatal=False, note=f'Downloading {format_id} MPD manifest') else: - grant_type, key = 'client_id', 'anonClientId' - - initial_state, app_config = self._get_embedded_json(self._download_webpage( - f'https://www.crunchyroll.com/{lang}', None, note='Retrieving main page'), None) - api_domain = app_config['cxApiParams']['apiDomain'].replace('beta.crunchyroll.com', 
'www.crunchyroll.com') - - auth_response = self._download_json( - f'{api_domain}/auth/v1/token', None, note=f'Authenticating with grant_type={grant_type}', - headers={ - 'Authorization': 'Basic ' + str(base64.b64encode(('%s:' % app_config['cxApiParams'][key]).encode('ascii')), 'ascii') - }, data=f'grant_type={grant_type}'.encode('ascii')) - policy_response = self._download_json( - f'{api_domain}/index/v2', None, note='Retrieving signed policy', - headers={ - 'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token'] - }) - cms = policy_response.get('cms_web') - bucket = cms['bucket'] - params = { - 'Policy': cms['policy'], - 'Signature': cms['signature'], - 'Key-Pair-Id': cms['key_pair_id'] + self.report_warning(f'Encountered unknown stream_type: {stream_type!r}', display_id, only_once=True) + continue + for f in adaptive_formats: + if f.get('acodec') != 'none': + f['language'] = audio_locale + f['quality'] = hardsub_preference(hardsub_lang.lower()) + formats.extend(adaptive_formats) + + return formats + + def _extract_subtitles(self, data): + subtitles = {} + + for locale, subtitle in traverse_obj(data, ((None, 'meta'), 'subtitles', {dict.items}, ...)): + subtitles[locale] = [traverse_obj(subtitle, {'url': 'url', 'ext': 'format'})] + + return subtitles + + +class CrunchyrollCmsBaseIE(CrunchyrollBaseIE): + _API_ENDPOINT = 'cms' + _CMS_EXPIRY = None + + def _call_cms_api_signed(self, path, internal_id, lang, note='api'): + if not CrunchyrollCmsBaseIE._CMS_EXPIRY or CrunchyrollCmsBaseIE._CMS_EXPIRY <= time_seconds(): + response = self._call_base_api('index/v2', None, lang, 'Retrieving signed policy')['cms_web'] + CrunchyrollCmsBaseIE._CMS_QUERY = { + 'Policy': response['policy'], + 'Signature': response['signature'], + 'Key-Pair-Id': response['key_pair_id'], } - locale = traverse_obj(initial_state, ('localization', 'locale')) - if locale: - params['locale'] = locale - CrunchyrollBaseIE.params = (api_domain, bucket, params) - return 
CrunchyrollBaseIE.params + CrunchyrollCmsBaseIE._CMS_BUCKET = response['bucket'] + CrunchyrollCmsBaseIE._CMS_EXPIRY = parse_iso8601(response['expires']) - 10 + + if not path.startswith('/cms/v2'): + path = f'/cms/v2{CrunchyrollCmsBaseIE._CMS_BUCKET}/{path}' + return self._call_base_api( + path, internal_id, lang, f'Downloading {note} JSON (signed cms)', query=CrunchyrollCmsBaseIE._CMS_QUERY) -class CrunchyrollBetaIE(CrunchyrollBaseIE): + +class CrunchyrollBetaIE(CrunchyrollCmsBaseIE): IE_NAME = 'crunchyroll' _VALID_URL = r'''(?x) - https?://(?:beta|www)\.crunchyroll\.com/ + https?://(?:beta\.|www\.)?crunchyroll\.com/ (?P(?:\w{2}(?:-\w{2})?/)?) - watch/(?P\w+) - (?:/(?P[\w-]+))?/?(?:[?#]|$)''' + watch/(?!concert|musicvideo)(?P\w+)''' _TESTS = [{ + # Premium only 'url': 'https://www.crunchyroll.com/watch/GY2P1Q98Y/to-the-future', 'info_dict': { 'id': 'GY2P1Q98Y', @@ -119,11 +226,15 @@ class CrunchyrollBetaIE(CrunchyrollBaseIE): 'season_number': 1, 'episode': 'To the Future', 'episode_number': 73, - 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg$', + 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', 'chapters': 'count:2', + 'age_limit': 14, + 'like_count': int, + 'dislike_count': int, }, 'params': {'skip_download': 'm3u8', 'format': 'all[format_id~=hardsub]'}, }, { + # Premium only 'url': 'https://www.crunchyroll.com/watch/GYE5WKQGR', 'info_dict': { 'id': 'GYE5WKQGR', @@ -131,7 +242,7 @@ class CrunchyrollBetaIE(CrunchyrollBaseIE): 'duration': 366.459, 'timestamp': 1476788400, 'description': 'md5:74b67283ffddd75f6e224ca7dc031e76', - 'title': 'SHELTER Episode – Porter Robinson presents Shelter the Animation', + 'title': 'SHELTER – Porter Robinson presents Shelter the Animation', 'upload_date': '20161018', 'series': 'SHELTER', 'series_id': 'GYGG09WWY', @@ -140,11 +251,58 @@ class CrunchyrollBetaIE(CrunchyrollBaseIE): 'season_number': 1, 'episode': 'Porter Robinson presents Shelter the Animation', 'episode_number': 0, - 'thumbnail': 
r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg$', - 'chapters': 'count:0', + 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'age_limit': 14, + 'like_count': int, + 'dislike_count': int, }, 'params': {'skip_download': True}, - 'skip': 'Video is Premium only', + }, { + 'url': 'https://www.crunchyroll.com/watch/GJWU2VKK3/cherry-blossom-meeting-and-a-coming-blizzard', + 'info_dict': { + 'id': 'GJWU2VKK3', + 'ext': 'mp4', + 'duration': 1420.054, + 'description': 'md5:2d1c67c0ec6ae514d9c30b0b99a625cd', + 'title': 'The Ice Guy and His Cool Female Colleague Episode 1 – Cherry Blossom Meeting and a Coming Blizzard', + 'series': 'The Ice Guy and His Cool Female Colleague', + 'series_id': 'GW4HM75NP', + 'season': 'The Ice Guy and His Cool Female Colleague', + 'season_id': 'GY9PC21VE', + 'season_number': 1, + 'episode': 'Cherry Blossom Meeting and a Coming Blizzard', + 'episode_number': 1, + 'chapters': 'count:2', + 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'timestamp': 1672839000, + 'upload_date': '20230104', + 'age_limit': 14, + 'like_count': int, + 'dislike_count': int, + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.crunchyroll.com/watch/GM8F313NQ', + 'info_dict': { + 'id': 'GM8F313NQ', + 'ext': 'mp4', + 'title': 'Garakowa -Restore the World-', + 'description': 'md5:8d2f8b6b9dd77d87810882e7d2ee5608', + 'duration': 3996.104, + 'age_limit': 13, + 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.crunchyroll.com/watch/G62PEZ2E6', + 'info_dict': { + 'id': 'G62PEZ2E6', + 'description': 'md5:8d2f8b6b9dd77d87810882e7d2ee5608', + 'age_limit': 13, + 'duration': 65.138, + 'title': 'Garakowa -Restore the World-', + }, + 'playlist_mincount': 5, }, { 'url': 'https://www.crunchyroll.com/watch/GY2P1Q98Y', 'only_matching': True, @@ -152,125 +310,147 @@ class CrunchyrollBetaIE(CrunchyrollBaseIE): 'url': 
'https://beta.crunchyroll.com/pt-br/watch/G8WUN8VKP/the-ruler-of-conspiracy', 'only_matching': True, }] + # We want to support lazy playlist filtering and movie listings cannot be inside a playlist + _RETURN_TYPE = 'video' def _real_extract(self, url): - lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id') - api_domain, bucket, params = self._get_params(lang) + lang, internal_id = self._match_valid_url(url).group('lang', 'id') - episode_response = self._download_json( - f'{api_domain}/cms/v2{bucket}/episodes/{internal_id}', display_id, - note='Retrieving episode metadata', query=params) - if episode_response.get('is_premium_only') and not bucket.endswith('crunchyroll'): - if self.is_logged_in: - raise ExtractorError('This video is for premium members only', expected=True) - else: - self.raise_login_required('This video is for premium members only') + # We need to use unsigned API call to allow ratings query string + response = traverse_obj(self._call_api( + f'objects/{internal_id}', internal_id, lang, 'object info', {'ratings': 'true'}), ('data', 0, {dict})) + if not response: + raise ExtractorError(f'No video with id {internal_id} could be found (possibly region locked?)', expected=True) - stream_response = self._download_json( - f'{api_domain}{episode_response["__links__"]["streams"]["href"]}', display_id, - note='Retrieving stream info', query=params) - get_streams = lambda name: (traverse_obj(stream_response, name) or {}).items() + object_type = response.get('type') + if object_type == 'episode': + result = self._transform_episode_response(response) - requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])] - hardsub_preference = qualities(requested_hardsubs[::-1]) - requested_formats = self._configuration_arg('format') or ['adaptive_hls'] + elif object_type == 'movie': + result = self._transform_movie_response(response) - available_formats = {} - for stream_type, 
streams in get_streams('streams'): - if stream_type not in requested_formats: - continue - for stream in streams.values(): - if not stream.get('url'): - continue - hardsub_lang = stream.get('hardsub_locale') or '' - format_id = join_nonempty(stream_type, format_field(stream, 'hardsub_locale', 'hardsub-%s')) - available_formats[hardsub_lang] = (stream_type, format_id, hardsub_lang, stream['url']) + elif object_type == 'movie_listing': + first_movie_id = traverse_obj(response, ('movie_listing_metadata', 'first_movie_id')) + if not self._yes_playlist(internal_id, first_movie_id): + return self.url_result(f'{self._BASE_URL}/{lang}watch/{first_movie_id}', CrunchyrollBetaIE, first_movie_id) + + def entries(): + movies = self._call_api(f'movie_listings/{internal_id}/movies', internal_id, lang, 'movie list') + for movie_response in traverse_obj(movies, ('data', ...)): + yield self.url_result( + f'{self._BASE_URL}/{lang}watch/{movie_response["id"]}', + CrunchyrollBetaIE, **self._transform_movie_response(movie_response)) + + return self.playlist_result(entries(), **self._transform_movie_response(response)) - if '' in available_formats and 'all' not in requested_hardsubs: - full_format_langs = set(requested_hardsubs) - self.to_screen( - 'To get all formats of a hardsub language, use ' - '"--extractor-args crunchyrollbeta:hardsub=". 
' - 'See https://github.com/yt-dlp/yt-dlp#crunchyrollbeta-crunchyroll for more info', - only_once=True) else: - full_format_langs = set(map(str.lower, available_formats)) + raise ExtractorError(f'Unknown object type {object_type}') - formats = [] - for stream_type, format_id, hardsub_lang, stream_url in available_formats.values(): - if stream_type.endswith('hls'): - if hardsub_lang.lower() in full_format_langs: - adaptive_formats = self._extract_m3u8_formats( - stream_url, display_id, 'mp4', m3u8_id=format_id, - fatal=False, note=f'Downloading {format_id} HLS manifest') - else: - adaptive_formats = (self._m3u8_meta_format(stream_url, ext='mp4', m3u8_id=format_id),) - elif stream_type.endswith('dash'): - adaptive_formats = self._extract_mpd_formats( - stream_url, display_id, mpd_id=format_id, - fatal=False, note=f'Downloading {format_id} MPD manifest') - else: - self.report_warning(f'Encountered unknown stream_type: {stream_type!r}', display_id, only_once=True) - continue - for f in adaptive_formats: - if f.get('acodec') != 'none': - f['language'] = stream_response.get('audio_locale') - f['quality'] = hardsub_preference(hardsub_lang.lower()) - formats.extend(adaptive_formats) + # There might be multiple audio languages for one object (`_metadata.versions`), + # so we need to get the id from `streams_link` instead or we dont know which language to choose + streams_link = response.get('streams_link') + if not streams_link and traverse_obj(response, (f'{object_type}_metadata', 'is_premium_only')): + message = f'This {object_type} is for premium members only' + if self.is_logged_in: + raise ExtractorError(message, expected=True) + self.raise_login_required(message) + + # We need go from unsigned to signed api to avoid getting soft banned + stream_response = self._call_cms_api_signed(remove_start( + streams_link, '/content/v2/cms/'), internal_id, lang, 'stream info') + result['formats'] = self._extract_formats(stream_response, internal_id) + result['subtitles'] = 
self._extract_subtitles(stream_response) - chapters = None # if no intro chapter is available, a 403 without usable data is returned - intro_chapter = self._download_json(f'https://static.crunchyroll.com/datalab-intro-v2/{internal_id}.json', - display_id, fatal=False, errnote=False) + intro_chapter = self._download_json( + f'https://static.crunchyroll.com/datalab-intro-v2/{internal_id}.json', + internal_id, note='Downloading chapter info', fatal=False, errnote=False) if isinstance(intro_chapter, dict): - chapters = [{ + result['chapters'] = [{ 'title': 'Intro', 'start_time': float_or_none(intro_chapter.get('startTime')), - 'end_time': float_or_none(intro_chapter.get('endTime')) + 'end_time': float_or_none(intro_chapter.get('endTime')), }] + def calculate_count(item): + return parse_count(''.join((item['displayed'], item.get('unit') or ''))) + + result.update(traverse_obj(response, ('rating', { + 'like_count': ('up', {calculate_count}), + 'dislike_count': ('down', {calculate_count}), + }))) + + return result + + @staticmethod + def _transform_episode_response(data): + metadata = traverse_obj(data, (('episode_metadata', None), {dict}), get_all=False) or {} return { - 'id': internal_id, - 'title': '%s Episode %s – %s' % ( - episode_response.get('season_title'), episode_response.get('episode'), episode_response.get('title')), - 'description': try_get(episode_response, lambda x: x['description'].replace(r'\r\n', '\n')), - 'duration': float_or_none(episode_response.get('duration_ms'), 1000), - 'timestamp': parse_iso8601(episode_response.get('upload_date')), - 'series': episode_response.get('series_title'), - 'series_id': episode_response.get('series_id'), - 'season': episode_response.get('season_title'), - 'season_id': episode_response.get('season_id'), - 'season_number': episode_response.get('season_number'), - 'episode': episode_response.get('title'), - 'episode_number': episode_response.get('sequence_number'), - 'formats': formats, - 'thumbnails': [{ - 'url': 
thumb.get('source'), - 'width': thumb.get('width'), - 'height': thumb.get('height'), - } for thumb in traverse_obj(episode_response, ('images', 'thumbnail', ..., ...)) or []], - 'subtitles': { - lang: [{ - 'url': subtitle_data.get('url'), - 'ext': subtitle_data.get('format') - }] for lang, subtitle_data in get_streams('subtitles') - }, - 'chapters': chapters + 'id': data['id'], + 'title': ' \u2013 '.join(( + ('%s%s' % ( + format_field(metadata, 'season_title'), + format_field(metadata, 'episode', ' Episode %s'))), + format_field(data, 'title'))), + **traverse_obj(data, { + 'episode': ('title', {str}), + 'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n')}), + 'thumbnails': ('images', 'thumbnail', ..., ..., { + 'url': ('source', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + }), + **traverse_obj(metadata, { + 'duration': ('duration_ms', {lambda x: float_or_none(x, 1000)}), + 'timestamp': ('upload_date', {parse_iso8601}), + 'series': ('series_title', {str}), + 'series_id': ('series_id', {str}), + 'season': ('season_title', {str}), + 'season_id': ('season_id', {str}), + 'season_number': ('season_number', ({int}, {float_or_none})), + 'episode_number': ('sequence_number', ({int}, {float_or_none})), + 'age_limit': ('maturity_ratings', -1, {parse_age_limit}), + 'language': ('audio_locale', {str}), + }, get_all=False), } + @staticmethod + def _transform_movie_response(data): + metadata = traverse_obj(data, (('movie_metadata', 'movie_listing_metadata', None), {dict}), get_all=False) or {} + return { + 'id': data['id'], + **traverse_obj(data, { + 'title': ('title', {str}), + 'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n')}), + 'thumbnails': ('images', 'thumbnail', ..., ..., { + 'url': ('source', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + }), + **traverse_obj(metadata, { + 'duration': ('duration_ms', {lambda x: 
float_or_none(x, 1000)}), + 'age_limit': ('maturity_ratings', -1, {parse_age_limit}), + }), + } -class CrunchyrollBetaShowIE(CrunchyrollBaseIE): + +class CrunchyrollBetaShowIE(CrunchyrollCmsBaseIE): IE_NAME = 'crunchyroll:playlist' _VALID_URL = r'''(?x) - https?://(?:beta|www)\.crunchyroll\.com/ + https?://(?:beta\.|www\.)?crunchyroll\.com/ (?P(?:\w{2}(?:-\w{2})?/)?) - series/(?P\w+) - (?:/(?P[\w-]+))?/?(?:[?#]|$)''' + series/(?P\w+)''' _TESTS = [{ 'url': 'https://www.crunchyroll.com/series/GY19NQ2QR/Girl-Friend-BETA', 'info_dict': { 'id': 'GY19NQ2QR', 'title': 'Girl Friend BETA', + 'description': 'md5:99c1b22ee30a74b536a8277ced8eb750', + # XXX: `thumbnail` does not get set from `thumbnails` in playlist + # 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'age_limit': 14, }, 'playlist_mincount': 10, }, { @@ -279,41 +459,163 @@ class CrunchyrollBetaShowIE(CrunchyrollBaseIE): }] def _real_extract(self, url): - lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id') - api_domain, bucket, params = self._get_params(lang) + lang, internal_id = self._match_valid_url(url).group('lang', 'id') + + def entries(): + seasons_response = self._call_cms_api_signed(f'seasons?series_id={internal_id}', internal_id, lang, 'seasons') + for season in traverse_obj(seasons_response, ('items', ..., {dict})): + episodes_response = self._call_cms_api_signed( + f'episodes?season_id={season["id"]}', season["id"], lang, 'episode list') + for episode_response in traverse_obj(episodes_response, ('items', ..., {dict})): + yield self.url_result( + f'{self._BASE_URL}/{lang}watch/{episode_response["id"]}', + CrunchyrollBetaIE, **CrunchyrollBetaIE._transform_episode_response(episode_response)) + + return self.playlist_result( + entries(), internal_id, + **traverse_obj(self._call_api(f'series/{internal_id}', internal_id, lang, 'series'), ('data', 0, { + 'title': ('title', {str}), + 'description': ('description', {lambda x: x.replace(r'\r\n', 
'\n')}), + 'age_limit': ('maturity_ratings', -1, {parse_age_limit}), + 'thumbnails': ('images', ..., ..., ..., { + 'url': ('source', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }) + }))) + + +class CrunchyrollMusicIE(CrunchyrollBaseIE): + IE_NAME = 'crunchyroll:music' + _VALID_URL = r'''(?x) + https?://(?:www\.)?crunchyroll\.com/ + (?P(?:\w{2}(?:-\w{2})?/)?) + watch/(?Pconcert|musicvideo)/(?P\w{10})''' + _TESTS = [{ + 'url': 'https://www.crunchyroll.com/watch/musicvideo/MV88BB7F2C', + 'info_dict': { + 'ext': 'mp4', + 'id': 'MV88BB7F2C', + 'display_id': 'crossing-field', + 'title': 'Crossing Field', + 'track': 'Crossing Field', + 'artist': 'LiSA', + 'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'genre': ['Anime'], + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.crunchyroll.com/watch/concert/MC2E2AC135', + 'info_dict': { + 'ext': 'mp4', + 'id': 'MC2E2AC135', + 'display_id': 'live-is-smile-always-364joker-at-yokohama-arena', + 'title': 'LiVE is Smile Always-364+JOKER- at YOKOHAMA ARENA', + 'track': 'LiVE is Smile Always-364+JOKER- at YOKOHAMA ARENA', + 'artist': 'LiSA', + 'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'description': 'md5:747444e7e6300907b7a43f0a0503072e', + 'genre': ['J-Pop'], + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.crunchyroll.com/watch/musicvideo/MV88BB7F2C/crossing-field', + 'only_matching': True, + }, { + 'url': 'https://www.crunchyroll.com/watch/concert/MC2E2AC135/live-is-smile-always-364joker-at-yokohama-arena', + 'only_matching': True, + }] + _API_ENDPOINT = 'music' + + def _real_extract(self, url): + lang, internal_id, object_type = self._match_valid_url(url).group('lang', 'id', 'type') + path, name = { + 'concert': ('concerts', 'concert info'), + 'musicvideo': ('music_videos', 'music video info'), + }[object_type] + response = traverse_obj(self._call_api(f'{path}/{internal_id}', 
internal_id, lang, name), ('data', 0, {dict})) + if not response: + raise ExtractorError(f'No video with id {internal_id} could be found (possibly region locked?)', expected=True) - series_response = self._download_json( - f'{api_domain}/cms/v2{bucket}/series/{internal_id}', display_id, - note='Retrieving series metadata', query=params) + streams_link = response.get('streams_link') + if not streams_link and response.get('isPremiumOnly'): + message = f'This {response.get("type") or "media"} is for premium members only' + if self.is_logged_in: + raise ExtractorError(message, expected=True) + self.raise_login_required(message) + + result = self._transform_music_response(response) + stream_response = self._call_api(streams_link, internal_id, lang, 'stream info') + result['formats'] = self._extract_formats(stream_response, internal_id) + + return result + + @staticmethod + def _transform_music_response(data): + return { + 'id': data['id'], + **traverse_obj(data, { + 'display_id': 'slug', + 'title': 'title', + 'track': 'title', + 'artist': ('artist', 'name'), + 'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n') or None}), + 'thumbnails': ('images', ..., ..., { + 'url': ('source', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + 'genre': ('genres', ..., 'displayValue'), + 'age_limit': ('maturity_ratings', -1, {parse_age_limit}), + }), + } - seasons_response = self._download_json( - f'{api_domain}/cms/v2{bucket}/seasons?series_id={internal_id}', display_id, - note='Retrieving season list', query=params) + +class CrunchyrollArtistIE(CrunchyrollBaseIE): + IE_NAME = 'crunchyroll:artist' + _VALID_URL = r'''(?x) + https?://(?:www\.)?crunchyroll\.com/ + (?P(?:\w{2}(?:-\w{2})?/)?) 
+ artist/(?P\w{10})''' + _TESTS = [{ + 'url': 'https://www.crunchyroll.com/artist/MA179CB50D', + 'info_dict': { + 'id': 'MA179CB50D', + 'title': 'LiSA', + 'genre': ['J-Pop', 'Anime', 'Rock'], + 'description': 'md5:16d87de61a55c3f7d6c454b73285938e', + }, + 'playlist_mincount': 83, + }, { + 'url': 'https://www.crunchyroll.com/artist/MA179CB50D/lisa', + 'only_matching': True, + }] + _API_ENDPOINT = 'music' + + def _real_extract(self, url): + lang, internal_id = self._match_valid_url(url).group('lang', 'id') + response = traverse_obj(self._call_api( + f'artists/{internal_id}', internal_id, lang, 'artist info'), ('data', 0)) def entries(): - for season in seasons_response['items']: - episodes_response = self._download_json( - f'{api_domain}/cms/v2{bucket}/episodes?season_id={season["id"]}', display_id, - note=f'Retrieving episode list for {season.get("slug_title")}', query=params) - for episode in episodes_response['items']: - episode_id = episode['id'] - episode_display_id = episode['slug_title'] - yield { - '_type': 'url', - 'url': f'https://www.crunchyroll.com/{lang}watch/{episode_id}/{episode_display_id}', - 'ie_key': CrunchyrollBetaIE.ie_key(), - 'id': episode_id, - 'title': '%s Episode %s – %s' % (episode.get('season_title'), episode.get('episode'), episode.get('title')), - 'description': try_get(episode, lambda x: x['description'].replace(r'\r\n', '\n')), - 'duration': float_or_none(episode.get('duration_ms'), 1000), - 'series': episode.get('series_title'), - 'series_id': episode.get('series_id'), - 'season': episode.get('season_title'), - 'season_id': episode.get('season_id'), - 'season_number': episode.get('season_number'), - 'episode': episode.get('title'), - 'episode_number': episode.get('sequence_number'), - 'language': episode.get('audio_locale'), - } - - return self.playlist_result(entries(), internal_id, series_response.get('title')) + for attribute, path in [('concerts', 'concert'), ('videos', 'musicvideo')]: + for internal_id in traverse_obj(response, 
(attribute, ...)): + yield self.url_result(f'{self._BASE_URL}/watch/{path}/{internal_id}', CrunchyrollMusicIE, internal_id) + + return self.playlist_result(entries(), **self._transform_artist_response(response)) + + @staticmethod + def _transform_artist_response(data): + return { + 'id': data['id'], + **traverse_obj(data, { + 'title': 'name', + 'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n')}), + 'thumbnails': ('images', ..., ..., { + 'url': ('source', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + 'genre': ('genres', ..., 'displayValue'), + }), + } diff --git a/plugin/yt-dlp/yt_dlp/extractor/dacast.py b/plugin/yt-dlp/yt_dlp/extractor/dacast.py new file mode 100644 index 0000000..cf683ba --- /dev/null +++ b/plugin/yt-dlp/yt_dlp/extractor/dacast.py @@ -0,0 +1,158 @@ +import hashlib +import re +import time +import urllib.error + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + classproperty, + float_or_none, + traverse_obj, + url_or_none, +) + + +class DacastBaseIE(InfoExtractor): + _URL_TYPE = None + + @classproperty + def _VALID_URL(cls): + return fr'https?://iframe\.dacast\.com/{cls._URL_TYPE}/(?P[\w-]+)/(?P[\w-]+)' + + @classproperty + def _EMBED_REGEX(cls): + return [rf']+\bsrc=["\'](?P{cls._VALID_URL})'] + + _API_INFO_URL = 'https://playback.dacast.com/content/info' + + @classmethod + def _get_url_from_id(cls, content_id): + user_id, media_id = content_id.split(f'-{cls._URL_TYPE}-') + return f'https://iframe.dacast.com/{cls._URL_TYPE}/{user_id}/{media_id}' + + @classmethod + def _extract_embed_urls(cls, url, webpage): + yield from super()._extract_embed_urls(url, webpage) + for content_id in re.findall( + rf']+\bsrc=["\']https://player\.dacast\.com/js/player\.js\?contentId=([\w-]+-{cls._URL_TYPE}-[\w-]+)["\']', webpage): + yield cls._get_url_from_id(content_id) + + +class DacastVODIE(DacastBaseIE): + _URL_TYPE = 'vod' + _TESTS = [{ + 'url': 
'https://iframe.dacast.com/vod/acae82153ef4d7a7344ae4eaa86af534/1c6143e3-5a06-371d-8695-19b96ea49090', + 'info_dict': { + 'id': '1c6143e3-5a06-371d-8695-19b96ea49090', + 'ext': 'mp4', + 'uploader_id': 'acae82153ef4d7a7344ae4eaa86af534', + 'title': '2_4||Adnexal mass characterisation: O-RADS US and MRI||N. Bharwani, London/UK', + 'thumbnail': 'https://universe-files.dacast.com/26137208-5858-65c1-5e9a-9d6b6bd2b6c2', + }, + 'params': {'skip_download': 'm3u8'}, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://www.dacast.com/support/knowledgebase/how-can-i-embed-a-video-on-my-website/', + 'info_dict': { + 'id': 'b6674869-f08a-23c5-1d7b-81f5309e1a90', + 'ext': 'mp4', + 'title': '4-HowToEmbedVideo.mp4', + 'uploader_id': '3b67c4a9-3886-4eb1-d0eb-39b23b14bef3', + 'thumbnail': 'https://universe-files.dacast.com/d26ab48f-a52a-8783-c42e-a90290ba06b6.png', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://gist.githubusercontent.com/bashonly/4ad249ef2910346fbdf3809b220f11ee/raw/87349778d4af1a80b1fcc3beb9c88108de5858f5/dacast_embeds.html', + 'info_dict': { + 'id': 'e7df418e-a83b-7a7f-7b5e-1a667981e8fa', + 'ext': 'mp4', + 'title': 'Evening Service 2-5-23', + 'uploader_id': '943bb1ab3c03695ba85330d92d6d226e', + 'thumbnail': 'https://universe-files.dacast.com/337472b3-e92c-2ea4-7eb7-5700da477f67', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + user_id, video_id = self._match_valid_url(url).group('user_id', 'id') + query = {'contentId': f'{user_id}-vod-{video_id}', 'provider': 'universe'} + info = self._download_json(self._API_INFO_URL, video_id, query=query, fatal=False) + access = self._download_json( + 'https://playback.dacast.com/content/access', video_id, + note='Downloading access JSON', query=query, expected_status=403) + + error = access.get('error') + if error in ('Broadcaster has been blocked', 'Content is offline'): + raise ExtractorError(error, expected=True) + elif error: + raise ExtractorError(f'Dacast API says 
"{error}"') + + hls_url = access['hls'] + hls_aes = {} + + if 'DRM_EXT' in hls_url: + self.report_drm(video_id) + elif '/uspaes/' in hls_url: + # From https://player.dacast.com/js/player.js + ts = int(time.time()) + signature = hashlib.sha1( + f'{10413792000 - ts}{ts}YfaKtquEEpDeusCKbvYszIEZnWmBcSvw').digest().hex() + hls_aes['uri'] = f'https://keys.dacast.com/uspaes/{video_id}.key?s={signature}&ts={ts}' + + for retry in self.RetryManager(): + try: + formats = self._extract_m3u8_formats(hls_url, video_id, 'mp4', m3u8_id='hls') + except ExtractorError as e: + # CDN will randomly respond with 403 + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 403: + retry.error = e + continue + raise + + return { + 'id': video_id, + 'uploader_id': user_id, + 'formats': formats, + 'hls_aes': hls_aes or None, + **traverse_obj(info, ('contentInfo', { + 'title': 'title', + 'duration': ('duration', {float_or_none}), + 'thumbnail': ('thumbnailUrl', {url_or_none}), + })), + } + + +class DacastPlaylistIE(DacastBaseIE): + _URL_TYPE = 'playlist' + _TESTS = [{ + 'url': 'https://iframe.dacast.com/playlist/943bb1ab3c03695ba85330d92d6d226e/b632eb053cac17a9c9a02bcfc827f2d8', + 'playlist_mincount': 28, + 'info_dict': { + 'id': 'b632eb053cac17a9c9a02bcfc827f2d8', + 'title': 'Archive Sermons', + }, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://gist.githubusercontent.com/bashonly/7efb606f49f3c6e07ea0327de5a661d1/raw/05a16eac830245ea301fb0a585023bec71e6093c/dacast_playlist_embed.html', + 'playlist_mincount': 28, + 'info_dict': { + 'id': 'b632eb053cac17a9c9a02bcfc827f2d8', + 'title': 'Archive Sermons', + }, + }] + + def _real_extract(self, url): + user_id, playlist_id = self._match_valid_url(url).group('user_id', 'id') + info = self._download_json( + self._API_INFO_URL, playlist_id, note='Downloading playlist JSON', query={ + 'contentId': f'{user_id}-playlist-{playlist_id}', + 'provider': 'universe', + })['contentInfo'] + + def entries(info): + for video in traverse_obj(info, 
('features', 'playlist', 'contents', lambda _, v: v['id'])): + yield self.url_result( + DacastVODIE._get_url_from_id(video['id']), DacastVODIE, video['id'], video.get('title')) + + return self.playlist_result(entries(info), playlist_id, info.get('title')) diff --git a/plugin/yt-dlp/yt_dlp/extractor/daftsex.py b/plugin/yt-dlp/yt_dlp/extractor/daftsex.py index 9a9e426..25ba82a 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/daftsex.py +++ b/plugin/yt-dlp/yt_dlp/extractor/daftsex.py @@ -1,6 +1,7 @@ from .common import InfoExtractor from ..compat import compat_b64decode from ..utils import ( + ExtractorError, int_or_none, js_to_json, parse_count, @@ -12,21 +13,24 @@ class DaftsexIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?daftsex\.com/watch/(?P-?\d+_\d+)' + _VALID_URL = r'https?://(?:www\.)?daft\.sex/watch/(?P-?\d+_\d+)' _TESTS = [{ - 'url': 'https://daftsex.com/watch/-35370899_456246186', - 'md5': 'd95135e6cea2d905bea20dbe82cda64a', + 'url': 'https://daft.sex/watch/-35370899_456246186', + 'md5': '64c04ef7b4c7b04b308f3b0c78efe7cd', 'info_dict': { 'id': '-35370899_456246186', 'ext': 'mp4', 'title': 'just relaxing', - 'description': 'just relaxing - Watch video Watch video in high quality', + 'description': 'just relaxing – Watch video Watch video in high quality', 'upload_date': '20201113', 'timestamp': 1605261911, - 'thumbnail': r're:https://[^/]+/impf/-43BuMDIawmBGr3GLcZ93CYwWf2PBv_tVWoS1A/dnu41DnARU4\.jpg\?size=800x450&quality=96&keep_aspect_ratio=1&background=000000&sign=6af2c26ff4a45e55334189301c867384&type=video_thumb', + 'thumbnail': r're:^https?://.*\.jpg$', + 'age_limit': 18, + 'duration': 15.0, + 'view_count': int }, }, { - 'url': 'https://daftsex.com/watch/-156601359_456242791', + 'url': 'https://daft.sex/watch/-156601359_456242791', 'info_dict': { 'id': '-156601359_456242791', 'ext': 'mp4', @@ -36,6 +40,7 @@ class DaftsexIE(InfoExtractor): 'timestamp': 1600250735, 'thumbnail': 
'https://psv153-1.crazycloud.ru/videos/-156601359/456242791/thumb.jpg?extra=i3D32KaBbBFf9TqDRMAVmQ', }, + 'skip': 'deleted / private' }] def _real_extract(self, url): @@ -60,7 +65,7 @@ def _real_extract(self, url): webpage, 'player color', fatal=False) or '' embed_page = self._download_webpage( - 'https://daxab.com/player/%s?color=%s' % (player_hash, player_color), + 'https://dxb.to/player/%s?color=%s' % (player_hash, player_color), video_id, headers={'Referer': url}) video_params = self._parse_json( self._search_regex( @@ -94,15 +99,19 @@ def _real_extract(self, url): 'age_limit': 18, } - item = self._download_json( + items = self._download_json( f'{server_domain}/method/video.get/{video_id}', video_id, headers={'Referer': url}, query={ 'token': video_params['video']['access_token'], 'videos': video_id, 'ckey': video_params['c_key'], 'credentials': video_params['video']['credentials'], - })['response']['items'][0] + })['response']['items'] + + if not items: + raise ExtractorError('Video is not available', video_id=video_id, expected=True) + item = items[0] formats = [] for f_id, f_url in item.get('files', {}).items(): if f_id == 'external': diff --git a/plugin/yt-dlp/yt_dlp/extractor/digitalconcerthall.py b/plugin/yt-dlp/yt_dlp/extractor/digitalconcerthall.py index ff43dd8..1058c99 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/digitalconcerthall.py +++ b/plugin/yt-dlp/yt_dlp/extractor/digitalconcerthall.py @@ -11,7 +11,7 @@ class DigitalConcertHallIE(InfoExtractor): IE_DESC = 'DigitalConcertHall extractor' - _VALID_URL = r'https?://(?:www\.)?digitalconcerthall\.com/(?P[a-z]+)/concert/(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?digitalconcerthall\.com/(?P[a-z]+)/(?Pfilm|concert)/(?P[0-9]+)' _OAUTH_URL = 'https://api.digitalconcerthall.com/v2/oauth2/token' _ACCESS_TOKEN = None _NETRC_MACHINE = 'digitalconcerthall' @@ -40,6 +40,19 @@ class DigitalConcertHallIE(InfoExtractor): }, 'params': {'skip_download': 'm3u8'}, 'playlist_count': 3, + }, { + 'url': 
'https://www.digitalconcerthall.com/en/film/388', + 'info_dict': { + 'id': '388', + 'ext': 'mp4', + 'title': 'The Berliner Philharmoniker and Frank Peter Zimmermann', + 'description': 'md5:cfe25a7044fa4be13743e5089b5b5eb2', + 'thumbnail': r're:^https?://images.digitalconcerthall.com/cms/thumbnails.*\.jpg$', + 'upload_date': '20220714', + 'timestamp': 1657785600, + 'album_artist': 'Frank Peter Zimmermann / Benedikt von Bernstorff / Jakob von Bernstorff', + }, + 'params': {'skip_download': 'm3u8'}, }] def _perform_login(self, username, password): @@ -75,7 +88,7 @@ def _real_initialize(self): if not self._ACCESS_TOKEN: self.raise_login_required(method='password') - def _entries(self, items, language, **kwargs): + def _entries(self, items, language, type_, **kwargs): for item in items: video_id = item['id'] stream_info = self._download_json( @@ -103,11 +116,11 @@ def _entries(self, items, language, **kwargs): 'start_time': chapter.get('time'), 'end_time': try_get(chapter, lambda x: x['time'] + x['duration']), 'title': chapter.get('text'), - } for chapter in item['cuepoints']] if item.get('cuepoints') else None, + } for chapter in item['cuepoints']] if item.get('cuepoints') and type_ == 'concert' else None, } def _real_extract(self, url): - language, video_id = self._match_valid_url(url).group('language', 'id') + language, type_, video_id = self._match_valid_url(url).group('language', 'type', 'id') if not language: language = 'en' @@ -120,18 +133,18 @@ def _real_extract(self, url): }] vid_info = self._download_json( - f'https://api.digitalconcerthall.com/v2/concert/{video_id}', video_id, headers={ + f'https://api.digitalconcerthall.com/v2/{type_}/{video_id}', video_id, headers={ 'Accept': 'application/json', 'Accept-Language': language }) album_artist = ' / '.join(traverse_obj(vid_info, ('_links', 'artist', ..., 'name')) or '') + videos = [vid_info] if type_ == 'film' else traverse_obj(vid_info, ('_embedded', ..., ...)) return { '_type': 'playlist', 'id': video_id, 
'title': vid_info.get('title'), - 'entries': self._entries(traverse_obj(vid_info, ('_embedded', ..., ...)), language, - thumbnails=thumbnails, album_artist=album_artist), + 'entries': self._entries(videos, language, thumbnails=thumbnails, album_artist=album_artist, type_=type_), 'thumbnails': thumbnails, 'album_artist': album_artist, } diff --git a/plugin/yt-dlp/yt_dlp/extractor/discogs.py b/plugin/yt-dlp/yt_dlp/extractor/discogs.py new file mode 100644 index 0000000..048c622 --- /dev/null +++ b/plugin/yt-dlp/yt_dlp/extractor/discogs.py @@ -0,0 +1,35 @@ +from .common import InfoExtractor +from .youtube import YoutubeIE +from ..utils import traverse_obj + + +class DiscogsReleasePlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?discogs\.com/(?Prelease|master)/(?P\d+)' + _TESTS = [{ + 'url': 'https://www.discogs.com/release/1-The-Persuader-Stockholm', + 'info_dict': { + 'id': 'release1', + 'title': 'Stockholm', + }, + 'playlist_mincount': 7, + }, { + 'url': 'https://www.discogs.com/master/113-Vince-Watson-Moments-In-Time', + 'info_dict': { + 'id': 'master113', + 'title': 'Moments In Time', + }, + 'playlist_mincount': 53, + }] + + def _real_extract(self, url): + playlist_id, playlist_type = self._match_valid_url(url).group('id', 'type') + + display_id = f'{playlist_type}{playlist_id}' + response = self._download_json( + f'https://api.discogs.com/{playlist_type}s/{playlist_id}', display_id) + + entries = [ + self.url_result(video['uri'], YoutubeIE, video_title=video.get('title')) + for video in traverse_obj(response, ('videos', lambda _, v: YoutubeIE.suitable(v['uri'])))] + + return self.playlist_result(entries, display_id, response.get('title')) diff --git a/plugin/yt-dlp/yt_dlp/extractor/dropout.py b/plugin/yt-dlp/yt_dlp/extractor/dropout.py index bca0bb5..5a3963a 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/dropout.py +++ b/plugin/yt-dlp/yt_dlp/extractor/dropout.py @@ -1,13 +1,17 @@ +import functools + from .common import InfoExtractor from .vimeo import 
VHXEmbedIE from ..utils import ( ExtractorError, + OnDemandPagedList, clean_html, + extract_attributes, get_element_by_class, get_element_by_id, - get_elements_by_class, + get_elements_html_by_class, int_or_none, - join_nonempty, + traverse_obj, unified_strdate, urlencode_postdata, ) @@ -162,12 +166,13 @@ def _real_extract(self, url): class DropoutSeasonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dropout\.tv/(?P[^\/$&?#]+)(?:/?$|/season:[0-9]+/?$)' + _PAGE_SIZE = 24 + _VALID_URL = r'https?://(?:www\.)?dropout\.tv/(?P[^\/$&?#]+)(?:/?$|/season:(?P[0-9]+)/?$)' _TESTS = [ { 'url': 'https://www.dropout.tv/dimension-20-fantasy-high/season:1', 'note': 'Multi-season series with the season in the url', - 'playlist_count': 17, + 'playlist_count': 24, 'info_dict': { 'id': 'dimension-20-fantasy-high-season-1', 'title': 'Dimension 20 Fantasy High - Season 1' @@ -176,7 +181,7 @@ class DropoutSeasonIE(InfoExtractor): { 'url': 'https://www.dropout.tv/dimension-20-fantasy-high', 'note': 'Multi-season series with the season not in the url', - 'playlist_count': 17, + 'playlist_count': 24, 'info_dict': { 'id': 'dimension-20-fantasy-high-season-1', 'title': 'Dimension 20 Fantasy High - Season 1' @@ -190,29 +195,30 @@ class DropoutSeasonIE(InfoExtractor): 'id': 'dimension-20-shriek-week-season-1', 'title': 'Dimension 20 Shriek Week - Season 1' } + }, + { + 'url': 'https://www.dropout.tv/breaking-news-no-laugh-newsroom/season:3', + 'note': 'Multi-season series with season in the url that requires pagination', + 'playlist_count': 25, + 'info_dict': { + 'id': 'breaking-news-no-laugh-newsroom-season-3', + 'title': 'Breaking News No Laugh Newsroom - Season 3' + } } ] + def _fetch_page(self, url, season_id, page): + page += 1 + webpage = self._download_webpage( + f'{url}?page={page}', season_id, note=f'Downloading page {page}', expected_status={400}) + yield from [self.url_result(item_url, DropoutIE) for item_url in traverse_obj( + get_elements_html_by_class('browse-item-link', 
webpage), (..., {extract_attributes}, 'href'))] + def _real_extract(self, url): season_id = self._match_id(url) + season_num = self._match_valid_url(url).group('season') or 1 season_title = season_id.replace('-', ' ').title() - webpage = self._download_webpage(url, season_id) - - entries = [ - self.url_result( - url=self._search_regex(r']+selected>([^<]+)', - seasons, 'current_season', default='').strip() - return { - '_type': 'playlist', - 'id': join_nonempty(season_id, current_season.lower().replace(' ', '-')), - 'title': join_nonempty(season_title, current_season, delim=' - '), - 'entries': entries - } + return self.playlist_result( + OnDemandPagedList(functools.partial(self._fetch_page, url, season_id), self._PAGE_SIZE), + f'{season_id}-season-{season_num}', f'{season_title} - Season {season_num}') diff --git a/plugin/yt-dlp/yt_dlp/extractor/dumpert.py b/plugin/yt-dlp/yt_dlp/extractor/dumpert.py index e8493f7..4876235 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/dumpert.py +++ b/plugin/yt-dlp/yt_dlp/extractor/dumpert.py @@ -1,12 +1,17 @@ from .common import InfoExtractor from ..utils import ( + determine_ext, int_or_none, qualities, ) class DumpertIE(InfoExtractor): - _VALID_URL = r'(?Phttps?)://(?:(?:www|legacy)\.)?dumpert\.nl/(?:mediabase|embed|item)/(?P[0-9]+[/_][0-9a-zA-Z]+)' + _VALID_URL = r'''(?x) + (?Phttps?)://(?:(?:www|legacy)\.)?dumpert\.nl(?: + /(?:mediabase|embed|item)/| + (?:/toppers|/latest|/?)\?selectedId= + )(?P[0-9]+[/_][0-9a-zA-Z]+)''' _TESTS = [{ 'url': 'https://www.dumpert.nl/item/6646981_951bc60f', 'md5': '1b9318d7d5054e7dcb9dc7654f21d643', @@ -16,6 +21,9 @@ class DumpertIE(InfoExtractor): 'title': 'Ik heb nieuws voor je', 'description': 'Niet schrikken hoor', 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 9, + 'view_count': int, + 'like_count': int, } }, { 'url': 'https://www.dumpert.nl/embed/6675421_dc440fe7', @@ -26,6 +34,28 @@ class DumpertIE(InfoExtractor): }, { 'url': 'http://legacy.dumpert.nl/embed/6675421/dc440fe7', 
'only_matching': True, + }, { + 'url': 'https://www.dumpert.nl/item/100031688_b317a185', + 'info_dict': { + 'id': '100031688/b317a185', + 'ext': 'mp4', + 'title': 'Epic schijnbeweging', + 'description': '

Die zag je niet eh

', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', + 'duration': 12, + 'view_count': int, + 'like_count': int, + }, + 'params': {'skip_download': 'm3u8'} + }, { + 'url': 'https://www.dumpert.nl/toppers?selectedId=100031688_b317a185', + 'only_matching': True, + }, { + 'url': 'https://www.dumpert.nl/latest?selectedId=100031688_b317a185', + 'only_matching': True, + }, { + 'url': 'https://www.dumpert.nl/?selectedId=100031688_b317a185', + 'only_matching': True, }] def _real_extract(self, url): @@ -36,18 +66,23 @@ def _real_extract(self, url): title = item['title'] media = next(m for m in item['media'] if m.get('mediatype') == 'VIDEO') - quality = qualities(['flv', 'mobile', 'tablet', '720p']) + quality = qualities(['flv', 'mobile', 'tablet', '720p', '1080p']) formats = [] for variant in media.get('variants', []): uri = variant.get('uri') if not uri: continue version = variant.get('version') - formats.append({ - 'url': uri, - 'format_id': version, - 'quality': quality(version), - }) + preference = quality(version) + if determine_ext(uri) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + uri, video_id, 'mp4', m3u8_id=version, quality=preference)) + else: + formats.append({ + 'url': uri, + 'format_id': version, + 'quality': preference, + }) thumbnails = [] stills = item.get('stills') or {} diff --git a/plugin/yt-dlp/yt_dlp/extractor/elevensports.py b/plugin/yt-dlp/yt_dlp/extractor/elevensports.py new file mode 100644 index 0000000..99c52b3 --- /dev/null +++ b/plugin/yt-dlp/yt_dlp/extractor/elevensports.py @@ -0,0 +1,59 @@ +from .common import InfoExtractor +from ..utils import ( + parse_iso8601, + traverse_obj, + url_or_none, +) + + +class ElevenSportsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?elevensports\.com/view/event/(?P\w+)' + _TESTS = [{ + 'url': 'https://elevensports.com/view/event/clf46yr3kenn80jgrqsjmwefk', + 'md5': 'c0958d9ff90e4503a75544358758921d', + 'info_dict': { + 'id': 'clf46yr3kenn80jgrqsjmwefk', + 'title': 'Cleveland SC vs 
Lionsbridge FC', + 'ext': 'mp4', + 'description': 'md5:03b5238d6549f4ea1fddadf69b5e0b58', + 'upload_date': '20230323', + 'timestamp': 1679612400, + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'params': {'skip_download': 'm3u8'} + }, { + 'url': 'https://elevensports.com/view/event/clhpyd53b06160jez74qhgkmf', + 'md5': 'c0958d9ff90e4503a75544358758921d', + 'info_dict': { + 'id': 'clhpyd53b06160jez74qhgkmf', + 'title': 'AJNLF vs ARRAF', + 'ext': 'mp4', + 'description': 'md5:c8c5e75c78f37c6d15cd6c475e43a8c1', + 'upload_date': '20230521', + 'timestamp': 1684684800, + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'params': {'skip_download': 'm3u8'} + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + event_id = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['event']['mclsEventId'] + event_data = self._download_json( + f'https://mcls-api.mycujoo.tv/bff/events/v1beta1/{event_id}', video_id, + headers={'Authorization': 'Bearer FBVKACGN37JQC5SFA0OVK8KKSIOP153G'}) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + event_data['streams'][0]['full_url'], video_id, 'mp4', m3u8_id='hls') + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(event_data, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'timestamp': ('start_time', {parse_iso8601}), + 'thumbnail': ('thumbnail_url', {url_or_none}), + }), + } diff --git a/plugin/yt-dlp/yt_dlp/extractor/ettutv.py b/plugin/yt-dlp/yt_dlp/extractor/ettutv.py new file mode 100644 index 0000000..46d7255 --- /dev/null +++ b/plugin/yt-dlp/yt_dlp/extractor/ettutv.py @@ -0,0 +1,60 @@ +from .common import InfoExtractor +from ..utils import bool_or_none, traverse_obj, unified_timestamp, url_or_none + + +class EttuTvIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ettu\.tv/[^?#]+/playerpage/(?P[0-9]+)' + + _TESTS = [{ + 'url': 
'https://www.ettu.tv/en-int/playerpage/1573849', + 'md5': '5874b7639a2aa866d1f6c3a4037c7c09', + 'info_dict': { + 'id': '1573849', + 'title': 'Ni Xia Lian - Shao Jieni', + 'description': 'ITTF Europe Top 16 Cup', + 'timestamp': 1677348600, + 'upload_date': '20230225', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'ext': 'mp4', + }, + }, { + 'url': 'https://www.ettu.tv/en-int/playerpage/1573753', + 'md5': '1fc094bf96cf2d5ec0f434d3a6dec9aa', + 'info_dict': { + 'id': '1573753', + 'title': 'Qiu Dang - Jorgic Darko', + 'description': 'ITTF Europe Top 16 Cup', + 'timestamp': 1677423600, + 'upload_date': '20230226', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'ext': 'mp4', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + player_settings = self._download_json( + f'https://www.ettu.tv/api/v3/contents/{video_id}/player-settings', video_id, query={ + 'language': 'en', + 'showTitle': 'true', + 'device': 'desktop', + }) + + stream_response = self._download_json(player_settings['streamAccess'], video_id, data={}) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + stream_response['data']['stream'], video_id, 'mp4') + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(player_settings, { + 'title': 'title', + 'description': ('metaInformation', 'competition'), + 'thumbnail': ('image', {url_or_none}), + 'timestamp': ('date', {unified_timestamp}), + 'is_live': ('isLivestream', {bool_or_none}), + }) + } diff --git a/plugin/yt-dlp/yt_dlp/extractor/europa.py b/plugin/yt-dlp/yt_dlp/extractor/europa.py index e18a5f9..6eeb40f 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/europa.py +++ b/plugin/yt-dlp/yt_dlp/extractor/europa.py @@ -6,6 +6,7 @@ parse_iso8601, parse_qs, qualities, + traverse_obj, unified_strdate, xpath_text ) @@ -92,82 +93,81 @@ def get_item(type_, preference): class EuroParlWebstreamIE(InfoExtractor): _VALID_URL = r'''(?x) - 
https?://(?:multimedia|webstreaming)\.europarl\.europa\.eu/[^/#?]+/ - (?:embed/embed\.html\?event=|(?!video)[^/#?]+/[\w-]+_)(?P[\w-]+) + https?://multimedia\.europarl\.europa\.eu/[^/#?]+/ + (?:(?!video)[^/#?]+/[\w-]+_)(?P[\w-]+) ''' _TESTS = [{ 'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY', 'info_dict': { - 'id': 'bcaa1db4-76ef-7e06-8da7-839bd0ad1dbe', + 'id': '62388b15-d85b-4add-99aa-ba12ccf64f0d', 'ext': 'mp4', - 'release_timestamp': 1663137900, 'title': 'Plenary session', + 'release_timestamp': 1663139069, 'release_date': '20220914', }, 'params': { 'skip_download': True, } }, { - 'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/eu-cop27-un-climate-change-conference-in-sharm-el-sheikh-egypt-ep-delegation-meets-with-ngo-represen_20221114-1600-SPECIAL-OTHER', + # live webstream + 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/euroscola_20221115-1000-SPECIAL-EUROSCOLA', 'info_dict': { - 'id': 'a8428de8-b9cd-6a2e-11e4-3805d9c9ff5c', 'ext': 'mp4', - 'release_timestamp': 1668434400, - 'release_date': '20221114', - 'title': 'md5:d3550280c33cc70e0678652e3d52c028', + 'id': '510eda7f-ba72-161b-7ee7-0e836cd2e715', + 'release_timestamp': 1668502800, + 'title': 'Euroscola 2022-11-15 19:21', + 'release_date': '20221115', + 'live_status': 'is_live', }, - 'params': { - 'skip_download': True, - } + 'skip': 'not live anymore' }, { - # embed webpage - 'url': 'https://webstreaming.europarl.europa.eu/ep/embed/embed.html?event=20220914-0900-PLENARY&language=en&autoplay=true&logo=true', + 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/committee-on-culture-and-education_20230301-1130-COMMITTEE-CULT', 'info_dict': { - 'id': 'bcaa1db4-76ef-7e06-8da7-839bd0ad1dbe', + 'id': '7355662c-8eac-445e-4bb9-08db14b0ddd7', 'ext': 'mp4', - 'title': 'Plenary session', - 'release_date': '20220914', - 'release_timestamp': 1663137900, - }, - 'params': { - 'skip_download': True, + 'release_date': '20230301', + 
'title': 'Committee on Culture and Education', + 'release_timestamp': 1677666641, } }, { - # live webstream - 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/euroscola_20221115-1000-SPECIAL-EUROSCOLA', + # live stream + 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/committee-on-environment-public-health-and-food-safety_20230524-0900-COMMITTEE-ENVI', 'info_dict': { + 'id': 'e4255f56-10aa-4b3c-6530-08db56d5b0d9', 'ext': 'mp4', - 'id': '510eda7f-ba72-161b-7ee7-0e836cd2e715', - 'release_timestamp': 1668502800, - 'title': 'Euroscola 2022-11-15 19:21', - 'release_date': '20221115', + 'release_date': '20230524', + 'title': r're:Committee on Environment, Public Health and Food Safety \d{4}-\d{2}-\d{2}\s\d{2}:\d{2}', + 'release_timestamp': 1684911541, 'live_status': 'is_live', }, - 'skip': 'not live anymore' + 'skip': 'Not live anymore' }] def _real_extract(self, url): display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + webpage_nextjs = self._search_nextjs_data(webpage, display_id)['props']['pageProps'] json_info = self._download_json( - 'https://vis-api.vuplay.co.uk/event/external', display_id, + 'https://acs-api.europarl.connectedviews.eu/api/FullMeeting', display_id, query={ - 'player_key': 'europarl|718f822c-a48c-4841-9947-c9cb9bb1743c', - 'external_id': display_id, + 'api-version': 1.0, + 'tenantId': 'bae646ca-1fc8-4363-80ba-2c04f06b4968', + 'externalReference': display_id }) - formats, subtitles = self._extract_mpd_formats_and_subtitles(json_info['streaming_url'], display_id) - fmts, subs = self._extract_m3u8_formats_and_subtitles( - json_info['streaming_url'].replace('.mpd', '.m3u8'), display_id) - - formats.extend(fmts) - self._merge_subtitles(subs, target=subtitles) + formats, subtitles = [], {} + for hls_url in traverse_obj(json_info, ((('meetingVideo'), ('meetingVideos', ...)), 'hlsUrl')): + fmt, subs = self._extract_m3u8_formats_and_subtitles(hls_url, display_id) + formats.extend(fmt) + 
self._merge_subtitles(subs, target=subtitles) return { 'id': json_info['id'], - 'title': json_info.get('title'), + 'title': traverse_obj(webpage_nextjs, (('mediaItem', 'title'), ('title', )), get_all=False), 'formats': formats, 'subtitles': subtitles, - 'release_timestamp': parse_iso8601(json_info.get('published_start')), - 'is_live': 'LIVE' in json_info.get('state', '') + 'release_timestamp': parse_iso8601(json_info.get('startDateTime')), + 'is_live': traverse_obj(webpage_nextjs, ('mediaItem', 'mediaSubType')) == 'Live' } diff --git a/plugin/yt-dlp/yt_dlp/extractor/eurosport.py b/plugin/yt-dlp/yt_dlp/extractor/eurosport.py index ff742b6..82faf4a 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/eurosport.py +++ b/plugin/yt-dlp/yt_dlp/extractor/eurosport.py @@ -3,7 +3,7 @@ class EurosportIE(InfoExtractor): - _VALID_URL = r'https?://www\.eurosport\.com/\w+/[\w-]+/\d+/[\w-]+_(?Pvid\d+)' + _VALID_URL = r'https?://www\.eurosport\.com/\w+/(?:[\w-]+/[\d-]+/)?[\w-]+_(?Pvid\d+)' _TESTS = [{ 'url': 'https://www.eurosport.com/tennis/roland-garros/2022/highlights-rafael-nadal-brushes-aside-caper-ruud-to-win-record-extending-14th-french-open-title_vid1694147/video.shtml', 'info_dict': { @@ -44,6 +44,32 @@ class EurosportIE(InfoExtractor): 'description': 'md5:32bbe3a773ac132c57fb1e8cca4b7c71', 'upload_date': '20220727', } + }, { + 'url': 'https://www.eurosport.com/football/champions-league/2022-2023/pep-guardiola-emotionally-destroyed-after-manchester-city-win-over-bayern-munich-in-champions-league_vid1896254/video.shtml', + 'info_dict': { + 'id': '3096477', + 'ext': 'mp4', + 'title': 'md5:82edc17370124c7a19b3cf518517583b', + 'duration': 84.0, + 'description': 'md5:b3f44ef7f5b5b95b24a273b163083feb', + 'thumbnail': 'https://imgresizer.eurosport.com/unsafe/1280x960/smart/filters:format(jpeg)/origin-imgresizer.eurosport.com/2023/04/12/3682873-74947393-2560-1440.jpg', + 'timestamp': 1681292028, + 'upload_date': '20230412', + 'display_id': 'vid1896254', + } + }, { + 'url': 
'https://www.eurosport.com/football/last-year-s-semi-final-pain-was-still-there-pep-guardiola-after-man-city-reach-cl-final_vid1914115/video.shtml', + 'info_dict': { + 'id': '3149108', + 'ext': 'mp4', + 'title': '\'Last year\'s semi-final pain was still there\' - Pep Guardiola after Man City reach CL final', + 'description': 'md5:89ef142fe0170a66abab77fac2955d8e', + 'display_id': 'vid1914115', + 'timestamp': 1684403618, + 'thumbnail': 'https://imgresizer.eurosport.com/unsafe/1280x960/smart/filters:format(jpeg)/origin-imgresizer.eurosport.com/2023/05/18/3707254-75435008-2560-1440.jpg', + 'duration': 105.0, + 'upload_date': '20230518', + } }] _TOKEN = None diff --git a/plugin/yt-dlp/yt_dlp/extractor/foxnews.py b/plugin/yt-dlp/yt_dlp/extractor/foxnews.py index 41bc50e..d55fc53 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/foxnews.py +++ b/plugin/yt-dlp/yt_dlp/extractor/foxnews.py @@ -7,8 +7,37 @@ class FoxNewsIE(AMPIE): IE_NAME = 'foxnews' IE_DESC = 'Fox News and Fox Business Video' - _VALID_URL = r'https?://(?Pvideo\.(?:insider\.)?fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P\d+)' + _VALID_URL = r'https?://video\.(?:insider\.)?fox(?:news|business)\.com/v/(?:video-embed\.html\?video_id=)?(?P\d+)' _TESTS = [ + { + 'url': 'https://video.foxnews.com/v/6320653836112', + 'info_dict': { + 'id': '6320653836112', + 'ext': 'mp4', + 'title': 'Tucker Carlson joins \'Gutfeld!\' to discuss his new documentary', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 404, + 'upload_date': '20230217', + 'description': 'md5:858a8a36f59e9ca897d758855bcdfa02', + 'timestamp': 1676611344.0, + }, + 'params': {'skip_download': 'm3u8'}, + }, + { + # From http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words + 'url': 
'http://video.insider.foxnews.com/v/video-embed.html?video_id=5099377331001&autoplay=true&share_url=http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words&share_title=Student%20Group:%20Saying%20%27Politically%20Correct,%27%20%27Trash%27%20and%20%27Lame%27%20Is%20Offensive&share=true', + 'info_dict': { + 'id': '5099377331001', + 'ext': 'mp4', + 'title': '82416_censoring', + 'description': '82416_censoring', + 'upload_date': '20160826', + 'timestamp': 1472169708.0, + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 521, + }, + 'params': {'skip_download': 'm3u8'}, + }, { 'url': 'http://video.foxnews.com/v/3937480/frozen-in-time/#sp=show-clips', 'md5': '32aaded6ba3ef0d1c04e238d01031e5e', @@ -22,6 +51,7 @@ class FoxNewsIE(AMPIE): 'upload_date': '20110503', 'thumbnail': r're:^https?://.*\.jpg$', }, + 'skip': '404 page', }, { 'url': 'http://video.foxnews.com/v/3922535568001/rep-luis-gutierrez-on-if-obamas-immigration-plan-is-legal/#sp=show-clips', @@ -36,10 +66,7 @@ class FoxNewsIE(AMPIE): 'upload_date': '20141204', 'thumbnail': r're:^https?://.*\.jpg$', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, + 'skip': 'm3u8 HTTP error 400 in web browser', }, { 'url': 'http://video.foxnews.com/v/video-embed.html?video_id=3937480&d=video.foxnews.com', @@ -49,11 +76,6 @@ class FoxNewsIE(AMPIE): 'url': 'http://video.foxbusiness.com/v/4442309889001', 'only_matching': True, }, - { - # From http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words - 'url': 'http://video.insider.foxnews.com/v/video-embed.html?video_id=5099377331001&autoplay=true&share_url=http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words&share_title=Student%20Group:%20Saying%20%27Politically%20Correct,%27%20%27Trash%27%20and%20%27Lame%27%20Is%20Offensive&share=true', - 'only_matching': True, - }, ] @classmethod @@ -67,10 +89,10 @@ def _extract_embed_urls(cls, url, 
webpage): yield f'https://video.foxnews.com/v/video-embed.html?video_id={mobj.group("video_id")}' def _real_extract(self, url): - host, video_id = self._match_valid_url(url).groups() + video_id = self._match_id(url) info = self._extract_feed_info( - 'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id)) + f'https://api.foxnews.com/v3/video-player/{video_id}?callback=uid_{video_id}') info['id'] = video_id return info @@ -78,6 +100,19 @@ def _real_extract(self, url): class FoxNewsVideoIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?foxnews\.com/video/(?P\d+)' _TESTS = [{ + 'url': 'https://www.foxnews.com/video/6328632286112', + 'info_dict': { + 'id': '6328632286112', + 'ext': 'mp4', + 'title': 'Review: 2023 Toyota Prius Prime', + 'duration': 155, + 'thumbnail': r're:^https://.+\.jpg$', + 'timestamp': 1685720177.0, + 'upload_date': '20230602', + 'description': 'md5:b69aafb125b41c1402e9744f53d6edc4', + }, + 'params': {'skip_download': 'm3u8'}, + }, { 'url': 'https://www.foxnews.com/video/6313058664112', 'info_dict': { 'id': '6313058664112', @@ -89,8 +124,7 @@ class FoxNewsVideoIE(InfoExtractor): 'title': 'Gutfeld! 
- Thursday, September 29', 'timestamp': 1664527538, }, - 'expected_warnings': ['Ignoring subtitle tracks'], - 'params': {'skip_download': 'm3u8'}, + 'skip': '404 page', }] def _real_extract(self, url): @@ -104,19 +138,22 @@ class FoxNewsArticleIE(InfoExtractor): _TESTS = [{ # data-video-id - 'url': 'http://www.foxnews.com/politics/2016/09/08/buzz-about-bud-clinton-camp-denies-claims-wore-earpiece-at-forum.html', - 'md5': '83d44e1aff1433e7a29a7b537d1700b5', + 'url': 'https://www.foxnews.com/politics/2016/09/08/buzz-about-bud-clinton-camp-denies-claims-wore-earpiece-at-forum.html', + 'md5': 'd2dd6ce809cedeefa96460e964821437', 'info_dict': { 'id': '5116295019001', 'ext': 'mp4', 'title': 'Trump and Clinton asked to defend positions on Iraq War', - 'description': 'Veterans react on \'The Kelly File\'', + 'description': 'Veterans and Fox News host Dana Perino react on \'The Kelly File\' to NBC\'s presidential forum', 'timestamp': 1473301045, 'upload_date': '20160908', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 426, }, + 'params': {'skip_download': 'm3u8'}, }, { # iframe embed - 'url': 'http://www.foxnews.com/us/2018/03/09/parkland-survivor-kyle-kashuv-on-meeting-trump-his-app-to-prevent-another-school-shooting.amp.html?__twitter_impression=true', + 'url': 'https://www.foxnews.com/us/2018/03/09/parkland-survivor-kyle-kashuv-on-meeting-trump-his-app-to-prevent-another-school-shooting.amp.html?__twitter_impression=true', 'info_dict': { 'id': '5748266721001', 'ext': 'flv', @@ -127,9 +164,7 @@ class FoxNewsArticleIE(InfoExtractor): 'timestamp': 1520594670, 'upload_date': '20180309', }, - 'params': { - 'skip_download': True, - }, + 'skip': '404 page', }, { 'url': 'http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words', 'only_matching': True, diff --git a/plugin/yt-dlp/yt_dlp/extractor/funker530.py b/plugin/yt-dlp/yt_dlp/extractor/funker530.py new file mode 100644 index 0000000..ba5ab7d --- /dev/null +++ 
b/plugin/yt-dlp/yt_dlp/extractor/funker530.py @@ -0,0 +1,79 @@ +from .common import InfoExtractor +from .rumble import RumbleEmbedIE +from .youtube import YoutubeIE +from ..utils import ExtractorError, clean_html, get_element_by_class, strip_or_none + + +class Funker530IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?funker530\.com/video/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://funker530.com/video/azov-patrol-caught-in-open-under-automatic-grenade-launcher-fire/', + 'md5': '085f50fea27523a388bbc22e123e09c8', + 'info_dict': { + 'id': 'v2qbmu4', + 'ext': 'mp4', + 'title': 'Azov Patrol Caught In Open Under Automatic Grenade Launcher Fire', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Funker530', + 'channel': 'Funker530', + 'channel_url': 'https://rumble.com/c/c-1199543', + 'width': 1280, + 'height': 720, + 'fps': 25, + 'duration': 27, + 'upload_date': '20230608', + 'timestamp': 1686241321, + 'live_status': 'not_live', + 'description': 'md5:bea2e1f458095414e04b5ac189c2f980', + } + }, { + 'url': 'https://funker530.com/video/my-friends-joined-the-russians-civdiv/', + 'md5': 'a42c2933391210662e93e867d7124b70', + 'info_dict': { + 'id': 'k-pk4bOvoac', + 'ext': 'mp4', + 'view_count': int, + 'channel': 'Civ Div', + 'comment_count': int, + 'channel_follower_count': int, + 'thumbnail': 'https://i.ytimg.com/vi/k-pk4bOvoac/maxresdefault.jpg', + 'uploader_id': '@CivDiv', + 'duration': 357, + 'channel_url': 'https://www.youtube.com/channel/UCgsCiwJ88up-YyMHo7hL5-A', + 'tags': [], + 'uploader_url': 'https://www.youtube.com/@CivDiv', + 'channel_id': 'UCgsCiwJ88up-YyMHo7hL5-A', + 'like_count': int, + 'description': 'md5:aef75ec3f59c07a0e39400f609b24429', + 'live_status': 'not_live', + 'age_limit': 0, + 'uploader': 'Civ Div', + 'categories': ['People & Blogs'], + 'title': 'My “Friends” joined the Russians.', + 'availability': 'public', + 'upload_date': '20230608', + 'playable_in_embed': True, + 'heatmap': 'count:100', + } + }] + + def _real_extract(self, url): + 
display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + rumble_url = list(RumbleEmbedIE._extract_embed_urls(url, webpage)) + if rumble_url: + info = {'url': rumble_url[0], 'ie_key': RumbleEmbedIE.ie_key()} + else: + youtube_url = list(YoutubeIE._extract_embed_urls(url, webpage)) + if youtube_url: + info = {'url': youtube_url[0], 'ie_key': YoutubeIE.ie_key()} + if not info: + raise ExtractorError('No videos found on webpage', expected=True) + + return { + **info, + '_type': 'url_transparent', + 'description': strip_or_none(self._search_regex( + r'(?s)(.+)About the Author', clean_html(get_element_by_class('video-desc-paragraph', webpage)), + 'description', default=None)) + } diff --git a/plugin/yt-dlp/yt_dlp/extractor/hotstar.py b/plugin/yt-dlp/yt_dlp/extractor/hotstar.py index 67c88ce..baf97db 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/hotstar.py +++ b/plugin/yt-dlp/yt_dlp/extractor/hotstar.py @@ -83,7 +83,7 @@ class HotStarIE(HotStarBaseIE): _VALID_URL = r'''(?x) https?://(?:www\.)?hotstar\.com(?:/in)?/(?!in/) (?: - (?Pmovies|sports|episode|(?Ptv))/ + (?Pmovies|sports|episode|(?Ptv|shows))/ (?(tv)(?:[^/?#]+/){2}|[^?#]*) )? 
[^/?#]+/ @@ -122,6 +122,25 @@ class HotStarIE(HotStarBaseIE): 'episode': 'Janhvi Targets Suman', 'episode_number': 8, } + }, { + 'url': 'https://www.hotstar.com/in/shows/anupama/1260022017/anupama-anuj-share-a-moment/1000282843', + 'info_dict': { + 'id': '1000282843', + 'ext': 'mp4', + 'title': 'Anupama, Anuj Share a Moment', + 'season': 'Chapter 1', + 'description': 'md5:8d74ed2248423b8b06d5c8add4d7a0c0', + 'timestamp': 1678149000, + 'channel': 'StarPlus', + 'series': 'Anupama', + 'season_number': 1, + 'season_id': 7399, + 'upload_date': '20230307', + 'episode': 'Anupama, Anuj Share a Moment', + 'episode_number': 853, + 'duration': 1272, + 'channel_id': 3, + }, }, { 'url': 'https://www.hotstar.com/movies/radha-gopalam/1000057157', 'only_matching': True, @@ -139,6 +158,7 @@ class HotStarIE(HotStarBaseIE): 'sports': 'match', 'episode': 'episode', 'tv': 'episode', + 'shows': 'episode', None: 'content', } @@ -304,13 +324,16 @@ def _real_extract(self, url): class HotStarPlaylistIE(HotStarBaseIE): IE_NAME = 'hotstar:playlist' - _VALID_URL = r'https?://(?:www\.)?hotstar\.com(?:/in)?/tv(?:/[^/]+){2}/list/[^/]+/t-(?P\w+)' + _VALID_URL = r'https?://(?:www\.)?hotstar\.com(?:/in)?/(?:tv|shows)(?:/[^/]+){2}/list/[^/]+/t-(?P\w+)' _TESTS = [{ 'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/popular-clips/t-3_2_26', 'info_dict': { 'id': '3_2_26', }, 'playlist_mincount': 20, + }, { + 'url': 'https://www.hotstar.com/shows/savdhaan-india/s-26/list/popular-clips/t-3_2_26', + 'only_matching': True, }, { 'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/extras/t-2480', 'only_matching': True, @@ -327,7 +350,7 @@ def _real_extract(self, url): class HotStarSeasonIE(HotStarBaseIE): IE_NAME = 'hotstar:season' - _VALID_URL = r'(?Phttps?://(?:www\.)?hotstar\.com(?:/in)?/tv/[^/]+/\w+)/seasons/[^/]+/ss-(?P\w+)' + _VALID_URL = r'(?Phttps?://(?:www\.)?hotstar\.com(?:/in)?/(?:tv|shows)/[^/]+/\w+)/seasons/[^/]+/ss-(?P\w+)' _TESTS = [{ 'url': 
'https://www.hotstar.com/tv/radhakrishn/1260000646/seasons/season-2/ss-8028', 'info_dict': { @@ -346,6 +369,9 @@ class HotStarSeasonIE(HotStarBaseIE): 'id': '8208', }, 'playlist_mincount': 19, + }, { + 'url': 'https://www.hotstar.com/in/shows/bigg-boss/14714/seasons/season-4/ss-8208/', + 'only_matching': True, }] def _real_extract(self, url): @@ -356,7 +382,7 @@ def _real_extract(self, url): class HotStarSeriesIE(HotStarBaseIE): IE_NAME = 'hotstar:series' - _VALID_URL = r'(?Phttps?://(?:www\.)?hotstar\.com(?:/in)?/tv/[^/]+/(?P\d+))/?(?:[#?]|$)' + _VALID_URL = r'(?Phttps?://(?:www\.)?hotstar\.com(?:/in)?/(?:tv|shows)/[^/]+/(?P\d+))/?(?:[#?]|$)' _TESTS = [{ 'url': 'https://www.hotstar.com/in/tv/radhakrishn/1260000646', 'info_dict': { @@ -375,6 +401,12 @@ class HotStarSeriesIE(HotStarBaseIE): 'id': '435', }, 'playlist_mincount': 267, + }, { + 'url': 'https://www.hotstar.com/in/shows/anupama/1260022017/', + 'info_dict': { + 'id': '1260022017', + }, + 'playlist_mincount': 940, }] def _real_extract(self, url): diff --git a/plugin/yt-dlp/yt_dlp/extractor/idolplus.py b/plugin/yt-dlp/yt_dlp/extractor/idolplus.py new file mode 100644 index 0000000..3c905b0 --- /dev/null +++ b/plugin/yt-dlp/yt_dlp/extractor/idolplus.py @@ -0,0 +1,115 @@ +from .common import InfoExtractor +from ..utils import traverse_obj, try_call, url_or_none + + +class IdolPlusIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?idolplus\.com/z[us]/(?:concert/|contents/?\?(?:[^#]+&)?albumId=)(?P\w+)' + _TESTS = [{ + 'url': 'https://idolplus.com/zs/contents?albumId=M012077298PPV00', + 'md5': '2ace3f4661c943a2f7e79f0b88cea1e7', + 'info_dict': { + 'id': 'M012077298PPV00', + 'ext': 'mp4', + 'title': '[MultiCam] Aegyo on Top of Aegyo (IZ*ONE EATING TRIP)', + 'release_date': '20200707', + 'formats': 'count:65', + }, + 'params': {'format': '532-KIM_MINJU'}, + }, { + 'url': 'https://idolplus.com/zs/contents?albumId=M01232H058PPV00&catId=E9TX5', + 'info_dict': { + 'id': 'M01232H058PPV00', + 'ext': 'mp4', + 
'title': 'YENA (CIRCLE CHART MUSIC AWARDS 2022 RED CARPET)', + 'release_date': '20230218', + 'formats': 'count:5', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + # live stream + 'url': 'https://idolplus.com/zu/contents?albumId=M012323174PPV00', + 'info_dict': { + 'id': 'M012323174PPV00', + 'ext': 'mp4', + 'title': 'Hanteo Music Awards 2022 DAY2', + 'release_date': '20230211', + 'formats': 'count:5', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://idolplus.com/zs/concert/M012323039PPV00', + 'info_dict': { + 'id': 'M012323039PPV00', + 'ext': 'mp4', + 'title': 'CIRCLE CHART MUSIC AWARDS 2022', + 'release_date': '20230218', + 'formats': 'count:5', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + data_list = traverse_obj(self._download_json( + 'https://idolplus.com/api/zs/viewdata/ruleset/build', video_id, + headers={'App_type': 'web', 'Country_Code': 'KR'}, query={ + 'rulesetId': 'contents', + 'albumId': video_id, + 'distribute': 'PRD', + 'loggedIn': 'false', + 'region': 'zs', + 'countryGroup': '00010', + 'lang': 'en', + 'saId': '999999999998', + }), ('data', 'viewData', ...)) + + player_data = {} + while data_list: + player_data = data_list.pop() + if traverse_obj(player_data, 'type') == 'player': + break + elif traverse_obj(player_data, ('dataList', ...)): + data_list += player_data['dataList'] + + formats = self._extract_m3u8_formats(traverse_obj(player_data, ( + 'vodPlayerList', 'vodProfile', 0, 'vodServer', 0, 'video_url', {url_or_none})), video_id) + + subtitles = {} + for caption in traverse_obj(player_data, ('vodPlayerList', 'caption')) or []: + subtitles.setdefault(caption.get('lang') or 'und', []).append({ + 'url': caption.get('smi_url'), + 'ext': 'vtt', + }) + + # Add member multicams as alternative formats + if (traverse_obj(player_data, ('detail', 'has_cuesheet')) == 'Y' + and traverse_obj(player_data, ('detail', 'is_omni_member')) == 'Y'): + cuesheet = 
traverse_obj(self._download_json( + 'https://idolplus.com/gapi/contents/v1.0/content/cuesheet', video_id, + 'Downloading JSON metadata for member multicams', + headers={'App_type': 'web', 'Country_Code': 'KR'}, query={ + 'ALBUM_ID': video_id, + 'COUNTRY_GRP': '00010', + 'LANG': 'en', + 'SA_ID': '999999999998', + 'COUNTRY_CODE': 'KR', + }), ('data', 'cuesheet_item', 0)) + + for member in traverse_obj(cuesheet, ('members', ...)): + index = try_call(lambda: int(member['omni_view_index']) - 1) + member_video_url = traverse_obj(cuesheet, ('omni_view', index, 'cdn_url', 0, 'url', {url_or_none})) + if not member_video_url: + continue + member_formats = self._extract_m3u8_formats( + member_video_url, video_id, note=f'Downloading m3u8 for multicam {member["name"]}') + for mf in member_formats: + mf['format_id'] = f'{mf["format_id"]}-{member["name"].replace(" ", "_")}' + formats.extend(member_formats) + + return { + 'id': video_id, + 'title': traverse_obj(player_data, ('detail', 'albumName')), + 'formats': formats, + 'subtitles': subtitles, + 'release_date': traverse_obj(player_data, ('detail', 'broadcastDate')), + } diff --git a/plugin/yt-dlp/yt_dlp/extractor/iwara.py b/plugin/yt-dlp/yt_dlp/extractor/iwara.py index 6b1fc0d..78c3033 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/iwara.py +++ b/plugin/yt-dlp/yt_dlp/extractor/iwara.py @@ -1,68 +1,83 @@ import functools import urllib.parse +import urllib.error import hashlib import json +import time from .common import InfoExtractor from ..utils import ( ExtractorError, OnDemandPagedList, int_or_none, + jwt_decode_hs256, mimetype2ext, qualities, traverse_obj, + try_call, unified_timestamp, ) -# https://github.com/yt-dlp/yt-dlp/issues/6671 class IwaraBaseIE(InfoExtractor): + _NETRC_MACHINE = 'iwara' _USERTOKEN = None _MEDIATOKEN = None - _NETRC_MACHINE = 'iwara' - def _get_user_token(self, invalidate=False): - if not invalidate and self._USERTOKEN: - return self._USERTOKEN + def _is_token_expired(self, token, token_type): + # User 
token TTL == ~3 weeks, Media token TTL == ~1 hour + if (try_call(lambda: jwt_decode_hs256(token)['exp']) or 0) <= int(time.time() - 120): + self.to_screen(f'{token_type} token has expired') + return True + def _get_user_token(self): username, password = self._get_login_info() - IwaraBaseIE._USERTOKEN = username and self.cache.load(self._NETRC_MACHINE, username) - if not IwaraBaseIE._USERTOKEN or invalidate: - IwaraBaseIE._USERTOKEN = self._download_json( + if not username or not password: + return + + user_token = IwaraBaseIE._USERTOKEN or self.cache.load(self._NETRC_MACHINE, username) + if not user_token or self._is_token_expired(user_token, 'User'): + response = self._download_json( 'https://api.iwara.tv/user/login', None, note='Logging in', - data=json.dumps({ + headers={'Content-Type': 'application/json'}, data=json.dumps({ 'email': username, 'password': password - }).encode('utf-8'), - headers={ + }).encode(), expected_status=lambda x: True) + user_token = traverse_obj(response, ('token', {str})) + if not user_token: + error = traverse_obj(response, ('message', {str})) + if 'invalidLogin' in error: + raise ExtractorError('Invalid login credentials', expected=True) + else: + raise ExtractorError(f'Iwara API said: {error or "nothing"}') + + self.cache.store(self._NETRC_MACHINE, username, user_token) + + IwaraBaseIE._USERTOKEN = user_token + + def _get_media_token(self): + self._get_user_token() + if not IwaraBaseIE._USERTOKEN: + return # user has not passed credentials + + if not IwaraBaseIE._MEDIATOKEN or self._is_token_expired(IwaraBaseIE._MEDIATOKEN, 'Media'): + IwaraBaseIE._MEDIATOKEN = self._download_json( + 'https://api.iwara.tv/user/token', None, note='Fetching media token', + data=b'', headers={ + 'Authorization': f'Bearer {IwaraBaseIE._USERTOKEN}', 'Content-Type': 'application/json' - })['token'] - - self.cache.store(self._NETRC_MACHINE, username, IwaraBaseIE._USERTOKEN) + })['accessToken'] - return self._USERTOKEN + return {'Authorization': f'Bearer 
{IwaraBaseIE._MEDIATOKEN}'} - def _get_media_token(self, invalidate=False): - if not invalidate and self._MEDIATOKEN: - return self._MEDIATOKEN - - IwaraBaseIE._MEDIATOKEN = self._download_json( - 'https://api.iwara.tv/user/token', None, note='Fetching media token', - data=b'', # Need to have some data here, even if it's empty - headers={ - 'Authorization': f'Bearer {self._get_user_token()}', - 'Content-Type': 'application/json' - })['accessToken'] - - return self._MEDIATOKEN + def _perform_login(self, username, password): + self._get_media_token() class IwaraIE(IwaraBaseIE): IE_NAME = 'iwara' _VALID_URL = r'https?://(?:www\.|ecchi\.)?iwara\.tv/videos?/(?P[a-zA-Z0-9]+)' _TESTS = [{ - # this video cannot be played because of migration - 'only_matching': True, 'url': 'https://www.iwara.tv/video/k2ayoueezfkx6gvq', 'info_dict': { 'id': 'k2ayoueezfkx6gvq', @@ -79,25 +94,29 @@ class IwaraIE(IwaraBaseIE): 'timestamp': 1677843869, 'modified_timestamp': 1679056362, }, + 'skip': 'this video cannot be played because of migration', }, { 'url': 'https://iwara.tv/video/1ywe1sbkqwumpdxz5/', - 'md5': '20691ce1473ec2766c0788e14c60ce66', + 'md5': '7645f966f069b8ec9210efd9130c9aad', 'info_dict': { 'id': '1ywe1sbkqwumpdxz5', 'ext': 'mp4', 'age_limit': 18, - 'title': 'Aponia 阿波尼亚SEX Party Tonight 手动脱衣 大奶 裸腿', - 'description': 'md5:0c4c310f2e0592d68b9f771d348329ca', - 'uploader': '龙也zZZ', + 'title': 'Aponia アポニア SEX Party Tonight 手の脱衣 巨乳 ', + 'description': 'md5:3f60016fff22060eef1ef26d430b1f67', + 'uploader': 'Lyu ya', 'uploader_id': 'user792540', 'tags': [ 'uncategorized' ], - 'like_count': 1809, - 'view_count': 25156, - 'comment_count': 1, + 'like_count': int, + 'view_count': int, + 'comment_count': int, 'timestamp': 1678732213, - 'modified_timestamp': 1679110271, + 'modified_timestamp': int, + 'thumbnail': 'https://files.iwara.tv/image/thumbnail/581d12b5-46f4-4f15-beb2-cfe2cde5d13d/thumbnail-00.jpg', + 'modified_date': '20230614', + 'upload_date': '20230313', }, }, { 'url': 
'https://iwara.tv/video/blggmfno8ghl725bg', @@ -112,12 +131,15 @@ class IwaraIE(IwaraBaseIE): 'tags': [ 'pee' ], - 'like_count': 192, - 'view_count': 12119, - 'comment_count': 0, + 'like_count': int, + 'view_count': int, + 'comment_count': int, 'timestamp': 1598880567, - 'modified_timestamp': 1598908995, - 'availability': 'needs_auth', + 'modified_timestamp': int, + 'upload_date': '20200831', + 'modified_date': '20230605', + 'thumbnail': 'https://files.iwara.tv/image/thumbnail/7693e881-d302-42a4-a780-f16d66b5dadd/thumbnail-00.jpg', + # 'availability': 'needs_auth', }, }] @@ -142,17 +164,16 @@ def _extract_formats(self, video_id, fileurl): def _real_extract(self, url): video_id = self._match_id(url) - username, password = self._get_login_info() - headers = { - 'Authorization': f'Bearer {self._get_media_token()}', - } if username and password else None - video_data = self._download_json(f'https://api.iwara.tv/video/{video_id}', video_id, expected_status=lambda x: True, headers=headers) + username, _ = self._get_login_info() + video_data = self._download_json( + f'https://api.iwara.tv/video/{video_id}', video_id, + expected_status=lambda x: True, headers=self._get_media_token()) errmsg = video_data.get('message') # at this point we can actually get uploaded user info, but do we need it? if errmsg == 'errors.privateVideo': - self.raise_login_required('Private video. Login if you have permissions to watch') + self.raise_login_required('Private video. 
Login if you have permissions to watch', method='password') elif errmsg == 'errors.notFound' and not username: - self.raise_login_required('Video may need login to view') + self.raise_login_required('Video may need login to view', method='password') elif errmsg: # None if success raise ExtractorError(f'Iwara says: {errmsg}') @@ -181,15 +202,6 @@ def _real_extract(self, url): 'formats': list(self._extract_formats(video_id, video_data.get('fileUrl'))), } - def _perform_login(self, username, password): - if self.cache.load(self._NETRC_MACHINE, username) and self._get_media_token(): - self.write_debug('Skipping logging in') - return - - IwaraBaseIE._USERTOKEN = self._get_user_token(True) - self._get_media_token(True) - self.cache.store(self._NETRC_MACHINE, username, IwaraBaseIE._USERTOKEN) - class IwaraUserIE(IwaraBaseIE): _VALID_URL = r'https?://(?:www\.)?iwara\.tv/profile/(?P[^/?#&]+)' @@ -200,12 +212,14 @@ class IwaraUserIE(IwaraBaseIE): 'url': 'https://iwara.tv/profile/user792540/videos', 'info_dict': { 'id': 'user792540', + 'title': 'Lyu ya', }, - 'playlist_mincount': 80, + 'playlist_mincount': 70, }, { 'url': 'https://iwara.tv/profile/theblackbirdcalls/videos', 'info_dict': { 'id': 'theblackbirdcalls', + 'title': 'TheBlackbirdCalls', }, 'playlist_mincount': 723, }, { @@ -214,6 +228,13 @@ class IwaraUserIE(IwaraBaseIE): }, { 'url': 'https://iwara.tv/profile/theblackbirdcalls', 'only_matching': True, + }, { + 'url': 'https://www.iwara.tv/profile/lumymmd', + 'info_dict': { + 'id': 'lumymmd', + 'title': 'Lumy MMD', + }, + 'playlist_mincount': 1, }] def _entries(self, playlist_id, user_id, page): @@ -225,7 +246,7 @@ def _entries(self, playlist_id, user_id, page): 'sort': 'date', 'user': user_id, 'limit': self._PER_PAGE, - }) + }, headers=self._get_media_token()) for x in traverse_obj(videos, ('results', ..., 'id')): yield self.url_result(f'https://iwara.tv/video/{x}') @@ -244,7 +265,6 @@ def _real_extract(self, url): class IwaraPlaylistIE(IwaraBaseIE): - # the ID is 
an UUID but I don't think it's necessary to write concrete regex _VALID_URL = r'https?://(?:www\.)?iwara\.tv/playlist/(?P[0-9a-f-]+)' IE_NAME = 'iwara:playlist' _PER_PAGE = 32 @@ -260,7 +280,8 @@ class IwaraPlaylistIE(IwaraBaseIE): def _entries(self, playlist_id, first_page, page): videos = self._download_json( 'https://api.iwara.tv/videos', playlist_id, f'Downloading page {page}', - query={'page': page, 'limit': self._PER_PAGE}) if page else first_page + query={'page': page, 'limit': self._PER_PAGE}, + headers=self._get_media_token()) if page else first_page for x in traverse_obj(videos, ('results', ..., 'id')): yield self.url_result(f'https://iwara.tv/video/{x}') @@ -268,7 +289,7 @@ def _real_extract(self, url): playlist_id = self._match_id(url) page_0 = self._download_json( f'https://api.iwara.tv/playlist/{playlist_id}?page=0&limit={self._PER_PAGE}', playlist_id, - note='Requesting playlist info') + note='Requesting playlist info', headers=self._get_media_token()) return self.playlist_result( OnDemandPagedList( diff --git a/plugin/yt-dlp/yt_dlp/extractor/jstream.py b/plugin/yt-dlp/yt_dlp/extractor/jstream.py new file mode 100644 index 0000000..3e2e627 --- /dev/null +++ b/plugin/yt-dlp/yt_dlp/extractor/jstream.py @@ -0,0 +1,73 @@ +import base64 +import re +import json + +from .common import InfoExtractor +from ..utils import ( + float_or_none, + js_to_json, + remove_start, +) + + +class JStreamIE(InfoExtractor): + # group "id" only exists for compliance, not directly used in requests + # also all components are mandatory + _VALID_URL = r'jstream:(?Pwww\d+):(?P(?P[a-z0-9]+):(?P\d+))' + + _TESTS = [{ + 'url': 'jstream:www50:eqd638pvwx:752', + 'info_dict': { + 'id': 'eqd638pvwx:752', + 'ext': 'mp4', + 'title': '阪神淡路大震災 激震の記録2020年版 解説動画', + 'duration': 672, + 'thumbnail': r're:https?://eqd638pvwx\.eq\.webcdn\.stream\.ne\.jp/.+\.jpg', + }, + }] + + def _parse_jsonp(self, callback, string, video_id): + return self._search_json(rf'\s*{re.escape(callback)}\s*\(', string, 
callback, video_id) + + def _find_formats(self, video_id, movie_list_hls, host, publisher, subtitles): + for value in movie_list_hls: + text = value.get('text') or '' + if not text.startswith('auto'): + continue + m3u8_id = remove_start(remove_start(text, 'auto'), '_') or None + fmts, subs = self._extract_m3u8_formats_and_subtitles( + f'https://{publisher}.eq.webcdn.stream.ne.jp/{host}/{publisher}/jmc_pub/{value.get("url")}', video_id, 'mp4', m3u8_id=m3u8_id) + self._merge_subtitles(subs, target=subtitles) + yield from fmts + + def _real_extract(self, url): + host, publisher, mid, video_id = self._match_valid_url(url).group('host', 'publisher', 'mid', 'id') + video_info_jsonp = self._download_webpage( + f'https://{publisher}.eq.webcdn.stream.ne.jp/{host}/{publisher}/jmc_pub/eq_meta/v1/{mid}.jsonp', + video_id, 'Requesting video info') + video_info = self._parse_jsonp('metaDataResult', video_info_jsonp, video_id)['movie'] + subtitles = {} + formats = list(self._find_formats(video_id, video_info.get('movie_list_hls'), host, publisher, subtitles)) + self._remove_duplicate_formats(formats) + return { + 'id': video_id, + 'title': video_info.get('title'), + 'duration': float_or_none(video_info.get('duration')), + 'thumbnail': video_info.get('thumbnail_url'), + 'formats': formats, + 'subtitles': subtitles, + } + + @classmethod + def _extract_embed_urls(cls, url, webpage): + # check for eligiblity of webpage + # https://support.eq.stream.co.jp/hc/ja/articles/115008388147-%E3%83%97%E3%83%AC%E3%82%A4%E3%83%A4%E3%83%BCAPI%E3%81%AE%E3%82%B5%E3%83%B3%E3%83%97%E3%83%AB%E3%82%B3%E3%83%BC%E3%83%89 + script_tag = re.search(r']+?src="https://ssl-cache\.stream\.ne\.jp/(?Pwww\d+)/(?P[a-z0-9]+)/[^"]+?/if\.js"', webpage) + if not script_tag: + return + host, publisher = script_tag.groups() + for m in re.finditer(r'(?s)PlayerFactoryIF\.create\(\s*({[^\}]+?})\s*\)\s*;', webpage): + # TODO: using json.loads here as InfoExtractor._parse_json is not classmethod + info = 
json.loads(js_to_json(m.group(1))) + mid = base64.b64decode(info.get('m')).decode() + yield f'jstream:{host}:{publisher}:{mid}' diff --git a/plugin/yt-dlp/yt_dlp/extractor/ketnet.py b/plugin/yt-dlp/yt_dlp/extractor/ketnet.py deleted file mode 100644 index 8f22294..0000000 --- a/plugin/yt-dlp/yt_dlp/extractor/ketnet.py +++ /dev/null @@ -1,70 +0,0 @@ -from .canvas import CanvasIE -from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote -from ..utils import ( - int_or_none, - parse_iso8601, -) - - -class KetnetIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?P(?:[^/]+/)*[^/?#&]+)' - _TESTS = [{ - 'url': 'https://www.ketnet.be/kijken/n/nachtwacht/3/nachtwacht-s3a1-de-greystook', - 'md5': '37b2b7bb9b3dcaa05b67058dc3a714a9', - 'info_dict': { - 'id': 'pbs-pub-aef8b526-115e-4006-aa24-e59ff6c6ef6f$vid-ddb815bf-c8e7-467b-8879-6bad7a32cebd', - 'ext': 'mp4', - 'title': 'Nachtwacht - Reeks 3: Aflevering 1', - 'description': 'De Nachtwacht krijgt te maken met een parasiet', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1468.02, - 'timestamp': 1609225200, - 'upload_date': '20201229', - 'series': 'Nachtwacht', - 'season': 'Reeks 3', - 'episode': 'De Greystook', - 'episode_number': 1, - }, - 'expected_warnings': ['is not a supported codec', 'Unknown MIME type'], - }, { - 'url': 'https://www.ketnet.be/themas/karrewiet/jaaroverzicht-20200/karrewiet-het-jaar-van-black-mamba', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - - video = self._download_json( - 'https://senior-bff.ketnet.be/graphql', display_id, query={ - 'query': '''{ - video(id: "content/ketnet/nl/%s.model.json") { - description - episodeNr - imageUrl - mediaReference - programTitle - publicationDate - seasonTitle - subtitleVideodetail - titleVideodetail - } -}''' % display_id, - })['data']['video'] - - mz_id = compat_urllib_parse_unquote(video['mediaReference']) - - return { - '_type': 'url_transparent', - 'id': 
mz_id, - 'title': video['titleVideodetail'], - 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/' + mz_id, - 'thumbnail': video.get('imageUrl'), - 'description': video.get('description'), - 'timestamp': parse_iso8601(video.get('publicationDate')), - 'series': video.get('programTitle'), - 'season': video.get('seasonTitle'), - 'episode': video.get('subtitleVideodetail'), - 'episode_number': int_or_none(video.get('episodeNr')), - 'ie_key': CanvasIE.ie_key(), - } diff --git a/plugin/yt-dlp/yt_dlp/extractor/lbry.py b/plugin/yt-dlp/yt_dlp/extractor/lbry.py index 790f726..4e3b161 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/lbry.py +++ b/plugin/yt-dlp/yt_dlp/extractor/lbry.py @@ -1,8 +1,8 @@ import functools import json +import urllib.parse from .common import InfoExtractor -from ..compat import compat_str, compat_urllib_parse_unquote from ..utils import ( ExtractorError, HEADRequest, @@ -12,7 +12,10 @@ int_or_none, mimetype2ext, parse_qs, + traverse_obj, try_get, + url_or_none, + urlhandle_detect_ext, urljoin, ) @@ -52,38 +55,25 @@ def _permanent_url(self, url, claim_name, claim_id): '/%s:%s' % (claim_name, claim_id)) def _parse_stream(self, stream, url): - stream_value = stream.get('value') or {} - stream_type = stream_value.get('stream_type') - source = stream_value.get('source') or {} - media = stream_value.get(stream_type) or {} - signing_channel = stream.get('signing_channel') or {} - channel_name = signing_channel.get('name') - channel_claim_id = signing_channel.get('claim_id') - channel_url = None - if channel_name and channel_claim_id: - channel_url = self._permanent_url(url, channel_name, channel_claim_id) + stream_type = traverse_obj(stream, ('value', 'stream_type', {str})) + + info = traverse_obj(stream, { + 'title': ('value', 'title', {str}), + 'thumbnail': ('value', 'thumbnail', 'url', {url_or_none}), + 'description': ('value', 'description', {str}), + 'license': ('value', 'license', {str}), + 'timestamp': ('timestamp', {int_or_none}), + 
'release_timestamp': ('value', 'release_time', {int_or_none}), + 'tags': ('value', 'tags', ..., {lambda x: x or None}), + 'duration': ('value', stream_type, 'duration', {int_or_none}), + 'channel': ('signing_channel', 'value', 'title', {str}), + 'channel_id': ('signing_channel', 'claim_id', {str}), + }) + + channel_name = traverse_obj(stream, ('signing_channel', 'name', {str})) + if channel_name and info.get('channel_id'): + info['channel_url'] = self._permanent_url(url, channel_name, info['channel_id']) - info = { - 'thumbnail': try_get(stream_value, lambda x: x['thumbnail']['url'], compat_str), - 'description': stream_value.get('description'), - 'license': stream_value.get('license'), - 'timestamp': int_or_none(stream.get('timestamp')), - 'release_timestamp': int_or_none(stream_value.get('release_time')), - 'tags': stream_value.get('tags'), - 'duration': int_or_none(media.get('duration')), - 'channel': try_get(signing_channel, lambda x: x['value']['title']), - 'channel_id': channel_claim_id, - 'channel_url': channel_url, - 'ext': determine_ext(source.get('name')) or mimetype2ext(source.get('media_type')), - 'filesize': int_or_none(source.get('size')), - } - if stream_type == 'audio': - info['vcodec'] = 'none' - else: - info.update({ - 'width': int_or_none(media.get('width')), - 'height': int_or_none(media.get('height')), - }) return info @@ -186,6 +176,28 @@ class LBRYIE(LBRYBaseIE): 'license': 'None', }, 'params': {'skip_download': True} + }, { + # original quality format w/higher resolution than HLS formats + 'url': 'https://odysee.com/@wickedtruths:2/Biotechnological-Invasion-of-Skin-(April-2023):4', + 'md5': '305b0b3b369bde1b984961f005b67193', + 'info_dict': { + 'id': '41fbfe805eb73c8d3012c0c49faa0f563274f634', + 'ext': 'mp4', + 'title': 'Biotechnological Invasion of Skin (April 2023)', + 'description': 'md5:709a2f4c07bd8891cda3a7cc2d6fcf5c', + 'channel': 'Wicked Truths', + 'channel_id': '23d2bbf856b0ceed5b1d7c5960bcc72da5a20cb0', + 'channel_url': 
'https://odysee.com/@wickedtruths:23d2bbf856b0ceed5b1d7c5960bcc72da5a20cb0', + 'timestamp': 1685790036, + 'upload_date': '20230603', + 'release_timestamp': 1685617473, + 'release_date': '20230601', + 'duration': 1063, + 'thumbnail': 'https://thumbs.odycdn.com/4e6d39da4df0cfdad45f64e253a15959.webp', + 'tags': ['smart skin surveillance', 'biotechnology invasion of skin', 'morgellons'], + 'license': 'None', + 'protocol': 'https', # test for direct mp4 download + }, }, { 'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e', 'only_matching': True, @@ -221,41 +233,64 @@ def _real_extract(self, url): display_id = display_id.split('/', 2)[-1].replace('/', ':') else: display_id = display_id.replace(':', '#') - display_id = compat_urllib_parse_unquote(display_id) + display_id = urllib.parse.unquote(display_id) uri = 'lbry://' + display_id result = self._resolve_url(uri, display_id, 'stream') headers = {'Referer': 'https://odysee.com/'} - if result['value'].get('stream_type') in self._SUPPORTED_STREAM_TYPES: + + formats = [] + stream_type = traverse_obj(result, ('value', 'stream_type', {str})) + + if stream_type in self._SUPPORTED_STREAM_TYPES: claim_id, is_live = result['claim_id'], False streaming_url = self._call_api_proxy( 'get', claim_id, {'uri': uri}, 'streaming url')['streaming_url'] + + # GET request returns original video/audio file if available + ext = urlhandle_detect_ext(self._request_webpage( + streaming_url, display_id, 'Checking for original quality', headers=headers)) + if ext != 'm3u8': + formats.append({ + 'url': streaming_url, + 'format_id': 'original', + 'quality': 1, + **traverse_obj(result, ('value', { + 'ext': ('source', (('name', {determine_ext}), ('media_type', {mimetype2ext}))), + 'filesize': ('source', 'size', {int_or_none}), + 'width': ('video', 'width', {int_or_none}), + 'height': ('video', 'height', {int_or_none}), + }), get_all=False), + 'vcodec': 'none' if stream_type == 'audio' else None, + }) + + # HEAD 
request returns redirect response to m3u8 URL if available final_url = self._request_webpage( HEADRequest(streaming_url), display_id, headers=headers, note='Downloading streaming redirect url info').geturl() + elif result.get('value_type') == 'stream': claim_id, is_live = result['signing_channel']['claim_id'], True live_data = self._download_json( 'https://api.odysee.live/livestream/is_live', claim_id, query={'channel_claim_id': claim_id}, note='Downloading livestream JSON metadata')['data'] - streaming_url = final_url = live_data.get('VideoURL') + final_url = live_data.get('VideoURL') # Upcoming videos may still give VideoURL if not live_data.get('Live'): - streaming_url = final_url = None + final_url = None self.raise_no_formats('This stream is not live', True, claim_id) + else: raise UnsupportedError(url) - info = self._parse_stream(result, url) if determine_ext(final_url) == 'm3u8': - info['formats'] = self._extract_m3u8_formats( - final_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls', live=is_live, headers=headers) - else: - info['url'] = streaming_url + formats.extend(self._extract_m3u8_formats( + final_url, display_id, 'mp4', m3u8_id='hls', live=is_live, headers=headers)) + return { - **info, + **self._parse_stream(result, url), 'id': claim_id, - 'title': result['value']['title'], + 'formats': formats, 'is_live': is_live, 'http_headers': headers, } @@ -299,14 +334,12 @@ def _fetch_page(self, claim_id, url, params, page): if not (stream_claim_name and stream_claim_id): continue - info = self._parse_stream(item, url) - info.update({ + yield { + **self._parse_stream(item, url), '_type': 'url', 'id': stream_claim_id, - 'title': try_get(item, lambda x: x['value']['title']), 'url': self._permanent_url(url, stream_claim_name, stream_claim_id), - }) - yield info + } def _real_extract(self, url): display_id = self._match_id(url).replace(':', '#') diff --git a/plugin/yt-dlp/yt_dlp/extractor/litv.py b/plugin/yt-dlp/yt_dlp/extractor/litv.py index 40f22a2..8968451 
100644 --- a/plugin/yt-dlp/yt_dlp/extractor/litv.py +++ b/plugin/yt-dlp/yt_dlp/extractor/litv.py @@ -4,8 +4,8 @@ from ..utils import ( ExtractorError, int_or_none, - traverse_obj, smuggle_url, + traverse_obj, unsmuggle_url, ) @@ -113,7 +113,7 @@ def _real_extract(self, url): entry_protocol='m3u8_native', m3u8_id='hls') for a_format in formats: # LiTV HLS segments doesn't like compressions - a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = True + a_format.setdefault('http_headers', {})['Accept-Encoding'] = 'identity' title = program_info['title'] + program_info.get('secondaryMark', '') description = program_info.get('description') diff --git a/plugin/yt-dlp/yt_dlp/extractor/livestream.py b/plugin/yt-dlp/yt_dlp/extractor/livestream.py index e32511b..c43cb1d 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/livestream.py +++ b/plugin/yt-dlp/yt_dlp/extractor/livestream.py @@ -1,33 +1,36 @@ -import re import itertools +import re from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) +from ..compat import compat_str, compat_urlparse from ..utils import ( + determine_ext, find_xpath_attr, - xpath_attr, - xpath_with_ns, - xpath_text, - orderedSet, - update_url_query, - int_or_none, float_or_none, + int_or_none, + orderedSet, parse_iso8601, - determine_ext, + traverse_obj, + update_url_query, + xpath_attr, + xpath_text, + xpath_with_ns, ) class LivestreamIE(InfoExtractor): IE_NAME = 'livestream' - _VALID_URL = r'https?://(?:new\.)?livestream\.com/(?:accounts/(?P\d+)|(?P[^/]+))/(?:events/(?P\d+)|(?P[^/]+))(?:/videos/(?P\d+))?' + _VALID_URL = r'''(?x) + https?://(?:new\.)?livestream\.com/ + (?:accounts/(?P\d+)|(?P[^/]+)) + (?:/events/(?P\d+)|/(?P[^/]+))? + (?:/videos/(?P\d+))? 
+ ''' _EMBED_REGEX = [r']+src="(?Phttps?://(?:new\.)?livestream\.com/[^"]+/player[^"]+)"'] _TESTS = [{ 'url': 'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370', - 'md5': '53274c76ba7754fb0e8d072716f2292b', + 'md5': '7876c5f5dc3e711b6b73acce4aac1527', 'info_dict': { 'id': '4719370', 'ext': 'mp4', @@ -37,22 +40,37 @@ class LivestreamIE(InfoExtractor): 'duration': 5968.0, 'like_count': int, 'view_count': int, + 'comment_count': int, 'thumbnail': r're:^http://.*\.jpg$' } }, { - 'url': 'http://new.livestream.com/tedx/cityenglish', + 'url': 'https://livestream.com/coheedandcambria/websterhall', 'info_dict': { - 'title': 'TEDCity2.0 (English)', - 'id': '2245590', + 'id': '1585861', + 'title': 'Live From Webster Hall' }, - 'playlist_mincount': 4, + 'playlist_mincount': 1, }, { - 'url': 'http://new.livestream.com/chess24/tatasteelchess', + 'url': 'https://livestream.com/dayananda/events/7954027', 'info_dict': { - 'title': 'Tata Steel Chess', - 'id': '3705884', + 'title': 'Live from Mevo', + 'id': '7954027', }, - 'playlist_mincount': 60, + 'playlist_mincount': 4, + }, { + 'url': 'https://livestream.com/accounts/82', + 'info_dict': { + 'id': '253978', + 'view_count': int, + 'title': 'trsr', + 'comment_count': int, + 'like_count': int, + 'upload_date': '20120306', + 'timestamp': 1331042383, + 'thumbnail': 'http://img.new.livestream.com/videos/0000000000000372/cacbeed6-fb68-4b5e-ad9c-e148124e68a9_640x427.jpg', + 'duration': 15.332, + 'ext': 'mp4' + } }, { 'url': 'https://new.livestream.com/accounts/362/events/3557232/videos/67864563/player?autoPlay=false&height=360&mute=false&width=640', 'only_matching': True, @@ -179,7 +197,7 @@ def _extract_stream_info(self, stream_info): 'is_live': is_live, } - def _extract_event(self, event_data): + def _generate_event_playlist(self, event_data): event_id = compat_str(event_data['id']) account_id = compat_str(event_data['owner_account_id']) feed_root_url = self._API_URL_TEMPLATE % (account_id, event_id) + '/feed.json' 
@@ -189,7 +207,6 @@ def _extract_event(self, event_data): return self._extract_stream_info(stream_info) last_video = None - entries = [] for i in itertools.count(1): if last_video is None: info_url = feed_root_url @@ -197,31 +214,38 @@ def _extract_event(self, event_data): info_url = '{root}?&id={id}&newer=-1&type=video'.format( root=feed_root_url, id=last_video) videos_info = self._download_json( - info_url, event_id, 'Downloading page {0}'.format(i))['data'] + info_url, event_id, f'Downloading page {i}')['data'] videos_info = [v['data'] for v in videos_info if v['type'] == 'video'] if not videos_info: break for v in videos_info: v_id = compat_str(v['id']) - entries.append(self.url_result( - 'http://livestream.com/accounts/%s/events/%s/videos/%s' % (account_id, event_id, v_id), - 'Livestream', v_id, v.get('caption'))) + yield self.url_result( + f'http://livestream.com/accounts/{account_id}/events/{event_id}/videos/{v_id}', + LivestreamIE, v_id, v.get('caption')) last_video = videos_info[-1]['id'] - return self.playlist_result(entries, event_id, event_data['full_name']) def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') event = mobj.group('event_id') or mobj.group('event_name') account = mobj.group('account_id') or mobj.group('account_name') - api_url = self._API_URL_TEMPLATE % (account, event) + api_url = f'http://livestream.com/api/accounts/{account}' + if video_id: video_data = self._download_json( - api_url + '/videos/%s' % video_id, video_id) + f'{api_url}/events/{event}/videos/{video_id}', video_id) return self._extract_video_info(video_data) - else: - event_data = self._download_json(api_url, video_id) - return self._extract_event(event_data) + elif event: + event_data = self._download_json(f'{api_url}/events/{event}', None) + return self.playlist_result( + self._generate_event_playlist(event_data), str(event_data['id']), event_data['full_name']) + + account_data = self._download_json(api_url, None) + items = 
traverse_obj(account_data, (('upcoming_events', 'past_events'), 'data', ...)) + return self.playlist_result( + itertools.chain.from_iterable(map(self._generate_event_playlist, items)), + account_data.get('id'), account_data.get('full_name')) # The original version of Livestream uses a different system diff --git a/plugin/yt-dlp/yt_dlp/extractor/mgtv.py b/plugin/yt-dlp/yt_dlp/extractor/mgtv.py index 1b65dde..17803f1 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/mgtv.py +++ b/plugin/yt-dlp/yt_dlp/extractor/mgtv.py @@ -1,17 +1,17 @@ import base64 import time +import urllib.error import uuid from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, -) from ..utils import ( ExtractorError, int_or_none, + parse_resolution, + traverse_obj, try_get, url_or_none, + urljoin, ) @@ -30,16 +30,18 @@ class MGTVIE(InfoExtractor): 'duration': 7461, 'thumbnail': r're:^https?://.*\.jpg$', }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://w.mgtv.com/b/427837/15588271.html', 'info_dict': { 'id': '15588271', 'ext': 'mp4', - 'title': '春日迟迟再出发 沉浸版', + 'title': '春日迟迟再出发 沉浸版第1期:陆莹结婚半年查出肾炎被离婚 吴雅婷把一半票根退给前夫', 'description': 'md5:a7a05a05b1aa87bd50cae619b19bbca6', 'thumbnail': r're:^https?://.+\.jpg', 'duration': 4026, }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://w.mgtv.com/b/333652/7329822.html', 'info_dict': { @@ -50,6 +52,7 @@ class MGTVIE(InfoExtractor): 'thumbnail': r're:^https?://.+\.jpg', 'duration': 2656, }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://w.mgtv.com/b/427837/15591647.html', 'only_matching': True, @@ -64,6 +67,13 @@ class MGTVIE(InfoExtractor): 'only_matching': True, }] + _RESOLUTIONS = { + '标清': ('480p', '854x480'), + '高清': ('540p', '960x540'), + '超清': ('720p', '1280x720'), + '蓝光': ('1080p', '1920x1080'), + } + def _real_extract(self, url): video_id = self._match_id(url) tk2 = base64.urlsafe_b64encode( @@ -76,55 +86,60 @@ def _real_extract(self, url): 'type': 'pch5' }, 
headers=self.geo_verification_headers())['data'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 401: error = self._parse_json(e.cause.read().decode(), None) if error.get('code') == 40005: self.raise_geo_restricted(countries=self._GEO_COUNTRIES) raise ExtractorError(error['msg'], expected=True) raise - info = api_data['info'] - title = info['title'].strip() + stream_data = self._download_json( 'https://pcweb.api.mgtv.com/player/getSource', video_id, query={ - 'pm2': api_data['atc']['pm2'], 'tk2': tk2, + 'pm2': api_data['atc']['pm2'], 'video_id': video_id, + 'type': 'pch5', 'src': 'intelmgtv', }, headers=self.geo_verification_headers())['data'] - stream_domain = stream_data['stream_domain'][0] + stream_domain = traverse_obj(stream_data, ('stream_domain', ..., {url_or_none}), get_all=False) formats = [] - for idx, stream in enumerate(stream_data['stream']): - stream_path = stream.get('url') - if not stream_path: - continue - format_data = self._download_json( - stream_domain + stream_path, video_id, - note=f'Download video info for format #{idx}') - format_url = format_data.get('info') + for idx, stream in enumerate(traverse_obj(stream_data, ('stream', lambda _, v: v['url']))): + stream_name = traverse_obj(stream, 'name', 'standardName', 'barName', expected_type=str) + resolution = traverse_obj( + self._RESOLUTIONS, (stream_name, 1 if stream.get('scale') == '16:9' else 0)) + format_url = traverse_obj(self._download_json( + urljoin(stream_domain, stream['url']), video_id, fatal=False, + note=f'Downloading video info for format {resolution or stream_name}'), + ('info', {url_or_none})) if not format_url: continue tbr = int_or_none(stream.get('filebitrate') or self._search_regex( r'_(\d+)_mp4/', format_url, 'tbr', default=None)) formats.append({ - 'format_id': compat_str(tbr or idx), - 'url': url_or_none(format_url), + 'format_id': str(tbr or idx), + 
'url': format_url, 'ext': 'mp4', 'tbr': tbr, + 'vcodec': stream.get('videoFormat'), + 'acodec': stream.get('audioFormat'), + **parse_resolution(resolution), 'protocol': 'm3u8_native', 'http_headers': { 'Referer': url, }, - 'format_note': stream.get('name'), + 'format_note': stream_name, }) return { 'id': video_id, - 'title': title, 'formats': formats, - 'description': info.get('desc'), - 'duration': int_or_none(info.get('duration')), - 'thumbnail': info.get('thumb'), + **traverse_obj(api_data, ('info', { + 'title': ('title', {str.strip}), + 'description': ('desc', {str}), + 'duration': ('duration', {int_or_none}), + 'thumbnail': ('thumb', {url_or_none}), + })), 'subtitles': self.extract_subtitles(video_id, stream_domain), } diff --git a/plugin/yt-dlp/yt_dlp/extractor/mzaalo.py b/plugin/yt-dlp/yt_dlp/extractor/mzaalo.py new file mode 100644 index 0000000..c6f420c --- /dev/null +++ b/plugin/yt-dlp/yt_dlp/extractor/mzaalo.py @@ -0,0 +1,92 @@ +from .common import InfoExtractor +from ..utils import ( + parse_age_limit, + parse_duration, + traverse_obj, + url_or_none, +) + + +class MzaaloIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?mzaalo\.com/play/(?Pmovie|original|clip)/(?P[a-fA-F0-9-]+)/[\w-]+' + _TESTS = [{ + # Movies + 'url': 'https://www.mzaalo.com/play/movie/c0958d9f-f90e-4503-a755-44358758921d/Jamun', + 'info_dict': { + 'id': 'c0958d9f-f90e-4503-a755-44358758921d', + 'title': 'Jamun', + 'ext': 'mp4', + 'description': 'md5:24fe9ebb9bbe5b36f7b54b90ab1e2f31', + 'thumbnails': 'count:15', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 5527.0, + 'language': 'hin', + 'categories': ['Drama'], + 'age_limit': 13, + }, + 'params': {'skip_download': 'm3u8'} + }, { + # Shows + 'url': 'https://www.mzaalo.com/play/original/93d42b2b-f373-4c2d-bca4-997412cb069d/Modi-Season-2-CM-TO-PM/Episode-1:Decision,-Not-Promises', + 'info_dict': { + 'id': '93d42b2b-f373-4c2d-bca4-997412cb069d', + 'title': 'Episode 1:Decision, Not Promises', + 'ext': 'mp4', + 'description': 
'md5:16f76058432a54774fbb2561a1955652', + 'thumbnails': 'count:22', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2040.0, + 'language': 'hin', + 'categories': ['Drama'], + 'age_limit': 13, + }, + 'params': {'skip_download': 'm3u8'} + }, { + # Streams/Clips + 'url': 'https://www.mzaalo.com/play/clip/83cdbcb5-400a-42f1-a1d2-459053cfbda5/Manto-Ki-Kahaaniya', + 'info_dict': { + 'id': '83cdbcb5-400a-42f1-a1d2-459053cfbda5', + 'title': 'Manto Ki Kahaaniya', + 'ext': 'mp4', + 'description': 'md5:c3c5f1d05f0fd1bfcb05b673d1cc9f2f', + 'thumbnails': 'count:3', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1937.0, + 'language': 'hin', + }, + 'params': {'skip_download': 'm3u8'} + }] + + def _real_extract(self, url): + video_id, type_ = self._match_valid_url(url).group('id', 'type') + path = (f'partner/streamurl?&assetId={video_id}&getClipDetails=YES' if type_ == 'clip' + else f'api/v2/player/details?assetType={type_.upper()}&assetId={video_id}') + data = self._download_json( + f'https://production.mzaalo.com/platform/{path}', video_id, headers={ + 'Ocp-Apim-Subscription-Key': '1d0caac2702049b89a305929fdf4cbae', + })['data'] + + formats = self._extract_m3u8_formats(data['streamURL'], video_id) + + subtitles = {} + for subs_lang, subs_url in traverse_obj(data, ('subtitles', {dict.items}, ...)): + if url_or_none(subs_url): + subtitles[subs_lang] = [{'url': subs_url, 'ext': 'vtt'}] + + lang = traverse_obj(data, ('language', {str.lower})) + for f in formats: + f['language'] = lang + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(data, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'duration': ('duration', {parse_duration}), + 'age_limit': ('maturity_rating', {parse_age_limit}), + 'thumbnails': ('images', ..., {'url': {url_or_none}}), + 'categories': ('genre', ..., {str}), + }), + } diff --git a/plugin/yt-dlp/yt_dlp/extractor/naver.py b/plugin/yt-dlp/yt_dlp/extractor/naver.py index 
2041530..5c8f39f 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/naver.py +++ b/plugin/yt-dlp/yt_dlp/extractor/naver.py @@ -21,7 +21,7 @@ class NaverBaseIE(InfoExtractor): _CAPTION_EXT_RE = r'\.(?:ttml|vtt)' - @staticmethod # NB: Used in VLiveWebArchiveIE + @staticmethod # NB: Used in VLiveWebArchiveIE, WeverseIE def process_subtitles(vod_data, process_url): ret = {'subtitles': {}, 'automatic_captions': {}} for caption in traverse_obj(vod_data, ('captions', 'list', ...)): diff --git a/plugin/yt-dlp/yt_dlp/extractor/nebula.py b/plugin/yt-dlp/yt_dlp/extractor/nebula.py index e0d502a..19d4690 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/nebula.py +++ b/plugin/yt-dlp/yt_dlp/extractor/nebula.py @@ -3,7 +3,7 @@ import urllib.error from .common import InfoExtractor -from ..utils import ExtractorError, parse_iso8601 +from ..utils import ExtractorError, make_archive_id, parse_iso8601, remove_start _BASE_URL_RE = r'https?://(?:www\.|beta\.)?(?:watchnebula\.com|nebula\.app|nebula\.tv)' @@ -65,19 +65,20 @@ def _fetch_nebula_bearer_token(self): return response['token'] def _fetch_video_formats(self, slug): - stream_info = self._call_nebula_api(f'https://content.watchnebula.com/video/{slug}/stream/', + stream_info = self._call_nebula_api(f'https://content.api.nebula.app/video/{slug}/stream/', video_id=slug, auth_type='bearer', note='Fetching video stream info') manifest_url = stream_info['manifest'] - return self._extract_m3u8_formats_and_subtitles(manifest_url, slug) + return self._extract_m3u8_formats_and_subtitles(manifest_url, slug, 'mp4') def _build_video_info(self, episode): fmts, subs = self._fetch_video_formats(episode['slug']) channel_slug = episode['channel_slug'] channel_title = episode['channel_title'] + zype_id = episode.get('zype_id') return { - 'id': episode['zype_id'], + 'id': remove_start(episode['id'], 'video_episode:'), 'display_id': episode['slug'], 'formats': fmts, 'subtitles': subs, @@ -99,6 +100,9 @@ def _build_video_info(self, episode): 'uploader_url': 
f'https://nebula.tv/{channel_slug}', 'series': channel_title, 'creator': channel_title, + 'extractor_key': NebulaIE.ie_key(), + 'extractor': NebulaIE.IE_NAME, + '_old_archive_ids': [make_archive_id(NebulaIE, zype_id)] if zype_id else None, } def _perform_login(self, username=None, password=None): @@ -113,7 +117,7 @@ class NebulaIE(NebulaBaseIE): 'url': 'https://nebula.tv/videos/that-time-disney-remade-beauty-and-the-beast', 'md5': '14944cfee8c7beeea106320c47560efc', 'info_dict': { - 'id': '5c271b40b13fd613090034fd', + 'id': '84ed544d-4afd-4723-8cd5-2b95261f0abf', 'ext': 'mp4', 'title': 'That Time Disney Remade Beauty and the Beast', 'description': 'Note: this video was originally posted on YouTube with the sponsor read included. We weren’t able to remove it without reducing video quality, so it’s presented here in its original context.', @@ -137,22 +141,22 @@ class NebulaIE(NebulaBaseIE): 'url': 'https://nebula.tv/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', 'md5': 'd05739cf6c38c09322422f696b569c23', 'info_dict': { - 'id': '5e7e78171aaf320001fbd6be', + 'id': '7e623145-1b44-4ca3-aa0b-ed25a247ea34', 'ext': 'mp4', 'title': 'Landing Craft - How The Allies Got Ashore', 'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.', 'upload_date': '20200327', 'timestamp': 1585348140, - 'channel': 'Real Engineering', - 'channel_id': 'realengineering', - 'uploader': 'Real Engineering', - 'uploader_id': 'realengineering', - 'series': 'Real Engineering', + 'channel': 'Real Engineering — The Logistics of D-Day', + 'channel_id': 'd-day', + 'uploader': 'Real Engineering — The Logistics of D-Day', + 'uploader_id': 'd-day', + 'series': 'Real Engineering — The Logistics of D-Day', 'display_id': 'the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', - 'creator': 'Real Engineering', + 'creator': 'Real Engineering — The Logistics of D-Day', 'duration': 841, - 'channel_url': 'https://nebula.tv/realengineering', - 
'uploader_url': 'https://nebula.tv/realengineering', + 'channel_url': 'https://nebula.tv/d-day', + 'uploader_url': 'https://nebula.tv/d-day', 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*', }, }, @@ -160,7 +164,7 @@ class NebulaIE(NebulaBaseIE): 'url': 'https://nebula.tv/videos/money-episode-1-the-draw', 'md5': 'ebe28a7ad822b9ee172387d860487868', 'info_dict': { - 'id': '5e779ebdd157bc0001d1c75a', + 'id': 'b96c5714-9e2b-4ec3-b3f1-20f6e89cc553', 'ext': 'mp4', 'title': 'Episode 1: The Draw', 'description': r'contains:There’s free money on offer… if the players can all work together.', @@ -190,7 +194,7 @@ class NebulaIE(NebulaBaseIE): ] def _fetch_video_metadata(self, slug): - return self._call_nebula_api(f'https://content.watchnebula.com/video/{slug}/', + return self._call_nebula_api(f'https://content.api.nebula.app/video/{slug}/', video_id=slug, auth_type='bearer', note='Fetching video meta data') diff --git a/plugin/yt-dlp/yt_dlp/extractor/nekohacker.py b/plugin/yt-dlp/yt_dlp/extractor/nekohacker.py new file mode 100644 index 0000000..e10ffe9 --- /dev/null +++ b/plugin/yt-dlp/yt_dlp/extractor/nekohacker.py @@ -0,0 +1,217 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + determine_ext, + extract_attributes, + get_element_by_class, + get_element_text_and_html_by_tag, + parse_duration, + traverse_obj, + try_call, + url_or_none, +) + + +class NekoHackerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?nekohacker\.com/(?P(?!free-dl)[\w-]+)' + _TESTS = [{ + 'url': 'https://nekohacker.com/nekoverse/', + 'info_dict': { + 'id': 'nekoverse', + 'title': 'Nekoverse', + }, + 'playlist': [ + { + 'url': 'https://nekohacker.com/wp-content/uploads/2022/11/01-Spaceship.mp3', + 'md5': '44223701ebedba0467ebda4cc07fb3aa', + 'info_dict': { + 'id': '1712', + 'ext': 'mp3', + 'title': 'Spaceship', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2022/11/Nekoverse_Artwork-1024x1024.jpg', + 'vcodec': 'none', + 
'acodec': 'mp3', + 'release_date': '20221101', + 'album': 'Nekoverse', + 'artist': 'Neko Hacker', + 'track': 'Spaceship', + 'track_number': 1, + 'duration': 195.0 + } + }, + { + 'url': 'https://nekohacker.com/wp-content/uploads/2022/11/02-City-Runner.mp3', + 'md5': '8f853c71719389d32bbbd3f1a87b3f08', + 'info_dict': { + 'id': '1713', + 'ext': 'mp3', + 'title': 'City Runner', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2022/11/Nekoverse_Artwork-1024x1024.jpg', + 'vcodec': 'none', + 'acodec': 'mp3', + 'release_date': '20221101', + 'album': 'Nekoverse', + 'artist': 'Neko Hacker', + 'track': 'City Runner', + 'track_number': 2, + 'duration': 148.0 + } + }, + { + 'url': 'https://nekohacker.com/wp-content/uploads/2022/11/03-Nature-Talk.mp3', + 'md5': '5a8a8ae852720cee4c0ac95c7d1a7450', + 'info_dict': { + 'id': '1714', + 'ext': 'mp3', + 'title': 'Nature Talk', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2022/11/Nekoverse_Artwork-1024x1024.jpg', + 'vcodec': 'none', + 'acodec': 'mp3', + 'release_date': '20221101', + 'album': 'Nekoverse', + 'artist': 'Neko Hacker', + 'track': 'Nature Talk', + 'track_number': 3, + 'duration': 174.0 + } + }, + { + 'url': 'https://nekohacker.com/wp-content/uploads/2022/11/04-Crystal-World.mp3', + 'md5': 'd8e59a48061764e50d92386a294abd50', + 'info_dict': { + 'id': '1715', + 'ext': 'mp3', + 'title': 'Crystal World', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2022/11/Nekoverse_Artwork-1024x1024.jpg', + 'vcodec': 'none', + 'acodec': 'mp3', + 'release_date': '20221101', + 'album': 'Nekoverse', + 'artist': 'Neko Hacker', + 'track': 'Crystal World', + 'track_number': 4, + 'duration': 199.0 + } + } + ] + }, { + 'url': 'https://nekohacker.com/susume/', + 'info_dict': { + 'id': 'susume', + 'title': '進め!むじなカンパニー', + }, + 'playlist': [ + { + 'url': 'https://nekohacker.com/wp-content/uploads/2021/01/進め!むじなカンパニー-feat.-六科なじむ-CV_-日高里菜-割戶真友-CV_-金元寿子-軽井沢ユキ-CV_-上坂すみれ-出稼ぎガルシア-CV_-金子彩花-.mp3', + 'md5': 
'fb13f008aa81f26ba48f91fd2d6186ce', + 'info_dict': { + 'id': '711', + 'ext': 'mp3', + 'title': 'md5:1a5fcbc96ca3c3265b1c6f9f79f30fd0', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2021/01/OP表-1024x1024.png', + 'vcodec': 'none', + 'acodec': 'mp3', + 'release_date': '20210115', + 'album': '進め!むじなカンパニー', + 'artist': 'Neko Hacker', + 'track': 'md5:1a5fcbc96ca3c3265b1c6f9f79f30fd0', + 'track_number': 1, + 'duration': None + } + }, + { + 'url': 'https://nekohacker.com/wp-content/uploads/2021/01/むじな-de-なじむ-feat.-六科なじむ-CV_-日高里菜-.mp3', + 'md5': '028803f70241df512b7764e73396fdd1', + 'info_dict': { + 'id': '709', + 'ext': 'mp3', + 'title': 'むじな de なじむ feat. 六科なじむ (CV: 日高里菜 )', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2021/01/OP表-1024x1024.png', + 'vcodec': 'none', + 'acodec': 'mp3', + 'release_date': '20210115', + 'album': '進め!むじなカンパニー', + 'artist': 'Neko Hacker', + 'track': 'むじな de なじむ feat. 六科なじむ (CV: 日高里菜 )', + 'track_number': 2, + 'duration': None + } + }, + { + 'url': 'https://nekohacker.com/wp-content/uploads/2021/01/進め!むじなカンパニー-instrumental.mp3', + 'md5': 'adde9e9a16e1da5e602b579c247d0fb9', + 'info_dict': { + 'id': '710', + 'ext': 'mp3', + 'title': '進め!むじなカンパニー (instrumental)', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2021/01/OP表-1024x1024.png', + 'vcodec': 'none', + 'acodec': 'mp3', + 'release_date': '20210115', + 'album': '進め!むじなカンパニー', + 'artist': 'Neko Hacker', + 'track': '進め!むじなカンパニー (instrumental)', + 'track_number': 3, + 'duration': None + } + }, + { + 'url': 'https://nekohacker.com/wp-content/uploads/2021/01/むじな-de-なじむ-instrumental.mp3', + 'md5': 'ebb0443039cf5f9ff7fd557ed9b23599', + 'info_dict': { + 'id': '712', + 'ext': 'mp3', + 'title': 'むじな de なじむ (instrumental)', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2021/01/OP表-1024x1024.png', + 'vcodec': 'none', + 'acodec': 'mp3', + 'release_date': '20210115', + 'album': '進め!むじなカンパニー', + 'artist': 'Neko Hacker', + 'track': 'むじな de なじむ (instrumental)', + 
'track_number': 4, + 'duration': None + } + } + ] + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + playlist = get_element_by_class('playlist', webpage) + + if not playlist: + iframe = try_call(lambda: get_element_text_and_html_by_tag('iframe', webpage)[1]) or '' + iframe_src = url_or_none(extract_attributes(iframe).get('src')) + if not iframe_src: + raise ExtractorError('No playlist or embed found in webpage') + elif re.match(r'https?://(?:\w+\.)?spotify\.com/', iframe_src): + raise ExtractorError('Spotify embeds are not supported', expected=True) + return self.url_result(url, 'Generic') + + entries = [] + for track_number, track in enumerate(re.findall(r'(]+data-audiopath[^>]+>)', playlist), 1): + entry = traverse_obj(extract_attributes(track), { + 'url': ('data-audiopath', {url_or_none}), + 'ext': ('data-audiopath', {determine_ext}), + 'id': 'data-trackid', + 'title': 'data-tracktitle', + 'track': 'data-tracktitle', + 'album': 'data-albumtitle', + 'duration': ('data-tracktime', {parse_duration}), + 'release_date': ('data-releasedate', {lambda x: re.match(r'\d{8}', x.replace('.', ''))}, 0), + 'thumbnail': ('data-albumart', {url_or_none}), + }) + entries.append({ + **entry, + 'track_number': track_number, + 'artist': 'Neko Hacker', + 'vcodec': 'none', + 'acodec': 'mp3' if entry['ext'] == 'mp3' else None, + }) + + return self.playlist_result(entries, playlist_id, traverse_obj(entries, (0, 'album'))) diff --git a/plugin/yt-dlp/yt_dlp/extractor/nhk.py b/plugin/yt-dlp/yt_dlp/extractor/nhk.py index 1711957..d31398a 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/nhk.py +++ b/plugin/yt-dlp/yt_dlp/extractor/nhk.py @@ -2,12 +2,15 @@ from .common import InfoExtractor from ..utils import ( + ExtractorError, + int_or_none, + join_nonempty, parse_duration, traverse_obj, unescapeHTML, unified_timestamp, + url_or_none, urljoin, - url_or_none ) @@ -67,7 +70,7 @@ def get_clean_field(key): info.update({ 
'_type': 'url_transparent', 'ie_key': 'Piksel', - 'url': 'https://player.piksel.com/v/refid/nhkworld/prefid/' + vod_id, + 'url': 'https://movie-s.nhk.or.jp/v/refid/nhkworld/prefid/' + vod_id, 'id': vod_id, }) else: @@ -94,6 +97,19 @@ class NhkVodIE(NhkBaseIE): # Content available only for a limited period of time. Visit # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. _TESTS = [{ + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2061601/', + 'info_dict': { + 'id': 'yd8322ch', + 'ext': 'mp4', + 'description': 'md5:109c8b05d67a62d0592f2b445d2cd898', + 'title': 'GRAND SUMO Highlights - [Recap] May Tournament Day 1 (Opening Day)', + 'upload_date': '20230514', + 'timestamp': 1684083791, + 'series': 'GRAND SUMO Highlights', + 'episode': '[Recap] May Tournament Day 1 (Opening Day)', + 'thumbnail': 'https://mz-edge.stream.co.jp/thumbs/aid/t1684084443/4028649.jpg?w=1920&h=1080', + }, + }, { # video clip 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/', 'md5': '7a90abcfe610ec22a6bfe15bd46b30ca', @@ -104,6 +120,9 @@ class NhkVodIE(NhkBaseIE): 'description': 'md5:5aee4a9f9d81c26281862382103b0ea5', 'timestamp': 1565965194, 'upload_date': '20190816', + 'thumbnail': 'https://mz-edge.stream.co.jp/thumbs/aid/t1567086278/3715195.jpg?w=1920&h=1080', + 'series': 'Dining with the Chef', + 'episode': 'Chef Saito\'s Family recipe: MENCHI-KATSU', }, }, { # audio clip @@ -114,10 +133,7 @@ class NhkVodIE(NhkBaseIE): 'title': "Japan's Top Inventions - Miniature Video Cameras", 'description': 'md5:07ea722bdbbb4936fdd360b6a480c25b', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, + 'skip': '404 Not Found', }, { 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/', 'only_matching': True, @@ -133,7 +149,6 @@ class NhkVodIE(NhkBaseIE): }, { # video, alphabetic character in ID #29670 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999a34/', - 'only_matching': True, 'info_dict': { 'id': 'qfjay6cg', 'ext': 
'mp4', @@ -142,7 +157,8 @@ class NhkVodIE(NhkBaseIE): 'thumbnail': r're:^https?:/(/[a-z0-9.-]+)+\.jpg\?w=1920&h=1080$', 'upload_date': '20210615', 'timestamp': 1623722008, - } + }, + 'skip': '404 Not Found', }] def _real_extract(self, url): @@ -153,12 +169,19 @@ class NhkVodProgramIE(NhkBaseIE): _VALID_URL = r'%s/program%s(?P[0-9a-z]+)(?:.+?\btype=(?Pclip|(?:radio|tv)Episode))?' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) _TESTS = [{ # video program episodes + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/sumo', + 'info_dict': { + 'id': 'sumo', + 'title': 'GRAND SUMO Highlights', + }, + 'playlist_mincount': 12, + }, { 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway', 'info_dict': { 'id': 'japanrailway', 'title': 'Japan Railway Journal', }, - 'playlist_mincount': 1, + 'playlist_mincount': 12, }, { # video program clips 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway/?type=clip', @@ -472,3 +495,73 @@ class NhkRadioNewsPageIE(InfoExtractor): def _real_extract(self, url): return self.url_result('https://www.nhk.or.jp/radio/ondemand/detail.html?p=F261_01', NhkRadiruIE) + + +class NhkRadiruLiveIE(InfoExtractor): + _GEO_COUNTRIES = ['JP'] + _VALID_URL = r'https?://www\.nhk\.or\.jp/radio/player/\?ch=(?Pr[12]|fm)' + _TESTS = [{ + # radio 1, no area specified + 'url': 'https://www.nhk.or.jp/radio/player/?ch=r1', + 'info_dict': { + 'id': 'r1-tokyo', + 'title': 're:^NHKネットラジオ第1 東京.+$', + 'ext': 'm4a', + 'thumbnail': 'https://www.nhk.or.jp/common/img/media/r1-200x200.png', + 'live_status': 'is_live', + }, + }, { + # radio 2, area specified + # (the area doesnt actually matter, r2 is national) + 'url': 'https://www.nhk.or.jp/radio/player/?ch=r2', + 'params': {'extractor_args': {'nhkradirulive': {'area': ['fukuoka']}}}, + 'info_dict': { + 'id': 'r2-fukuoka', + 'title': 're:^NHKネットラジオ第2 福岡.+$', + 'ext': 'm4a', + 'thumbnail': 'https://www.nhk.or.jp/common/img/media/r2-200x200.png', + 
'live_status': 'is_live', + }, + }, { + # fm, area specified + 'url': 'https://www.nhk.or.jp/radio/player/?ch=fm', + 'params': {'extractor_args': {'nhkradirulive': {'area': ['sapporo']}}}, + 'info_dict': { + 'id': 'fm-sapporo', + 'title': 're:^NHKネットラジオFM 札幌.+$', + 'ext': 'm4a', + 'thumbnail': 'https://www.nhk.or.jp/common/img/media/fm-200x200.png', + 'live_status': 'is_live', + } + }] + + _NOA_STATION_IDS = {'r1': 'n1', 'r2': 'n2', 'fm': 'n3'} + + def _real_extract(self, url): + station = self._match_id(url) + area = self._configuration_arg('area', ['tokyo'])[0] + + config = self._download_xml( + 'https://www.nhk.or.jp/radio/config/config_web.xml', station, 'Downloading area information') + data = config.find(f'.//data//area[.="{area}"]/..') + + if not data: + raise ExtractorError('Invalid area. Valid areas are: %s' % ', '.join( + [i.text for i in config.findall('.//data//area')]), expected=True) + + noa_info = self._download_json( + f'https:{config.find(".//url_program_noa").text}'.format(area=data.find('areakey').text), + station, note=f'Downloading {area} station metadata') + present_info = traverse_obj(noa_info, ('nowonair_list', self._NOA_STATION_IDS.get(station), 'present')) + + return { + 'title': ' '.join(traverse_obj(present_info, (('service', 'area',), 'name', {str}))), + 'id': join_nonempty(station, area), + 'thumbnails': traverse_obj(present_info, ('service', 'images', ..., { + 'url': 'url', + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + })), + 'formats': self._extract_m3u8_formats(data.find(f'{station}hls').text, station), + 'is_live': True, + } diff --git a/plugin/yt-dlp/yt_dlp/extractor/niconico.py b/plugin/yt-dlp/yt_dlp/extractor/niconico.py index f67304d..fc11e2d 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/niconico.py +++ b/plugin/yt-dlp/yt_dlp/extractor/niconico.py @@ -5,13 +5,17 @@ import re import time +from urllib.parse import urlparse + from .common import InfoExtractor, SearchInfoExtractor from ..compat import 
( compat_HTTPError, ) +from ..dependencies import websockets from ..utils import ( ExtractorError, OnDemandPagedList, + WebSocketsWrapper, bug_reports_message, clean_html, float_or_none, @@ -895,3 +899,162 @@ def _entries(self, list_id): def _real_extract(self, url): list_id = self._match_id(url) return self.playlist_result(self._entries(list_id), list_id, ie=NiconicoIE.ie_key()) + + +class NiconicoLiveIE(InfoExtractor): + IE_NAME = 'niconico:live' + IE_DESC = 'ニコニコ生放送' + _VALID_URL = r'https?://(?:sp\.)?live2?\.nicovideo\.jp/(?:watch|gate)/(?Plv\d+)' + _TESTS = [{ + 'note': 'this test case includes invisible characters for title, pasting them as-is', + 'url': 'https://live.nicovideo.jp/watch/lv339533123', + 'info_dict': { + 'id': 'lv339533123', + 'title': '激辛ペヤング食べます‪( ;ᯅ; )‬(歌枠オーディション参加中)', + 'view_count': 1526, + 'comment_count': 1772, + 'description': '初めましてもかって言います❕\nのんびり自由に適当に暮らしてます', + 'uploader': 'もか', + 'channel': 'ゲストさんのコミュニティ', + 'channel_id': 'co5776900', + 'channel_url': 'https://com.nicovideo.jp/community/co5776900', + 'timestamp': 1670677328, + 'is_live': True, + }, + 'skip': 'livestream', + }, { + 'url': 'https://live2.nicovideo.jp/watch/lv339533123', + 'only_matching': True, + }, { + 'url': 'https://sp.live.nicovideo.jp/watch/lv339533123', + 'only_matching': True, + }, { + 'url': 'https://sp.live2.nicovideo.jp/watch/lv339533123', + 'only_matching': True, + }] + + _KNOWN_LATENCY = ('high', 'low') + + def _real_extract(self, url): + if not websockets: + raise ExtractorError('websockets library is not available. Please install it.', expected=True) + video_id = self._match_id(url) + webpage, urlh = self._download_webpage_handle(f'https://live.nicovideo.jp/watch/{video_id}', video_id) + + embedded_data = self._parse_json(unescapeHTML(self._search_regex( + r' 100: + recv = recv[:100] + '...' 
+ self.write_debug('Server said: %s' % recv) + + title = traverse_obj(embedded_data, ('program', 'title')) or self._html_search_meta( + ('og:title', 'twitter:title'), webpage, 'live title', fatal=False) + + raw_thumbs = traverse_obj(embedded_data, ('program', 'thumbnail')) or {} + thumbnails = [] + for name, value in raw_thumbs.items(): + if not isinstance(value, dict): + thumbnails.append({ + 'id': name, + 'url': value, + **parse_resolution(value, lenient=True), + }) + continue + + for k, img_url in value.items(): + res = parse_resolution(k, lenient=True) or parse_resolution(img_url, lenient=True) + width, height = res.get('width'), res.get('height') + + thumbnails.append({ + 'id': f'{name}_{width}x{height}', + 'url': img_url, + **res, + }) + + formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True) + for fmt, q in zip(formats, reversed(qualities[1:])): + fmt.update({ + 'format_id': q, + 'protocol': 'niconico_live', + 'ws': ws, + 'video_id': video_id, + 'cookies': cookies, + 'live_latency': latency, + 'origin': hostname, + }) + + return { + 'id': video_id, + 'title': title, + **traverse_obj(embedded_data, { + 'view_count': ('program', 'statistics', 'watchCount'), + 'comment_count': ('program', 'statistics', 'commentCount'), + 'uploader': ('program', 'supplier', 'name'), + 'channel': ('socialGroup', 'name'), + 'channel_id': ('socialGroup', 'id'), + 'channel_url': ('socialGroup', 'socialGroupPageUrl'), + }), + 'description': clean_html(traverse_obj(embedded_data, ('program', 'description'))), + 'timestamp': int_or_none(traverse_obj(embedded_data, ('program', 'openTime'))), + 'is_live': True, + 'thumbnails': thumbnails, + 'formats': formats, + } diff --git a/plugin/yt-dlp/yt_dlp/extractor/odnoklassniki.py b/plugin/yt-dlp/yt_dlp/extractor/odnoklassniki.py index 5b8009b..07e49ec 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/odnoklassniki.py +++ b/plugin/yt-dlp/yt_dlp/extractor/odnoklassniki.py @@ -1,3 +1,5 @@ +import urllib.parse + from .common 
import InfoExtractor from ..compat import ( compat_etree_fromstring, @@ -7,6 +9,7 @@ ) from ..utils import ( ExtractorError, + HEADRequest, float_or_none, int_or_none, qualities, @@ -15,6 +18,7 @@ unescapeHTML, unified_strdate, unsmuggle_url, + url_or_none, urlencode_postdata, ) @@ -41,7 +45,7 @@ class OdnoklassnikiIE(InfoExtractor): 'ext': 'mp4', 'timestamp': 1545580896, 'view_count': int, - 'thumbnail': 'https://coub-attachments.akamaized.net/coub_storage/coub/simple/cw_image/c5ac87553bd/608e806a1239c210ab692/1545580913_00026.jpg', + 'thumbnail': r're:^https?://.*\.jpg$', 'title': 'Народная забава', 'uploader': 'Nevata', 'upload_date': '20181223', @@ -65,13 +69,14 @@ class OdnoklassnikiIE(InfoExtractor): 'title': str, 'uploader': str, }, + 'skip': 'vk extractor error', }, { - # metadata in JSON + # metadata in JSON, webm_dash with Firefox UA 'url': 'http://ok.ru/video/20079905452', - 'md5': '5d2b64756e2af296e3b383a0bc02a6aa', + 'md5': '8f477d8931c531374a3e36daec617b2c', 'info_dict': { 'id': '20079905452', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Культура меняет нас (прекрасный ролик!))', 'thumbnail': str, 'duration': 100, @@ -81,10 +86,14 @@ class OdnoklassnikiIE(InfoExtractor): 'like_count': int, 'age_limit': 0, }, + 'params': { + 'format': 'bv[ext=webm]', + 'http_headers': {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0'}, + }, }, { # metadataUrl 'url': 'http://ok.ru/video/63567059965189-0?fromTime=5', - 'md5': 'f8c951122516af72e6e6ffdd3c41103b', + 'md5': '2bae2f58eefe1b3d26f3926c4a64d2f3', 'info_dict': { 'id': '63567059965189-0', 'ext': 'mp4', @@ -98,10 +107,11 @@ class OdnoklassnikiIE(InfoExtractor): 'age_limit': 0, 'start_time': 5, }, + 'params': {'skip_download': 'm3u8'}, }, { # YouTube embed (metadataUrl, provider == USER_YOUTUBE) 'url': 'https://ok.ru/video/3952212382174', - 'md5': '91749d0bd20763a28d083fa335bbd37a', + 'md5': '5fb5f83ce16cb212d6bf887282b5da53', 'info_dict': { 'id': '5axVgHHDBvU', 'ext': 'mp4', @@ 
-116,7 +126,7 @@ class OdnoklassnikiIE(InfoExtractor): 'live_status': 'not_live', 'view_count': int, 'thumbnail': 'https://i.mycdn.me/i?r=AEHujHvw2RjEbemUCNEorZbxYpb_p_9AcN2FmGik64Krkcmz37YtlY093oAM5-HIEAt7Zi9s0CiBOSDmbngC-I-k&fn=external_8', - 'uploader_url': 'http://www.youtube.com/user/MrKewlkid94', + 'uploader_url': 'https://www.youtube.com/@MrKewlkid94', 'channel_follower_count': int, 'tags': ['youtube-dl', 'youtube playlists', 'download videos', 'download audio'], 'channel_id': 'UCVGtvURtEURYHtJFUegdSug', @@ -145,7 +155,6 @@ class OdnoklassnikiIE(InfoExtractor): }, 'skip': 'Video has not been found', }, { - # TODO: HTTP Error 400: Bad Request, it only works if there's no cookies when downloading 'note': 'Only available in mobile webpage', 'url': 'https://m.ok.ru/video/2361249957145', 'info_dict': { @@ -153,8 +162,8 @@ class OdnoklassnikiIE(InfoExtractor): 'ext': 'mp4', 'title': 'Быковское крещение', 'duration': 3038.181, + 'thumbnail': r're:^https?://i\.mycdn\.me/videoPreview\?.+', }, - 'skip': 'HTTP Error 400', }, { 'note': 'subtitles', 'url': 'https://ok.ru/video/4249587550747', @@ -226,6 +235,12 @@ class OdnoklassnikiIE(InfoExtractor): 'skip': 'Site no longer embeds', }] + def _clear_cookies(self, cdn_url): + # Direct http downloads will fail if CDN cookies are set + # so we need to reset them after each format extraction + self.cookiejar.clear(domain='.mycdn.me') + self.cookiejar.clear(domain=urllib.parse.urlparse(cdn_url).hostname) + @classmethod def _extract_embed_urls(cls, url, webpage): for x in super()._extract_embed_urls(url, webpage): @@ -364,14 +379,22 @@ def _extract_desktop(self, url): formats = [{ 'url': f['url'], 'ext': 'mp4', - 'format_id': f['name'], - } for f in metadata['videos']] + 'format_id': f.get('name'), + } for f in traverse_obj(metadata, ('videos', lambda _, v: url_or_none(v['url'])))] - m3u8_url = metadata.get('hlsManifestUrl') + m3u8_url = traverse_obj(metadata, 'hlsManifestUrl', 'ondemandHls') if m3u8_url: 
formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + self._clear_cookies(m3u8_url) + + for mpd_id, mpd_key in [('dash', 'ondemandDash'), ('webm', 'metadataWebmUrl')]: + mpd_url = metadata.get(mpd_key) + if mpd_url: + formats.extend(self._extract_mpd_formats( + mpd_url, video_id, mpd_id=mpd_id, fatal=False)) + self._clear_cookies(mpd_url) dash_manifest = metadata.get('metadataEmbedded') if dash_manifest: @@ -390,6 +413,7 @@ def _extract_desktop(self, url): if m3u8_url: formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + self._clear_cookies(m3u8_url) rtmp_url = metadata.get('rtmpUrl') if rtmp_url: formats.append({ @@ -423,6 +447,10 @@ def _extract_mobile(self, url): r'data-video="(.+?)"', webpage, 'json data') json_data = self._parse_json(unescapeHTML(json_data), video_id) or {} + redirect_url = self._request_webpage(HEADRequest( + json_data['videoSrc']), video_id, 'Requesting download URL').geturl() + self._clear_cookies(redirect_url) + return { 'id': video_id, 'title': json_data.get('videoName'), @@ -430,7 +458,7 @@ def _extract_mobile(self, url): 'thumbnail': json_data.get('videoPosterSrc'), 'formats': [{ 'format_id': 'mobile', - 'url': json_data.get('videoSrc'), + 'url': redirect_url, 'ext': 'mp4', }] } diff --git a/plugin/yt-dlp/yt_dlp/extractor/owncloud.py b/plugin/yt-dlp/yt_dlp/extractor/owncloud.py new file mode 100644 index 0000000..e1d5682 --- /dev/null +++ b/plugin/yt-dlp/yt_dlp/extractor/owncloud.py @@ -0,0 +1,80 @@ +import re +import urllib.parse + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + determine_ext, + url_or_none, + urlencode_postdata, +) + + +class OwnCloudIE(InfoExtractor): + _INSTANCES_RE = '|'.join(( + r'(?:[^\.]+\.)?sciebo\.de', + r'cloud\.uni-koblenz-landau\.de', + )) + _VALID_URL = rf'https?://(?:{_INSTANCES_RE})/s/(?P[\w.-]+)' + + _TESTS = [ + { + 'url': 
'https://ruhr-uni-bochum.sciebo.de/s/wWhqZzh9jTumVFN', + 'info_dict': { + 'id': 'wWhqZzh9jTumVFN', + 'ext': 'mp4', + 'title': 'CmvpJST.mp4', + }, + }, + { + 'url': 'https://ruhr-uni-bochum.sciebo.de/s/WNDuFu0XuFtmm3f', + 'info_dict': { + 'id': 'WNDuFu0XuFtmm3f', + 'ext': 'mp4', + 'title': 'CmvpJST.mp4', + }, + 'params': { + 'videopassword': '12345', + }, + }, + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage, urlh = self._download_webpage_handle(url, video_id) + + if re.search(r']+for="password"', webpage): + webpage = self._verify_video_password(webpage, urlh.geturl(), video_id) + + hidden_inputs = self._hidden_inputs(webpage) + title = hidden_inputs.get('filename') + parsed_url = urllib.parse.urlparse(url) + + return { + 'id': video_id, + 'title': title, + 'url': url_or_none(hidden_inputs.get('downloadURL')) or parsed_url._replace( + path=urllib.parse.urljoin(parsed_url.path, 'download')).geturl(), + 'ext': determine_ext(title), + } + + def _verify_video_password(self, webpage, url, video_id): + password = self.get_param('videopassword') + if password is None: + raise ExtractorError( + 'This video is protected by a password, use the --video-password option', + expected=True) + + validation_response = self._download_webpage( + url, video_id, 'Validating Password', 'Wrong password?', + data=urlencode_postdata({ + 'requesttoken': self._hidden_inputs(webpage)['requesttoken'], + 'password': password, + })) + + if re.search(r']+for="password"', validation_response): + warning = self._search_regex( + r']+class="warning">([^<]*)', validation_response, + 'warning', default='The password is wrong') + raise ExtractorError(f'Opening the video failed, {self.IE_NAME} said: {warning!r}', expected=True) + return validation_response diff --git a/plugin/yt-dlp/yt_dlp/extractor/piksel.py b/plugin/yt-dlp/yt_dlp/extractor/piksel.py index 45031e9..6f73da1 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/piksel.py +++ b/plugin/yt-dlp/yt_dlp/extractor/piksel.py 
@@ -7,8 +7,10 @@ int_or_none, join_nonempty, parse_iso8601, + traverse_obj, try_get, unescapeHTML, + urljoin, ) @@ -63,11 +65,11 @@ class PikselIE(InfoExtractor): } ] - def _call_api(self, app_token, resource, display_id, query, fatal=True): - response = (self._download_json( - 'http://player.piksel.com/ws/ws_%s/api/%s/mode/json/apiv/5' % (resource, app_token), - display_id, query=query, fatal=fatal) or {}).get('response') - failure = try_get(response, lambda x: x['failure']['reason']) + def _call_api(self, app_token, resource, display_id, query, host='https://player.piksel.com', fatal=True): + url = urljoin(host, f'/ws/ws_{resource}/api/{app_token}/mode/json/apiv/5') + response = traverse_obj( + self._download_json(url, display_id, query=query, fatal=fatal), ('response', {dict})) or {} + failure = traverse_obj(response, ('failure', 'reason')) if response else 'Empty response from API' if failure: if fatal: raise ExtractorError(failure, expected=True) @@ -83,7 +85,7 @@ def _real_extract(self, url): ], webpage, 'app token') query = {'refid': ref_id, 'prefid': display_id} if ref_id else {'v': display_id} program = self._call_api( - app_token, 'program', display_id, query)['WsProgramResponse']['program'] + app_token, 'program', display_id, query, url)['WsProgramResponse']['program'] video_id = program['uuid'] video_data = program['asset'] title = video_data['title'] @@ -129,7 +131,7 @@ def process_asset_files(asset_files): process_asset_files(try_get(self._call_api( app_token, 'asset_file', display_id, { 'assetid': asset_id, - }, False), lambda x: x['WsAssetFileResponse']['AssetFiles'])) + }, url, False), lambda x: x['WsAssetFileResponse']['AssetFiles'])) m3u8_url = dict_get(video_data, [ 'm3u8iPadURL', diff --git a/plugin/yt-dlp/yt_dlp/extractor/playsuisse.py b/plugin/yt-dlp/yt_dlp/extractor/playsuisse.py index 989ce00..9cad3b7 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/playsuisse.py +++ b/plugin/yt-dlp/yt_dlp/extractor/playsuisse.py @@ -5,10 +5,16 @@ class 
PlaySuisseIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?playsuisse\.ch/watch/(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?playsuisse\.ch/(?:watch|detail)/(?:[^#]*[?&]episodeId=)?(?P[0-9]+)' _TESTS = [ { + # Old URL 'url': 'https://www.playsuisse.ch/watch/763211/0', + 'only_matching': True, + }, + { + # episode in a series + 'url': 'https://www.playsuisse.ch/watch/763182?episodeId=763211', 'md5': '82df2a470b2dfa60c2d33772a8a60cf8', 'info_dict': { 'id': '763211', @@ -21,11 +27,11 @@ class PlaySuisseIE(InfoExtractor): 'season_number': 1, 'episode': 'Knochen', 'episode_number': 1, - 'thumbnail': 'md5:9260abe0c0ec9b69914d0a10d54c5878' + 'thumbnail': 're:https://playsuisse-img.akamaized.net/', } - }, - { - 'url': 'https://www.playsuisse.ch/watch/808675/0', + }, { + # film + 'url': 'https://www.playsuisse.ch/watch/808675', 'md5': '818b94c1d2d7c4beef953f12cb8f3e75', 'info_dict': { 'id': '808675', @@ -33,26 +39,60 @@ class PlaySuisseIE(InfoExtractor): 'title': 'Der Läufer', 'description': 'md5:9f61265c7e6dcc3e046137a792b275fd', 'duration': 5280, - 'episode': 'Der Läufer', - 'thumbnail': 'md5:44af7d65ee02bbba4576b131868bb783' + 'thumbnail': 're:https://playsuisse-img.akamaized.net/', } - }, - { - 'url': 'https://www.playsuisse.ch/watch/817193/0', - 'md5': '1d6c066f92cd7fffd8b28a53526d6b59', + }, { + # series (treated as a playlist) + 'url': 'https://www.playsuisse.ch/detail/1115687', 'info_dict': { - 'id': '817193', - 'ext': 'mp4', - 'title': 'Die Einweihungsparty', - 'description': 'md5:91ebf04d3a42cb3ab70666acf750a930', - 'duration': 1380, - 'series': 'Nr. 
47', - 'season': 'Season 1', - 'season_number': 1, - 'episode': 'Die Einweihungsparty', - 'episode_number': 1, - 'thumbnail': 'md5:637585fb106e3a4bcd991958924c7e44' - } + 'description': 'md5:e4a2ae29a8895823045b5c3145a02aa3', + 'id': '1115687', + 'series': 'They all came out to Montreux', + 'title': 'They all came out to Montreux', + }, + 'playlist': [{ + 'info_dict': { + 'description': 'md5:f2462744834b959a31adc6292380cda2', + 'duration': 3180, + 'episode': 'Folge 1', + 'episode_number': 1, + 'id': '1112663', + 'season': 'Season 1', + 'season_number': 1, + 'series': 'They all came out to Montreux', + 'thumbnail': 're:https://playsuisse-img.akamaized.net/', + 'title': 'Folge 1', + 'ext': 'mp4' + }, + }, { + 'info_dict': { + 'description': 'md5:9dfd308699fe850d3bce12dc1bad9b27', + 'duration': 2935, + 'episode': 'Folge 2', + 'episode_number': 2, + 'id': '1112661', + 'season': 'Season 1', + 'season_number': 1, + 'series': 'They all came out to Montreux', + 'thumbnail': 're:https://playsuisse-img.akamaized.net/', + 'title': 'Folge 2', + 'ext': 'mp4' + }, + }, { + 'info_dict': { + 'description': 'md5:14a93a3356b2492a8f786ab2227ef602', + 'duration': 2994, + 'episode': 'Folge 3', + 'episode_number': 3, + 'id': '1112664', + 'season': 'Season 1', + 'season_number': 1, + 'series': 'They all came out to Montreux', + 'thumbnail': 're:https://playsuisse-img.akamaized.net/', + 'title': 'Folge 3', + 'ext': 'mp4' + } + }], } ] @@ -142,6 +182,6 @@ def _extract_single(self, media_data): 'subtitles': subtitles, 'series': media_data.get('seriesName'), 'season_number': int_or_none(media_data.get('seasonNumber')), - 'episode': media_data.get('name'), + 'episode': media_data.get('name') if media_data.get('episodeNumber') else None, 'episode_number': int_or_none(media_data.get('episodeNumber')), } diff --git a/plugin/yt-dlp/yt_dlp/extractor/polskieradio.py b/plugin/yt-dlp/yt_dlp/extractor/polskieradio.py index 42b78a7..d267f50 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/polskieradio.py 
+++ b/plugin/yt-dlp/yt_dlp/extractor/polskieradio.py @@ -2,26 +2,24 @@ import json import math import re +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse_unquote, - compat_urlparse -) +from ..compat import compat_str from ..utils import ( - determine_ext, - extract_attributes, ExtractorError, InAdvancePagedList, + determine_ext, + extract_attributes, int_or_none, js_to_json, parse_iso8601, strip_or_none, traverse_obj, - unified_timestamp, unescapeHTML, + unified_timestamp, url_or_none, + urljoin, ) @@ -44,7 +42,7 @@ def _extract_webpage_player_entries(self, webpage, playlist_id, base_data): 'duration': int_or_none(media.get('length')), 'vcodec': 'none' if media.get('provider') == 'audio' else None, }) - entry_title = compat_urllib_parse_unquote(media['desc']) + entry_title = urllib.parse.unquote(media['desc']) if entry_title: entry['title'] = entry_title yield entry @@ -130,10 +128,11 @@ def _real_extract(self, url): return self.playlist_result(entries, playlist_id, title, description) -class PolskieRadioIE(InfoExtractor): - # new next.js sites, excluding radiokierowcow.pl - _VALID_URL = r'https?://(?:[^/]+\.)?polskieradio(?:24)?\.pl/artykul/(?P\d+)' +class PolskieRadioIE(PolskieRadioBaseExtractor): + # new next.js sites + _VALID_URL = r'https?://(?:[^/]+\.)?(?:polskieradio(?:24)?|radiokierowcow)\.pl/artykul/(?P\d+)' _TESTS = [{ + # articleData, attachments 'url': 'https://jedynka.polskieradio.pl/artykul/1587943', 'info_dict': { 'id': '1587943', @@ -148,6 +147,31 @@ class PolskieRadioIE(InfoExtractor): 'title': 'md5:d4623290d4ac983bf924061c75c23a0d', }, }], + }, { + # post, legacy html players + 'url': 'https://trojka.polskieradio.pl/artykul/2589163,Czy-wciaz-otrzymujemy-zdjecia-z-sond-Voyager', + 'info_dict': { + 'id': '2589163', + 'title': 'Czy wciąż otrzymujemy zdjęcia z sond Voyager?', + 'description': 'md5:cf1a7f348d63a2db9c0d7a63d1669473', + }, + 'playlist': [{ + 'info_dict': { + 'id': 
'2577880', + 'ext': 'mp3', + 'title': 'md5:a57d10a0c02abd34dd675cb33707ad5a', + 'duration': 321, + }, + }], + }, { + # data, legacy + 'url': 'https://radiokierowcow.pl/artykul/2694529', + 'info_dict': { + 'id': '2694529', + 'title': 'Zielona fala reliktem przeszłości?', + 'description': 'md5:f20a9a7ed9cb58916c54add94eae3bc0', + }, + 'playlist_count': 3, }, { 'url': 'https://trojka.polskieradio.pl/artykul/1632955', 'only_matching': True, @@ -166,7 +190,8 @@ def _real_extract(self, url): webpage = self._download_webpage(url, playlist_id) article_data = traverse_obj( - self._search_nextjs_data(webpage, playlist_id), ('props', 'pageProps', 'data', 'articleData')) + self._search_nextjs_data(webpage, playlist_id), ( + 'props', 'pageProps', (('data', 'articleData'), 'post', 'data')), get_all=False) title = strip_or_none(article_data['title']) @@ -178,7 +203,13 @@ def _real_extract(self, url): 'id': self._search_regex( r'([a-f\d]{8}-(?:[a-f\d]{4}-){3}[a-f\d]{12})', entry['file'], 'entry id'), 'title': strip_or_none(entry.get('description')) or title, - } for entry in article_data.get('attachments') or () if entry['fileType'] in ('Audio', )] + } for entry in article_data.get('attachments') or () if entry.get('fileType') in ('Audio', )] + + if not entries: + # some legacy articles have no json attachments, but players in body + entries = self._extract_webpage_player_entries(article_data['content'], playlist_id, { + 'title': title, + }) return self.playlist_result(entries, playlist_id, title, description) @@ -214,6 +245,15 @@ class PolskieRadioAuditionIE(InfoExtractor): 'thumbnail': r're:https://static\.prsa\.pl/images/.+', }, 'playlist_mincount': 722, + }, { + # some articles were "promoted to main page" and thus link to old frontend + 'url': 'https://trojka.polskieradio.pl/audycja/305', + 'info_dict': { + 'id': '305', + 'title': 'Co w mowie piszczy?', + 'thumbnail': r're:https://static\.prsa\.pl/images/.+', + }, + 'playlist_count': 1523, }] def _call_lp3(self, path, query, 
video_id, note): @@ -254,7 +294,6 @@ def _entries(self, playlist_id, has_episodes, has_articles): for article in page['data']: yield { '_type': 'url_transparent', - 'ie_key': PolskieRadioIE.ie_key(), 'id': str(article['id']), 'url': article['url'], 'title': article.get('shortTitle'), @@ -282,11 +321,8 @@ def _real_extract(self, url): class PolskieRadioCategoryIE(InfoExtractor): # legacy sites IE_NAME = 'polskieradio:category' - _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+(?:,[^/]+)?/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/(?:\d+(?:,[^/]+)?/|[^/]+/Tag)(?P\d+)' _TESTS = [{ - 'url': 'http://www.polskieradio.pl/7/129,Sygnaly-dnia?ref=source', - 'only_matching': True - }, { 'url': 'http://www.polskieradio.pl/37,RedakcjaKatolicka/4143,Kierunek-Krakow', 'info_dict': { 'id': '4143', @@ -300,6 +336,36 @@ class PolskieRadioCategoryIE(InfoExtractor): 'title': 'Muzyka', }, 'playlist_mincount': 61 + }, { + # billennium tabs + 'url': 'https://www.polskieradio.pl/8/2385', + 'info_dict': { + 'id': '2385', + 'title': 'Droga przez mąkę', + }, + 'playlist_mincount': 111, + }, { + 'url': 'https://www.polskieradio.pl/10/4930', + 'info_dict': { + 'id': '4930', + 'title': 'Teraz K-pop!', + }, + 'playlist_mincount': 392, + }, { + # post back pages, audio content directly without articles + 'url': 'https://www.polskieradio.pl/8,dwojka/7376,nowa-mowa', + 'info_dict': { + 'id': '7376', + 'title': 'Nowa mowa', + }, + 'playlist_mincount': 244, + }, { + 'url': 'https://www.polskieradio.pl/Krzysztof-Dziuba/Tag175458', + 'info_dict': { + 'id': '175458', + 'title': 'Krzysztof Dziuba', + }, + 'playlist_mincount': 420, }, { 'url': 'http://www.polskieradio.pl/8,Dwojka/196,Publicystyka', 'only_matching': True, @@ -311,25 +377,61 @@ def suitable(cls, url): def _entries(self, url, page, category_id): content = page + is_billennium_tabs = 'onclick="TB_LoadTab(' in page + is_post_back = 'onclick="__doPostBack(' in page + pagination = page if is_billennium_tabs else None for 
page_num in itertools.count(2): for a_entry, entry_id in re.findall( - r'(?s)]+>.*?(]+href=["\']/\d+/\d+/Artykul/(\d+)[^>]+>).*?', + r'(?s)]+>.*?(]+href=["\'](?:(?:https?)?://[^/]+)?/\d+/\d+/Artykul/(\d+)[^>]+>).*?', content): entry = extract_attributes(a_entry) - href = entry.get('href') - if not href: - continue - yield self.url_result( - compat_urlparse.urljoin(url, href), PolskieRadioLegacyIE, - entry_id, entry.get('title')) - mobj = re.search( - r']+class=["\']next["\'][^>]*>\s*]+href=(["\'])(?P(?:(?!\1).)+)\1', - content) - if not mobj: - break - next_url = compat_urlparse.urljoin(url, mobj.group('url')) - content = self._download_webpage( - next_url, category_id, 'Downloading page %s' % page_num) + if entry.get('href'): + yield self.url_result( + urljoin(url, entry['href']), PolskieRadioLegacyIE, entry_id, entry.get('title')) + for a_entry in re.findall(r']+class=["\']next["\'][^>]*>\s*]+onclick=["\']TB_LoadTab\(', + pagination, 'next page params', category_id, default=None, close_objects=1, + contains_pattern='.+', transform_source=lambda x: '[%s' % js_to_json(unescapeHTML(x))) + if not params: + break + tab_content = self._download_json( + 'https://www.polskieradio.pl/CMS/TemplateBoxesManagement/TemplateBoxTabContent.aspx/GetTabContent', + category_id, f'Downloading page {page_num}', headers={'content-type': 'application/json'}, + data=json.dumps(dict(zip(( + 'boxInstanceId', 'tabId', 'categoryType', 'sectionId', 'categoryId', 'pagerMode', + 'subjectIds', 'tagIndexId', 'queryString', 'name', 'openArticlesInParentTemplate', + 'idSectionFromUrl', 'maxDocumentAge', 'showCategoryForArticle', 'pageNumber' + ), params))).encode())['d'] + content, pagination = tab_content['Content'], tab_content.get('PagerContent') + elif is_post_back: + target = self._search_regex( + r'onclick=(?:["\'])__doPostBack\((?P["\'])(?P[\w$]+)(?P=q1)\s*,\s*(?P["\'])Next(?P=q2)', + content, 'pagination postback target', group='target', default=None) + if not target: + break + content = 
self._download_webpage( + url, category_id, f'Downloading page {page_num}', + data=urllib.parse.urlencode({ + **self._hidden_inputs(content), + '__EVENTTARGET': target, + '__EVENTARGUMENT': 'Next', + }).encode()) + else: + next_url = urljoin(url, self._search_regex( + r']+class=["\']next["\'][^>]*>\s*]+href=(["\'])(?P(?:(?!\1).)+)\1', + content, 'next page url', group='url', default=None)) + if not next_url: + break + content = self._download_webpage(next_url, category_id, f'Downloading page {page_num}') def _real_extract(self, url): category_id = self._match_id(url) @@ -337,7 +439,7 @@ def _real_extract(self, url): if PolskieRadioAuditionIE.suitable(urlh.url): return self.url_result(urlh.url, PolskieRadioAuditionIE, category_id) title = self._html_search_regex( - r'([^<]+) - [^<]+ - [^<]+', + r'([^<]+)(?: - [^<]+ - [^<]+| w [Pp]olskie[Rr]adio\.pl\s*)', webpage, 'title', fatal=False) return self.playlist_result( self._entries(url, webpage, category_id), @@ -506,39 +608,3 @@ def _real_extract(self, url): 'Content-Type': 'application/json', }) return self._parse_episode(data[0]) - - -class PolskieRadioRadioKierowcowIE(PolskieRadioBaseExtractor): - _VALID_URL = r'https?://(?:www\.)?radiokierowcow\.pl/artykul/(?P[0-9]+)' - IE_NAME = 'polskieradio:kierowcow' - - _TESTS = [{ - 'url': 'https://radiokierowcow.pl/artykul/2694529', - 'info_dict': { - 'id': '2694529', - 'title': 'Zielona fala reliktem przeszłości?', - 'description': 'md5:343950a8717c9818fdfd4bd2b8ca9ff2', - }, - 'playlist_count': 3, - }] - - def _real_extract(self, url): - media_id = self._match_id(url) - webpage = self._download_webpage(url, media_id) - nextjs_build = self._search_nextjs_data(webpage, media_id)['buildId'] - article = self._download_json( - f'https://radiokierowcow.pl/_next/data/{nextjs_build}/artykul/{media_id}.json?articleId={media_id}', - media_id) - data = article['pageProps']['data'] - title = data['title'] - entries = self._extract_webpage_player_entries(data['content'], media_id, { - 
'title': title, - }) - - return { - '_type': 'playlist', - 'id': media_id, - 'entries': entries, - 'title': title, - 'description': data.get('lead'), - } diff --git a/plugin/yt-dlp/yt_dlp/extractor/rai.py b/plugin/yt-dlp/yt_dlp/extractor/rai.py index 1b72967..aa422eb 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/rai.py +++ b/plugin/yt-dlp/yt_dlp/extractor/rai.py @@ -1,19 +1,12 @@ import re from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) from ..utils import ( clean_html, determine_ext, ExtractorError, filter_dict, - find_xpath_attr, - fix_xml_ampersands, GeoRestrictedError, - HEADRequest, int_or_none, join_nonempty, parse_duration, @@ -35,82 +28,70 @@ class RaiBaseIE(InfoExtractor): _GEO_BYPASS = False def _extract_relinker_info(self, relinker_url, video_id, audio_only=False): + def fix_cdata(s): + # remove \r\n\t before and after to avoid + # polluted text with xpath_text + s = re.sub(r'(\]\]>)[\r\n\t]+()[\r\n\t]+( 0 else None, - 'format_id': f'http-{bitrate if bitrate > 0 else "http"}', - }) + if ext == 'mp3': + formats.append({ + 'url': media_url, + 'vcodec': 'none', + 'acodec': 'mp3', + 'format_id': 'https-mp3', + }) + elif ext == 'm3u8' or 'format=m3u8' in media_url: + formats.extend(self._extract_m3u8_formats( + media_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + elif ext == 'f4m': + # very likely no longer needed. Cannot find any url that uses it. 
+ manifest_url = update_url_query( + media_url.replace('manifest#live_hds.f4m', 'manifest.f4m'), + {'hdcore': '3.7.0', 'plugin': 'aasp-3.7.0.39.44'}) + formats.extend(self._extract_f4m_formats( + manifest_url, video_id, f4m_id='hds', fatal=False)) + elif ext == 'mp4': + bitrate = int_or_none(xpath_text(relinker, './bitrate')) + formats.append({ + 'url': media_url, + 'tbr': bitrate if bitrate > 0 else None, + 'format_id': join_nonempty('https', bitrate, delim='-'), + }) + else: + raise ExtractorError('Unrecognized media file found') - if not formats and geoprotection is True: + if (not formats and geoprotection is True) or '/video_no_available.mp4' in media_url: self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True) - if not audio_only: - formats.extend(self._create_http_urls(relinker_url, formats)) + if not audio_only and not is_live: + formats.extend(self._create_http_urls(media_url, relinker_url, formats)) return filter_dict({ 'is_live': is_live, @@ -118,38 +99,31 @@ def _extract_relinker_info(self, relinker_url, video_id, audio_only=False): 'formats': formats, }) - def _create_http_urls(self, relinker_url, fmts): - _RELINKER_REG = r'https?://(?P[^/]+?)/(?:i/)?(?P[^/]+?)/(?P.+?)/(?P\w+)(?:_(?P[\d\,]+))?(?:\.mp4|/playlist\.m3u8).+?' 
+ def _create_http_urls(self, manifest_url, relinker_url, fmts): + _MANIFEST_REG = r'/(?P\w+)(?:_(?P[\d\,]+))?(?:\.mp4)?(?:\.csmil)?/playlist\.m3u8' _MP4_TMPL = '%s&overrideUserAgentRule=mp4-%s' _QUALITY = { # tbr: w, h - '250': [352, 198], - '400': [512, 288], - '700': [512, 288], - '800': [700, 394], - '1200': [736, 414], - '1800': [1024, 576], - '2400': [1280, 720], - '3200': [1440, 810], - '3600': [1440, 810], - '5000': [1920, 1080], - '10000': [1920, 1080], + 250: [352, 198], + 400: [512, 288], + 600: [512, 288], + 700: [512, 288], + 800: [700, 394], + 1200: [736, 414], + 1500: [920, 518], + 1800: [1024, 576], + 2400: [1280, 720], + 3200: [1440, 810], + 3600: [1440, 810], + 5000: [1920, 1080], + 10000: [1920, 1080], } - def test_url(url): - resp = self._request_webpage( - HEADRequest(url), None, headers={'User-Agent': 'Rai'}, - fatal=False, errnote=False, note=False) - - if resp is False: + def percentage(number, target, pc=20, roof=125): + '''check if the target is in the range of number +/- percent''' + if not number or number < 0: return False - - if resp.code == 200: - return False if resp.url == url else resp.url - return None - - # filter out audio-only formats - fmts = [f for f in fmts if not f.get('vcodec') == 'none'] + return abs(target - number) < min(float(number) * float(pc) / 100.0, roof) def get_format_info(tbr): import math @@ -157,67 +131,78 @@ def get_format_info(tbr): if len(fmts) == 1 and not br: br = fmts[0].get('tbr') if br and br > 300: - tbr = compat_str(math.floor(br / 100) * 100) + tbr = math.floor(br / 100) * 100 else: - tbr = '250' + tbr = 250 # try extracting info from available m3u8 formats - format_copy = None + format_copy = [None, None] for f in fmts: if f.get('tbr'): - br_limit = math.floor(br / 100) - if br_limit - 1 <= math.floor(f['tbr'] / 100) <= br_limit + 1: - format_copy = f.copy() + if percentage(tbr, f['tbr']): + format_copy[0] = f.copy() + if [f.get('width'), f.get('height')] == _QUALITY.get(tbr): + format_copy[1] = 
f.copy() + format_copy[1]['tbr'] = tbr + + # prefer format with similar bitrate because there might be + # multiple video with the same resolution but different bitrate + format_copy = format_copy[0] or format_copy[1] or {} return { + 'format_id': f'https-{tbr}', 'width': format_copy.get('width'), 'height': format_copy.get('height'), 'tbr': format_copy.get('tbr'), 'vcodec': format_copy.get('vcodec'), 'acodec': format_copy.get('acodec'), 'fps': format_copy.get('fps'), - 'format_id': f'https-{tbr}', } if format_copy else { + 'format_id': f'https-{tbr}', 'width': _QUALITY[tbr][0], 'height': _QUALITY[tbr][1], - 'format_id': f'https-{tbr}', - 'tbr': int(tbr), + 'tbr': tbr, + 'vcodec': 'avc1', + 'acodec': 'mp4a', + 'fps': 25, } - loc = test_url(_MP4_TMPL % (relinker_url, '*')) - if not isinstance(loc, compat_str): - return [] + # filter out single-stream formats + fmts = [f for f in fmts + if not f.get('vcodec') == 'none' and not f.get('acodec') == 'none'] - mobj = re.match( - _RELINKER_REG, - test_url(relinker_url) or '') + mobj = re.search(_MANIFEST_REG, manifest_url) if not mobj: return [] - available_qualities = mobj.group('quality').split(',') if mobj.group('quality') else ['*'] - available_qualities = [i for i in available_qualities if i] formats = [] - for q in available_qualities: - fmt = { + for q in filter(None, available_qualities): + self.write_debug(f'Creating https format for quality {q}') + formats.append({ 'url': _MP4_TMPL % (relinker_url, q), 'protocol': 'https', 'ext': 'mp4', **get_format_info(q) - } - formats.append(fmt) + }) return formats + @staticmethod + def _get_thumbnails_list(thumbs, url): + return [{ + 'url': urljoin(url, thumb_url), + } for thumb_url in (thumbs or {}).values() if thumb_url] + @staticmethod def _extract_subtitles(url, video_data): STL_EXT = 'stl' SRT_EXT = 'srt' subtitles = {} - subtitles_array = video_data.get('subtitlesArray') or [] + subtitles_array = video_data.get('subtitlesArray') or video_data.get('subtitleList') or [] 
for k in ('subtitles', 'subtitlesUrl'): subtitles_array.append({'url': video_data.get(k)}) for subtitle in subtitles_array: sub_url = subtitle.get('url') - if sub_url and isinstance(sub_url, compat_str): + if sub_url and isinstance(sub_url, str): sub_lang = subtitle.get('language') or 'it' sub_url = urljoin(url, sub_url) sub_ext = determine_ext(sub_url, SRT_EXT) @@ -236,7 +221,7 @@ def _extract_subtitles(url, video_data): class RaiPlayIE(RaiBaseIE): _VALID_URL = rf'(?Phttps?://(?:www\.)?raiplay\.it/.+?-(?P{RaiBaseIE._UUID_RE}))\.(?:html|json)' _TESTS = [{ - 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', + 'url': 'https://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', 'md5': '8970abf8caf8aef4696e7b1f2adfc696', 'info_dict': { 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', @@ -244,22 +229,20 @@ class RaiPlayIE(RaiBaseIE): 'title': 'Report del 07/04/2014', 'alt_title': 'St 2013/14 - Report - Espresso nel caffè - 07/04/2014', 'description': 'md5:d730c168a58f4bb35600fc2f881ec04e', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Rai Gulp', + 'thumbnail': r're:^https?://www\.raiplay\.it/.+\.jpg', + 'uploader': 'Rai 3', + 'creator': 'Rai 3', 'duration': 6160, 'series': 'Report', 'season': '2013/14', - 'subtitles': { - 'it': 'count:4', - }, + 'subtitles': {'it': 'count:4'}, 'release_year': 2022, 'episode': 'Espresso nel caffè - 07/04/2014', 'timestamp': 1396919880, 'upload_date': '20140408', + 'formats': 'count:4', }, - 'params': { - 'skip_download': True, - }, + 'params': {'skip_download': True}, }, { # 1080p direct mp4 url 'url': 'https://www.raiplay.it/video/2021/11/Blanca-S1E1-Senza-occhi-b1255a4a-8e72-4a2f-b9f3-fc1308e00736.html', @@ -270,8 +253,9 @@ class RaiPlayIE(RaiBaseIE): 'title': 'Blanca - S1E1 - Senza occhi', 'alt_title': 'St 1 Ep 1 - Blanca - Senza occhi', 'description': 'md5:75f95d5c030ec8bac263b1212322e28c', - 'thumbnail': 
r're:^https?://.*\.jpg$', - 'uploader': 'Rai 1', + 'thumbnail': r're:^https://www\.raiplay\.it/dl/img/.+\.jpg', + 'uploader': 'Rai Premium', + 'creator': 'Rai Fiction', 'duration': 6493, 'series': 'Blanca', 'season': 'Season 1', @@ -281,6 +265,30 @@ class RaiPlayIE(RaiBaseIE): 'episode': 'Senza occhi', 'timestamp': 1637318940, 'upload_date': '20211119', + 'formats': 'count:12', + }, + 'params': {'skip_download': True}, + 'expected_warnings': ['Video not available. Likely due to geo-restriction.'] + }, { + # 1500 quality + 'url': 'https://www.raiplay.it/video/2012/09/S1E11---Tutto-cio-che-luccica-0cab3323-732e-45d6-8e86-7704acab6598.html', + 'md5': 'a634d20e8ab2d43724c273563f6bf87a', + 'info_dict': { + 'id': '0cab3323-732e-45d6-8e86-7704acab6598', + 'ext': 'mp4', + 'title': 'Mia and Me - S1E11 - Tutto ciò che luccica', + 'alt_title': 'St 1 Ep 11 - Mia and Me - Tutto ciò che luccica', + 'description': 'md5:4969e594184b1920c4c1f2b704da9dea', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Rai Gulp', + 'series': 'Mia and Me', + 'season': 'Season 1', + 'episode_number': 11, + 'release_year': 2015, + 'season_number': 1, + 'episode': 'Tutto ciò che luccica', + 'timestamp': 1348495020, + 'upload_date': '20120924', }, }, { 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', @@ -299,57 +307,40 @@ def _real_extract(self, url): base, video_id = self._match_valid_url(url).groups() media = self._download_json( - base + '.json', video_id, 'Downloading video JSON') + f'{base}.json', video_id, 'Downloading video JSON') if not self.get_param('allow_unplayable_formats'): - if try_get( - media, - (lambda x: x['rights_management']['rights']['drm'], - lambda x: x['program_info']['rights_management']['rights']['drm']), - dict): + if traverse_obj(media, (('program_info', None), 'rights_management', 'rights', 'drm')): self.report_drm(video_id) - title = media['name'] video = media['video'] - relinker_info = 
self._extract_relinker_info(video['content_url'], video_id) - - thumbnails = [] - for _, value in media.get('images', {}).items(): - if value: - thumbnails.append({ - 'url': urljoin(url, value), - }) - - date_published = media.get('date_published') - time_published = media.get('time_published') - if date_published and time_published: - date_published += ' ' + time_published - - subtitles = self._extract_subtitles(url, video) - - program_info = media.get('program_info') or {} + date_published = join_nonempty( + media.get('date_published'), media.get('time_published'), delim=' ') season = media.get('season') - alt_title = join_nonempty(media.get('subtitle'), media.get('toptitle'), delim=' - ') return { 'id': remove_start(media.get('id'), 'ContentItem-') or video_id, 'display_id': video_id, - 'title': title, + 'title': media.get('name'), 'alt_title': strip_or_none(alt_title or None), 'description': media.get('description'), - 'uploader': strip_or_none(media.get('channel') or None), - 'creator': strip_or_none(media.get('editor') or None), + 'uploader': strip_or_none( + traverse_obj(media, ('program_info', 'channel')) + or media.get('channel') or None), + 'creator': strip_or_none( + traverse_obj(media, ('program_info', 'editor')) + or media.get('editor') or None), 'duration': parse_duration(video.get('duration')), 'timestamp': unified_timestamp(date_published), - 'thumbnails': thumbnails, - 'series': program_info.get('name'), + 'thumbnails': self._get_thumbnails_list(media.get('images'), url), + 'series': traverse_obj(media, ('program_info', 'name')), 'season_number': int_or_none(season), 'season': season if (season and not season.isdigit()) else None, 'episode': media.get('episode_title'), 'episode_number': int_or_none(media.get('episode')), - 'subtitles': subtitles, + 'subtitles': self._extract_subtitles(url, video), 'release_year': int_or_none(traverse_obj(media, ('track_info', 'edit_year'))), **relinker_info } @@ -371,38 +362,39 @@ class RaiPlayLiveIE(RaiPlayIE): # 
XXX: Do not subclass from concrete IE 'live_status': 'is_live', 'upload_date': '20090502', 'timestamp': 1241276220, + 'formats': 'count:3', }, - 'params': { - 'skip_download': True, - }, + 'params': {'skip_download': True}, }] class RaiPlayPlaylistIE(InfoExtractor): _VALID_URL = r'(?Phttps?://(?:www\.)?raiplay\.it/programmi/(?P[^/?#&]+))(?:/(?P[^?#&]+))?' _TESTS = [{ + # entire series episodes + extras... 'url': 'https://www.raiplay.it/programmi/nondirloalmiocapo/', 'info_dict': { 'id': 'nondirloalmiocapo', 'title': 'Non dirlo al mio capo', 'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b', }, - 'playlist_mincount': 12, + 'playlist_mincount': 30, }, { + # single season 'url': 'https://www.raiplay.it/programmi/nondirloalmiocapo/episodi/stagione-2/', 'info_dict': { 'id': 'nondirloalmiocapo', 'title': 'Non dirlo al mio capo - Stagione 2', 'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b', }, - 'playlist_mincount': 12, + 'playlist_count': 12, }] def _real_extract(self, url): base, playlist_id, extra_id = self._match_valid_url(url).groups() program = self._download_json( - base + '.json', playlist_id, 'Downloading program JSON') + f'{base}.json', playlist_id, 'Downloading program JSON') if extra_id: extra_id = extra_id.upper().rstrip('/') @@ -450,7 +442,7 @@ class RaiPlaySoundIE(RaiBaseIE): 'title': 'Il Ruggito del Coniglio del 10/12/2021', 'alt_title': 'md5:0e6476cd57858bb0f3fcc835d305b455', 'description': 'md5:2a17d2107e59a4a8faa0e18334139ee2', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.+\.jpg$', 'uploader': 'rai radio 2', 'duration': 5685, 'series': 'Il Ruggito del Coniglio', @@ -459,9 +451,7 @@ class RaiPlaySoundIE(RaiBaseIE): 'timestamp': 1638346620, 'upload_date': '20211201', }, - 'params': { - 'skip_download': True, - }, + 'params': {'skip_download': True}, }] def _real_extract(self, url): @@ -480,9 +470,6 @@ def _real_extract(self, url): lambda x: x['live']['create_date'])) podcast_info = traverse_obj(media, 'podcast_info', 
('live', 'cards', 0)) or {} - thumbnails = [{ - 'url': urljoin(url, thumb_url), - } for thumb_url in (podcast_info.get('images') or {}).values() if thumb_url] return { **info, @@ -494,7 +481,7 @@ def _real_extract(self, url): 'uploader': traverse_obj(media, ('track_info', 'channel'), expected_type=strip_or_none), 'creator': traverse_obj(media, ('track_info', 'editor'), expected_type=strip_or_none), 'timestamp': unified_timestamp(date_published), - 'thumbnails': thumbnails, + 'thumbnails': self._get_thumbnails_list(podcast_info.get('images'), url), 'series': podcast_info.get('title'), 'season_number': int_or_none(media.get('season')), 'episode': media.get('episode_title'), @@ -512,30 +499,30 @@ class RaiPlaySoundLiveIE(RaiPlaySoundIE): # XXX: Do not subclass from concrete 'display_id': 'radio2', 'ext': 'mp4', 'title': r're:Rai Radio 2 \d+-\d+-\d+ \d+:\d+', - 'thumbnail': r're:https://www.raiplaysound.it/dl/img/.+?png', + 'thumbnail': r're:^https://www\.raiplaysound\.it/dl/img/.+\.png', 'uploader': 'rai radio 2', 'series': 'Rai Radio 2', 'creator': 'raiplaysound', 'is_live': True, 'live_status': 'is_live', }, - 'params': { - 'skip_download': 'live', - }, + 'params': {'skip_download': True}, }] class RaiPlaySoundPlaylistIE(InfoExtractor): _VALID_URL = r'(?Phttps?://(?:www\.)?raiplaysound\.it/(?:programmi|playlist|audiolibri)/(?P[^/?#&]+))(?:/(?P[^?#&]+))?' 
_TESTS = [{ + # entire show 'url': 'https://www.raiplaysound.it/programmi/ilruggitodelconiglio', 'info_dict': { 'id': 'ilruggitodelconiglio', 'title': 'Il Ruggito del Coniglio', - 'description': 'md5:1bbaf631245a7ab1ec4d9fbb3c7aa8f3', + 'description': 'md5:48cff6972435964284614d70474132e6', }, 'playlist_mincount': 65, }, { + # single season 'url': 'https://www.raiplaysound.it/programmi/ilruggitodelconiglio/puntate/prima-stagione-1995', 'info_dict': { 'id': 'ilruggitodelconiglio_puntate_prima-stagione-1995', @@ -568,22 +555,19 @@ def _real_extract(self, url): class RaiIE(RaiBaseIE): _VALID_URL = rf'https?://[^/]+\.(?:rai\.(?:it|tv))/.+?-(?P{RaiBaseIE._UUID_RE})(?:-.+?)?\.html' _TESTS = [{ - # var uniquename = "ContentItem-..." - # data-id="ContentItem-..." 'url': 'https://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', 'info_dict': { 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', 'ext': 'mp4', 'title': 'TG PRIMO TEMPO', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 1758, 'upload_date': '20140612', }, - 'skip': 'This content is available only in Italy', + 'params': {'skip_download': True}, + 'expected_warnings': ['Video not available. Likely due to geo-restriction.'] }, { - # with ContentItem in og:url 'url': 'https://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html', - 'md5': '06345bd97c932f19ffb129973d07a020', 'info_dict': { 'id': 'efb17665-691c-45d5-a60c-5301333cbb0c', 'ext': 'mp4', @@ -592,123 +576,51 @@ class RaiIE(RaiBaseIE): 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 2214, 'upload_date': '20161103' - } + }, + 'params': {'skip_download': True}, }, { - # Direct MMS URL + # Direct MMS: Media URL no longer works. 
'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html', 'only_matching': True, }] - def _extract_from_content_id(self, content_id, url): + def _real_extract(self, url): + content_id = self._match_id(url) media = self._download_json( f'https://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-{content_id}.html?json', - content_id, 'Downloading video JSON') + content_id, 'Downloading video JSON', fatal=False, expected_status=404) - title = media['name'].strip() + if media is None: + return None - media_type = media['type'] - if 'Audio' in media_type: + if 'Audio' in media['type']: relinker_info = { 'formats': [{ - 'format_id': media.get('formatoAudio'), + 'format_id': join_nonempty('https', media.get('formatoAudio'), delim='-'), 'url': media['audioUrl'], 'ext': media.get('formatoAudio'), + 'vcodec': 'none', + 'acodec': media.get('formatoAudio'), }] } - elif 'Video' in media_type: + elif 'Video' in media['type']: relinker_info = self._extract_relinker_info(media['mediaUri'], content_id) else: raise ExtractorError('not a media file') - thumbnails = [] - for image_type in ('image', 'image_medium', 'image_300'): - thumbnail_url = media.get(image_type) - if thumbnail_url: - thumbnails.append({ - 'url': compat_urlparse.urljoin(url, thumbnail_url), - }) - - subtitles = self._extract_subtitles(url, media) + thumbnails = self._get_thumbnails_list( + {image_type: media.get(image_type) for image_type in ( + 'image', 'image_medium', 'image_300')}, url) return { 'id': content_id, - 'title': title, - 'description': strip_or_none(media.get('desc') or None), + 'title': strip_or_none(media.get('name') or media.get('title')), + 'description': strip_or_none(media.get('desc')) or None, 'thumbnails': thumbnails, - 'uploader': strip_or_none(media.get('author') or None), + 'uploader': strip_or_none(media.get('author')) or None, 'upload_date': unified_strdate(media.get('date')), 'duration': parse_duration(media.get('length')), - 'subtitles': 
subtitles, - **relinker_info - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - content_item_id = None - - content_item_url = self._html_search_meta( - ('og:url', 'og:video', 'og:video:secure_url', 'twitter:url', - 'twitter:player', 'jsonlink'), webpage, default=None) - if content_item_url: - content_item_id = self._search_regex( - rf'ContentItem-({self._UUID_RE})', content_item_url, - 'content item id', default=None) - - if not content_item_id: - content_item_id = self._search_regex( - rf'''(?x) - (?: - (?:initEdizione|drawMediaRaiTV)\(| - <(?:[^>]+\bdata-id|var\s+uniquename)=| - ]+\bsrc= - ) - (["\']) - (?:(?!\1).)*\bContentItem-(?P{self._UUID_RE}) - ''', - webpage, 'content item id', default=None, group='id') - - content_item_ids = set() - if content_item_id: - content_item_ids.add(content_item_id) - if video_id not in content_item_ids: - content_item_ids.add(video_id) - - for content_item_id in content_item_ids: - try: - return self._extract_from_content_id(content_item_id, url) - except GeoRestrictedError: - raise - except ExtractorError: - pass - - relinker_url = self._proto_relative_url(self._search_regex( - r'''(?x) - (?: - var\s+videoURL| - mediaInfo\.mediaUri - )\s*=\s* - ([\'"]) - (?P - (?:https?:)? - //mediapolis(?:vod)?\.rai\.it/relinker/relinkerServlet\.htm\? 
- (?:(?!\1).)*\bcont=(?:(?!\1).)+)\1 - ''', - webpage, 'relinker URL', group='url')) - - relinker_info = self._extract_relinker_info( - urljoin(url, relinker_url), video_id) - - title = self._search_regex( - r'var\s+videoTitolo\s*=\s*([\'"])(?P[^\'"]+)\1', - webpage, 'title', group='title', - default=None) or self._og_search_title(webpage) - - return { - 'id': video_id, - 'title': title, + 'subtitles': self._extract_subtitles(url, media), **relinker_info } @@ -726,7 +638,8 @@ class RaiNewsIE(RaiIE): # XXX: Do not subclass from concrete IE 'duration': 1589, 'upload_date': '20220529', 'uploader': 'rainews', - } + }, + 'params': {'skip_download': True}, }, { # old content with fallback method to extract media urls 'url': 'https://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html', @@ -739,12 +652,14 @@ class RaiNewsIE(RaiIE): # XXX: Do not subclass from concrete IE 'duration': 833, 'upload_date': '20161103' }, + 'params': {'skip_download': True}, 'expected_warnings': ['unable to extract player_data'], }, { # iframe + drm 'url': 'https://www.rainews.it/iframe/video/2022/07/euro2022-europei-calcio-femminile-italia-belgio-gol-0-1-video-4de06a69-de75-4e32-a657-02f0885f8118.html', 'only_matching': True, }] + _PLAYER_TAG = 'news' def _real_extract(self, url): video_id = self._match_id(url) @@ -752,8 +667,8 @@ def _real_extract(self, url): webpage = self._download_webpage(url, video_id) player_data = self._search_json( - r'<rainews-player\s*data=\'', webpage, 'player_data', video_id, - transform_source=clean_html, fatal=False) + rf'<rai{self._PLAYER_TAG}-player\s*data=\'', webpage, 'player_data', video_id, + transform_source=clean_html, default={}) track_info = player_data.get('track_info') relinker_url = traverse_obj(player_data, 'mediapolis', 'content_url') @@ -770,16 +685,36 @@ def _real_extract(self, url): return { 'id': video_id, - 'title': 
track_info.get('title') or self._og_search_title(webpage), + 'title': player_data.get('title') or track_info.get('title') or self._og_search_title(webpage), 'upload_date': unified_strdate(track_info.get('date')), 'uploader': strip_or_none(track_info.get('editor') or None), **relinker_info } +class RaiCulturaIE(RaiNewsIE): # XXX: Do not subclass from concrete IE + _VALID_URL = rf'https?://(www\.)?raicultura\.it/(?!articoli)[^?#]+-(?P<id>{RaiBaseIE._UUID_RE})(?:-[^/?#]+)?\.html' + _EMBED_REGEX = [rf'<iframe[^>]+data-src="(?P<url>/iframe/[^?#]+?{RaiBaseIE._UUID_RE}\.html)'] + _TESTS = [{ + 'url': 'https://www.raicultura.it/letteratura/articoli/2018/12/Alberto-Asor-Rosa-Letteratura-e-potere-05ba8775-82b5-45c5-a89d-dd955fbde1fb.html', + 'info_dict': { + 'id': '05ba8775-82b5-45c5-a89d-dd955fbde1fb', + 'ext': 'mp4', + 'title': 'Alberto Asor Rosa: Letteratura e potere', + 'duration': 1756, + 'upload_date': '20181206', + 'uploader': 'raicultura', + 'formats': 'count:2', + }, + 'params': {'skip_download': True}, + }] + _PLAYER_TAG = 'cultura' + + class RaiSudtirolIE(RaiBaseIE): - _VALID_URL = r'https?://raisudtirol\.rai\.it/.+?media=(?P<id>[TP]tv\d+)' + _VALID_URL = r'https?://raisudtirol\.rai\.it/.+media=(?P<id>\w+)' _TESTS = [{ + # mp4 file 'url': 'https://raisudtirol.rai.it/la/index.php?media=Ptv1619729460', 'info_dict': { 'id': 'Ptv1619729460', @@ -787,34 +722,62 @@ class RaiSudtirolIE(RaiBaseIE): 'title': 'Euro: trasmisciun d\'economia - 29-04-2021 20:51', 'series': 'Euro: trasmisciun d\'economia', 'upload_date': '20210429', - 'thumbnail': r're:https://raisudtirol\.rai\.it/img/.+?\.jpg', + 'thumbnail': r're:https://raisudtirol\.rai\.it/img/.+\.jpg', 'uploader': 'raisudtirol', - } + 'formats': 'count:1', + }, + 'params': {'skip_download': True}, + }, { + # m3u manifest + 'url': 'https://raisudtirol.rai.it/it/kidsplayer.php?lang=it&media=GUGGUG_P1.smil', + 'info_dict': { + 'id': 'GUGGUG_P1', + 'ext': 'mp4', + 'title': 'GUGGUG! 
La Prospettiva - Die Perspektive', + 'uploader': 'raisudtirol', + 'formats': 'count:6', + }, + 'params': {'skip_download': True}, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - video_date = self._html_search_regex(r'<span class="med_data">(.+?)</span>', webpage, 'video_date', fatal=False) - video_title = self._html_search_regex(r'<span class="med_title">(.+?)</span>', webpage, 'video_title', fatal=False) - video_url = self._html_search_regex(r'sources:\s*\[\{file:\s*"(.+?)"\}\]', webpage, 'video_url') - video_thumb = self._html_search_regex(r'image: \'(.+?)\'', webpage, 'video_thumb', fatal=False) + video_date = self._html_search_regex( + r'<span class="med_data">(.+?)</span>', webpage, 'video_date', default=None) + video_title = self._html_search_regex([ + r'<span class="med_title">(.+?)</span>', r'title: \'(.+?)\','], + webpage, 'video_title', default=None) + video_url = self._html_search_regex([ + r'sources:\s*\[\{file:\s*"(.+?)"\}\]', + r'<source\s+src="(.+?)"\s+type="application/x-mpegURL"'], + webpage, 'video_url', default=None) + + ext = determine_ext(video_url) + if ext == 'm3u8': + formats = self._extract_m3u8_formats(video_url, video_id) + elif ext == 'mp4': + formats = [{ + 'format_id': 'https-mp4', + 'url': self._proto_relative_url(video_url), + 'width': 1024, + 'height': 576, + 'fps': 25, + 'vcodec': 'avc1', + 'acodec': 'mp4a', + }] + else: + formats = [] + self.raise_no_formats(f'Unrecognized media file: {video_url}') return { 'id': video_id, 'title': join_nonempty(video_title, video_date, delim=' - '), - 'series': video_title, + 'series': video_title if video_date else None, 'upload_date': unified_strdate(video_date), - 'thumbnail': urljoin('https://raisudtirol.rai.it/', video_thumb), + 'thumbnail': urljoin('https://raisudtirol.rai.it/', self._html_search_regex( + r'image: \'(.+?)\'', webpage, 'video_thumb', default=None)), 'uploader': 'raisudtirol', - 'formats': [{ - 'format_id': 
'https-mp4', - 'url': self._proto_relative_url(video_url), - 'width': 1024, - 'height': 576, - 'fps': 25, - 'vcodec': 'h264', - 'acodec': 'aac', - }], + 'formats': formats, } diff --git a/plugin/yt-dlp/yt_dlp/extractor/recurbate.py b/plugin/yt-dlp/yt_dlp/extractor/recurbate.py new file mode 100644 index 0000000..5534cf3 --- /dev/null +++ b/plugin/yt-dlp/yt_dlp/extractor/recurbate.py @@ -0,0 +1,43 @@ +import urllib.error + +from .common import InfoExtractor +from ..utils import ExtractorError, merge_dicts + + +class RecurbateIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?recurbate\.com/play\.php\?video=(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://recurbate.com/play.php?video=39161415', + 'md5': 'dd2b4ec57aa3e3572cb5cf0997fca99f', + 'info_dict': { + 'id': '39161415', + 'ext': 'mp4', + 'description': 'md5:db48d09e4d93fc715f47fd3d6b7edd51', + 'title': 'Performer zsnicole33 show on 2022-10-25 20:23, Chaturbate Archive – Recurbate', + 'age_limit': 18, + }, + 'skip': 'Website require membership.', + }] + + def _real_extract(self, url): + SUBSCRIPTION_MISSING_MESSAGE = 'This video is only available for registered users; Set your authenticated browser user agent via the --user-agent parameter.' 
+ video_id = self._match_id(url) + try: + webpage = self._download_webpage(url, video_id) + except ExtractorError as e: + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 403: + self.raise_login_required(msg=SUBSCRIPTION_MISSING_MESSAGE, method='cookies') + raise + token = self._html_search_regex(r'data-token="([^"]+)"', webpage, 'token') + video_url = f'https://recurbate.com/api/get.php?video={video_id}&token={token}' + + video_webpage = self._download_webpage(video_url, video_id) + if video_webpage == 'shall_subscribe': + self.raise_login_required(msg=SUBSCRIPTION_MISSING_MESSAGE, method='cookies') + entries = self._parse_html5_media_entries(video_url, video_webpage, video_id) + return merge_dicts({ + 'id': video_id, + 'title': self._html_extract_title(webpage, 'title'), + 'description': self._og_search_description(webpage), + 'age_limit': self._rta_search(webpage), + }, entries[0]) diff --git a/plugin/yt-dlp/yt_dlp/extractor/rottentomatoes.py b/plugin/yt-dlp/yt_dlp/extractor/rottentomatoes.py index 056a078..039fe82 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/rottentomatoes.py +++ b/plugin/yt-dlp/yt_dlp/extractor/rottentomatoes.py @@ -1,30 +1,80 @@ from .common import InfoExtractor -from .internetvideoarchive import InternetVideoArchiveIE +from ..utils import ( + ExtractorError, + clean_html, + float_or_none, + get_element_by_class, + join_nonempty, + traverse_obj, + url_or_none, +) class RottenTomatoesIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rottentomatoes\.com/m/[^/]+/trailers/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?rottentomatoes\.com/m/(?P<playlist>[^/]+)(?:/(?P<tr>trailers)(?:/(?P<id>\w+))?)?' 
- _TEST = { + _TESTS = [{ 'url': 'http://www.rottentomatoes.com/m/toy_story_3/trailers/11028566/', 'info_dict': { 'id': '11028566', 'ext': 'mp4', 'title': 'Toy Story 3', - 'description': 'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.', + 'description': 'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.' + }, + 'skip': 'No longer available', + }, { + 'url': 'https://www.rottentomatoes.com/m/toy_story_3/trailers/VycaVoBKhGuk', + 'info_dict': { + 'id': 'VycaVoBKhGuk', + 'ext': 'mp4', + 'title': 'Toy Story 3: Trailer 2', + 'description': '', 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 149.941 + }, + }, { + 'url': 'http://www.rottentomatoes.com/m/toy_story_3', + 'info_dict': { + 'id': 'toy_story_3', + 'title': 'Toy Story 3', + }, + 'playlist_mincount': 4, + }, { + 'url': 'http://www.rottentomatoes.com/m/toy_story_3/trailers', + 'info_dict': { + 'id': 'toy_story_3-trailers', }, - } + 'playlist_mincount': 5, + }] + + def _extract_videos(self, data, display_id): + for video in traverse_obj(data, (lambda _, v: v['publicId'] and v['file'] and v['type'] == 'hls')): + yield { + 'formats': self._extract_m3u8_formats( + video['file'], display_id, 'mp4', m3u8_id='hls', fatal=False), + **traverse_obj(video, { + 'id': 'publicId', + 'title': 'title', + 'description': 'description', + 'duration': ('durationInSeconds', {float_or_none}), + 'thumbnail': ('image', {url_or_none}), + }), + } def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - iva_id = self._search_regex(r'publishedid=(\d+)', webpage, 'internet video archive id') + playlist_id, trailers, video_id = self._match_valid_url(url).group('playlist', 'tr', 'id') + playlist_id = join_nonempty(playlist_id, trailers) + webpage = self._download_webpage(url, playlist_id) + data = self._search_json( + 
r'<script[^>]+\bid=["\'](?:heroV|v)ideos["\'][^>]*>', webpage, + 'data', playlist_id, contains_pattern=r'\[{(?s:.+)}\]') + + if video_id: + video_data = traverse_obj(data, lambda _, v: v['publicId'] == video_id) + if not video_data: + raise ExtractorError('Unable to extract video from webpage') + return next(self._extract_videos(video_data, video_id)) - return { - '_type': 'url_transparent', - 'url': 'http://video.internetvideoarchive.net/player/6/configuration.ashx?domain=www.videodetective.com&customerid=69249&playerid=641&publishedid=' + iva_id, - 'ie_key': InternetVideoArchiveIE.ie_key(), - 'id': video_id, - 'title': self._og_search_title(webpage), - } + return self.playlist_result( + self._extract_videos(data, playlist_id), playlist_id, + clean_html(get_element_by_class('scoreboard__title', webpage))) diff --git a/plugin/yt-dlp/yt_dlp/extractor/rozhlas.py b/plugin/yt-dlp/yt_dlp/extractor/rozhlas.py index 6222c40..ff6cd4e 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/rozhlas.py +++ b/plugin/yt-dlp/yt_dlp/extractor/rozhlas.py @@ -1,10 +1,15 @@ +import itertools +import urllib.error + from .common import InfoExtractor from ..utils import ( + ExtractorError, extract_attributes, int_or_none, remove_start, str_or_none, traverse_obj, + unified_timestamp, url_or_none, ) @@ -51,7 +56,40 @@ def _real_extract(self, url): } -class RozhlasVltavaIE(InfoExtractor): +class RozhlasBaseIE(InfoExtractor): + def _extract_formats(self, entry, audio_id): + formats = [] + for audio in traverse_obj(entry, ('audioLinks', lambda _, v: url_or_none(v['url']))): + ext = audio.get('variant') + for retry in self.RetryManager(): + if retry.attempt > 1: + self._sleep(1, audio_id) + try: + if ext == 'dash': + formats.extend(self._extract_mpd_formats( + audio['url'], audio_id, mpd_id=ext)) + elif ext == 'hls': + formats.extend(self._extract_m3u8_formats( + audio['url'], audio_id, 'm4a', m3u8_id=ext)) + else: + formats.append({ + 'url': audio['url'], + 'ext': ext, + 'format_id': ext, + 'abr': 
int_or_none(audio.get('bitrate')), + 'acodec': ext, + 'vcodec': 'none', + }) + except ExtractorError as e: + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 429: + retry.error = e.cause + else: + self.report_warning(e.msg) + + return formats + + +class RozhlasVltavaIE(RozhlasBaseIE): _VALID_URL = r'https?://(?:\w+\.rozhlas|english\.radio)\.cz/[\w-]+-(?P<id>\d+)' _TESTS = [{ 'url': 'https://wave.rozhlas.cz/papej-masicko-porcujeme-a-bilancujeme-filmy-a-serialy-ktere-letos-zabily-8891337', @@ -168,33 +206,14 @@ class RozhlasVltavaIE(InfoExtractor): }] def _extract_video(self, entry): - formats = [] audio_id = entry['meta']['ga']['contentId'] - for audio in traverse_obj(entry, ('audioLinks', lambda _, v: url_or_none(v['url']))): - ext = audio.get('variant') - if ext == 'dash': - formats.extend(self._extract_mpd_formats( - audio['url'], audio_id, mpd_id=ext, fatal=False)) - elif ext == 'hls': - formats.extend(self._extract_m3u8_formats( - audio['url'], audio_id, 'm4a', m3u8_id=ext, fatal=False)) - else: - formats.append({ - 'url': audio['url'], - 'ext': ext, - 'format_id': ext, - 'abr': int_or_none(audio.get('bitrate')), - 'acodec': ext, - 'vcodec': 'none', - }) - chapter_number = traverse_obj(entry, ('meta', 'ga', 'contentSerialPart', {int_or_none})) return { 'id': audio_id, 'chapter': traverse_obj(entry, ('meta', 'ga', 'contentNameShort')) if chapter_number else None, 'chapter_number': chapter_number, - 'formats': formats, + 'formats': self._extract_formats(entry, audio_id), **traverse_obj(entry, { 'title': ('meta', 'ga', 'contentName'), 'description': 'title', @@ -219,3 +238,106 @@ def _real_extract(self, url): 'title': traverse_obj(data, ('series', 'title')), 'entries': map(self._extract_video, data['playlist']), } + + +class MujRozhlasIE(RozhlasBaseIE): + _VALID_URL = r'https?://(?:www\.)?mujrozhlas\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + # single episode extraction + 'url': 
'https://www.mujrozhlas.cz/vykopavky/ach-jo-zase-teleci-rizek-je-mnohem-min-cesky-nez-jsme-si-mysleli', + 'md5': '6f8fd68663e64936623e67c152a669e0', + 'info_dict': { + 'id': '10739193', + 'ext': 'mp3', + 'title': 'Ach jo, zase to telecí! Řízek je mnohem míň český, než jsme si mysleli', + 'description': 'md5:db7141e9caaedc9041ec7cefb9a62908', + 'timestamp': 1684915200, + 'modified_timestamp': 1684922446, + 'series': 'Vykopávky', + 'thumbnail': 'https://portal.rozhlas.cz/sites/default/files/images/84377046610af6ddc54d910b1dd7a22b.jpg', + 'channel_id': 'radio-wave', + 'upload_date': '20230524', + 'modified_date': '20230524', + }, + }, { + # serial extraction + 'url': 'https://www.mujrozhlas.cz/radiokniha/jaroslava-janackova-pribeh-tajemneho-psani-o-pramenech-genezi-babicky', + 'playlist_mincount': 7, + 'info_dict': { + 'id': 'bb2b5f4e-ffb4-35a6-a34a-046aa62d6f6b', + 'title': 'Jaroslava Janáčková: Příběh tajemného psaní. O pramenech a genezi Babičky', + 'description': 'md5:7434d8fac39ac9fee6df098e11dfb1be', + }, + }, { + # show extraction + 'url': 'https://www.mujrozhlas.cz/nespavci', + 'playlist_mincount': 14, + 'info_dict': { + 'id': '09db9b37-d0f4-368c-986a-d3439f741f08', + 'title': 'Nespavci', + 'description': 'md5:c430adcbf9e2b9eac88b745881e814dc', + }, + }] + + def _call_api(self, path, item_id, msg='API JSON'): + return self._download_json( + f'https://api.mujrozhlas.cz/{path}/{item_id}', item_id, + note=f'Downloading {msg}', errnote=f'Failed to download {msg}')['data'] + + def _extract_audio_entry(self, entry): + audio_id = entry['meta']['ga']['contentId'] + + return { + 'id': audio_id, + 'formats': self._extract_formats(entry['attributes'], audio_id), + **traverse_obj(entry, { + 'title': ('attributes', 'title'), + 'description': ('attributes', 'description'), + 'episode_number': ('attributes', 'part'), + 'series': ('attributes', 'mirroredShow', 'title'), + 'chapter': ('attributes', 'mirroredSerial', 'title'), + 'artist': ('meta', 'ga', 'contentAuthor'), + 
'channel_id': ('meta', 'ga', 'contentCreator'), + 'timestamp': ('attributes', 'since', {unified_timestamp}), + 'modified_timestamp': ('attributes', 'updated', {unified_timestamp}), + 'thumbnail': ('attributes', 'asset', 'url', {url_or_none}), + }) + } + + def _entries(self, api_url, playlist_id): + for page in itertools.count(1): + episodes = self._download_json( + api_url, playlist_id, note=f'Downloading episodes page {page}', + errnote=f'Failed to download episodes page {page}', fatal=False) + for episode in traverse_obj(episodes, ('data', lambda _, v: v['meta']['ga']['contentId'])): + yield self._extract_audio_entry(episode) + api_url = traverse_obj(episodes, ('links', 'next', {url_or_none})) + if not api_url: + break + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + info = self._search_json(r'\bvar\s+dl\s*=', webpage, 'info json', display_id) + + entity = info['siteEntityBundle'] + + if entity == 'episode': + return self._extract_audio_entry(self._call_api( + 'episodes', info['contentId'], 'episode info API JSON')) + + elif entity in ('show', 'serial'): + playlist_id = info['contentShow'].split(':')[0] if entity == 'show' else info['contentId'] + data = self._call_api(f'{entity}s', playlist_id, f'{entity} playlist JSON') + api_url = data['relationships']['episodes']['links']['related'] + return self.playlist_result( + self._entries(api_url, playlist_id), playlist_id, + **traverse_obj(data, ('attributes', { + 'title': 'title', + 'description': 'description', + }))) + + else: + # `entity == 'person'` not implemented yet by API, ref: + # https://api.mujrozhlas.cz/persons/8367e456-2a57-379a-91bb-e699619bea49/participation + raise ExtractorError(f'Unsupported entity type "{entity}"') diff --git a/plugin/yt-dlp/yt_dlp/extractor/rumble.py b/plugin/yt-dlp/yt_dlp/extractor/rumble.py index 85f176b..d7ab386 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/rumble.py +++ 
b/plugin/yt-dlp/yt_dlp/extractor/rumble.py @@ -144,7 +144,7 @@ def _extract_embed_urls(cls, url, webpage): if embeds: return embeds return [f'https://rumble.com/embed/{mobj.group("id")}' for mobj in re.finditer( - r'<script>\s*Rumble\(\s*"play"\s*,\s*{\s*[\'"]video[\'"]\s*:\s*[\'"](?P<id>[0-9a-z]+)[\'"]', webpage)] + r'<script>[^<]*\bRumble\(\s*"play"\s*,\s*{\s*[\'"]?video[\'"]?\s*:\s*[\'"](?P<id>[0-9a-z]+)[\'"]', webpage)] def _real_extract(self, url): video_id = self._match_id(url) diff --git a/plugin/yt-dlp/yt_dlp/extractor/shemaroome.py b/plugin/yt-dlp/yt_dlp/extractor/shemaroome.py index f866edf..1d06b3f 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/shemaroome.py +++ b/plugin/yt-dlp/yt_dlp/extractor/shemaroome.py @@ -73,7 +73,10 @@ def _real_extract(self, url): key = bytes_to_intlist(compat_b64decode(data_json['key'])) iv = [0] * 16 m3u8_url = unpad_pkcs7(intlist_to_bytes(aes_cbc_decrypt(url_data, key, iv))).decode('ascii') - formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, fatal=False, headers={'stream_key': data_json['stream_key']}) + headers = {'stream_key': data_json['stream_key']} + formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, fatal=False, headers=headers) + for fmt in formats: + fmt['http_headers'] = headers release_date = self._html_search_regex( (r'itemprop="uploadDate">\s*([\d-]+)', r'id="release_date" value="([\d-]+)'), diff --git a/plugin/yt-dlp/yt_dlp/extractor/sonyliv.py b/plugin/yt-dlp/yt_dlp/extractor/sonyliv.py index 9ccee4c..15758bd 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/sonyliv.py +++ b/plugin/yt-dlp/yt_dlp/extractor/sonyliv.py @@ -10,6 +10,8 @@ from ..utils import ( ExtractorError, int_or_none, + jwt_decode_hs256, + try_call, try_get, ) @@ -77,8 +79,10 @@ def _perform_login(self, username, password): self._HEADERS['device_id'] = self._get_device_id() self._HEADERS['content-type'] = 'application/json' - if username.lower() == 'token' and len(password) > 1198: + if 
username.lower() == 'token' and try_call(lambda: jwt_decode_hs256(password)): self._HEADERS['authorization'] = password + self.report_login() + return elif len(username) != 10 or not username.isdigit(): raise ExtractorError(f'Invalid username/password; {self._LOGIN_HINT}') diff --git a/plugin/yt-dlp/yt_dlp/extractor/stripchat.py b/plugin/yt-dlp/yt_dlp/extractor/stripchat.py index eb82e70..b4f93e1 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/stripchat.py +++ b/plugin/yt-dlp/yt_dlp/extractor/stripchat.py @@ -42,14 +42,13 @@ def _real_extract(self, url): elif not traverse_obj(data, ('viewCam', 'model', 'isLive'), expected_type=bool): raise UserNotLive(video_id=video_id) - server = traverse_obj(data, ('viewCam', 'viewServers', 'flashphoner-hls'), expected_type=str) model_id = traverse_obj(data, ('viewCam', 'model', 'id'), expected_type=int) formats = [] for host in traverse_obj(data, ('config', 'data', ( (('features', 'featuresV2'), 'hlsFallback', 'fallbackDomains', ...), 'hlsStreamHost'))): formats = self._extract_m3u8_formats( - f'https://b-{server}.{host}/hls/{model_id}/master/{model_id}_auto.m3u8', + f'https://edge-hls.{host}/hls/{model_id}/master/{model_id}_auto.m3u8', video_id, ext='mp4', m3u8_id='hls', fatal=False, live=True) if formats: break diff --git a/plugin/yt-dlp/yt_dlp/extractor/substack.py b/plugin/yt-dlp/yt_dlp/extractor/substack.py index ae8caae..f8e8b0b 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/substack.py +++ b/plugin/yt-dlp/yt_dlp/extractor/substack.py @@ -2,7 +2,7 @@ import urllib.parse from .common import InfoExtractor -from ..utils import str_or_none, traverse_obj +from ..utils import js_to_json, str_or_none, traverse_obj class SubstackIE(InfoExtractor): @@ -14,7 +14,7 @@ class SubstackIE(InfoExtractor): 'id': '47660949', 'ext': 'mp4', 'title': 'I MADE A VLOG', - 'description': 'md5:10c01ff93439a62e70ce963b2aa0b7f6', + 'description': 'md5:9248af9a759321e1027226f988f54d96', 'thumbnail': 'md5:bec758a34d8ee9142d43bcebdf33af18', 'uploader': 'Maybe 
Baby', 'uploader_id': '33628', @@ -77,7 +77,9 @@ def _real_extract(self, url): display_id, username = self._match_valid_url(url).group('id', 'username') webpage = self._download_webpage(url, display_id) - webpage_info = self._search_json(r'<script[^>]*>\s*window\._preloads\s*=', webpage, 'preloads', display_id) + webpage_info = self._parse_json(self._search_json( + r'window\._preloads\s*=\s*JSON\.parse\(', webpage, 'json string', + display_id, transform_source=js_to_json, contains_pattern=r'"{(?s:.+)}"'), display_id) post_type = webpage_info['post']['type'] formats, subtitles = [], {} diff --git a/plugin/yt-dlp/yt_dlp/extractor/sverigesradio.py b/plugin/yt-dlp/yt_dlp/extractor/sverigesradio.py index 392e84c..7323c3f 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/sverigesradio.py +++ b/plugin/yt-dlp/yt_dlp/extractor/sverigesradio.py @@ -1,8 +1,13 @@ from .common import InfoExtractor from ..utils import ( determine_ext, + extract_attributes, + get_element_by_id, + get_element_html_by_class, int_or_none, str_or_none, + traverse_obj, + url_or_none, ) @@ -21,7 +26,15 @@ class SverigesRadioBaseIE(InfoExtractor): } def _real_extract(self, url): - audio_id = self._match_id(url) + audio_id, display_id = self._match_valid_url(url).group('id', 'slug') + if not audio_id: + webpage = self._download_webpage(url, display_id) + audio_id = ( + traverse_obj( + get_element_html_by_class('audio-button', webpage), + ({extract_attributes}, ('data-audio-id', 'data-publication-id')), get_all=False) + or self._parse_json(get_element_by_id('gtm-metadata', webpage), display_id)['pageId']) + query = { 'id': audio_id, 'type': self._AUDIO_TYPE, @@ -30,7 +43,6 @@ def _real_extract(self, url): item = self._download_json( self._BASE_URL + 'audiometadata', audio_id, 'Downloading audio JSON metadata', query=query)['items'][0] - title = item['subtitle'] query['format'] = 'iis' urls = [] @@ -61,18 +73,20 @@ def _real_extract(self, url): return { 'id': audio_id, - 'title': title, 'formats': formats, - 
'series': item.get('title'), - 'duration': int_or_none(item.get('duration')), - 'thumbnail': item.get('displayimageurl'), - 'description': item.get('description'), + **traverse_obj(item, { + 'title': 'subtitle', + 'series': 'title', + 'duration': ('duration', {int_or_none}), + 'thumbnail': ('displayimageurl', {url_or_none}), + 'description': 'description', + }), } class SverigesRadioPublicationIE(SverigesRadioBaseIE): IE_NAME = 'sverigesradio:publication' - _VALID_URL = r'https?://(?:www\.)?sverigesradio\.se/sida/(?:artikel|gruppsida)\.aspx\?.*?\bartikel=(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?sverigesradio\.se/(?:sida/)?(?:artikel|gruppsida)(?:\.aspx\?.*?\bartikel=(?P<id>[0-9]+)|/(?P<slug>[\w-]+))' _TESTS = [{ 'url': 'https://sverigesradio.se/sida/artikel.aspx?programid=83&artikel=7038546', 'md5': '6a4917e1923fccb080e5a206a5afa542', @@ -85,6 +99,18 @@ class SverigesRadioPublicationIE(SverigesRadioBaseIE): 'description': 'md5:daf7ce66a8f0a53d5465a5984d3839df', 'thumbnail': r're:^https?://.*\.jpg', }, + }, { + 'url': 'https://sverigesradio.se/artikel/tysk-fotbollsfeber-bayern-munchens-10-ariga-segersvit-kan-brytas', + 'md5': 'f8a914ad50f491bb74eed403ab4bfef6', + 'info_dict': { + 'id': '8360345', + 'ext': 'm4a', + 'title': 'Tysk fotbollsfeber när Bayern Münchens 10-åriga segersvit kan brytas', + 'series': 'Radiosporten', + 'description': 'md5:5254610e20ce527ecb3a6102a06dcc5f', + 'duration': 72, + 'thumbnail': r're:^https?://.*\.jpg', + }, }, { 'url': 'https://sverigesradio.se/sida/gruppsida.aspx?programid=3304&grupp=6247&artikel=7146887', 'only_matching': True, @@ -94,8 +120,8 @@ class SverigesRadioPublicationIE(SverigesRadioBaseIE): class SverigesRadioEpisodeIE(SverigesRadioBaseIE): IE_NAME = 'sverigesradio:episode' - _VALID_URL = r'https?://(?:www\.)?sverigesradio\.se/(?:sida/)?avsnitt/(?P<id>[0-9]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?sverigesradio\.se/(?:sida/)?avsnitt/(?:(?P<id>\d+)|(?P<slug>[\w-]+))(?:$|[#?])' + _TESTS = [{ 'url': 
'https://sverigesradio.se/avsnitt/1140922?programid=1300', 'md5': '20dc4d8db24228f846be390b0c59a07c', 'info_dict': { @@ -106,6 +132,18 @@ class SverigesRadioEpisodeIE(SverigesRadioBaseIE): 'title': 'Metoo och valen', 'description': 'md5:fcb5c1f667f00badcc702b196f10a27e', 'thumbnail': r're:^https?://.*\.jpg', - } - } + }, + }, { + 'url': 'https://sverigesradio.se/avsnitt/p4-live-med-first-aid-kit-scandinavium-mars-2023', + 'md5': 'ce17fb82520a8033dbb846993d5589fe', + 'info_dict': { + 'id': '2160416', + 'ext': 'm4a', + 'title': 'P4 Live med First Aid Kit', + 'description': 'md5:6d5b78eed3d2b65f6de04daa45e9285d', + 'thumbnail': r're:^https?://.*\.jpg', + 'series': 'P4 Live', + 'duration': 5640, + }, + }] _AUDIO_TYPE = 'episode' diff --git a/plugin/yt-dlp/yt_dlp/extractor/tagesschau.py b/plugin/yt-dlp/yt_dlp/extractor/tagesschau.py index b439889..63e5e84 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/tagesschau.py +++ b/plugin/yt-dlp/yt_dlp/extractor/tagesschau.py @@ -2,10 +2,12 @@ from .common import InfoExtractor from ..utils import ( - js_to_json, + UnsupportedError, extract_attributes, - try_get, int_or_none, + js_to_json, + parse_iso8601, + try_get, ) @@ -14,36 +16,38 @@ class TagesschauIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html', - 'md5': '7a7287612fa881a1ae1d087df45c2fd6', + 'md5': 'ccb9359bf8c4795836e43759f3408a93', 'info_dict': { 'id': 'video-102143-1', 'ext': 'mp4', 'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt', + 'duration': 138, }, }, { 'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html', - 'md5': '3c54c1f6243d279b706bde660ceec633', + 'md5': '5c15e8f3da049e48829ec9786d835536', 'info_dict': { 'id': 'ts-5727-1', 'ext': 'mp4', 'title': 'Ganze Sendung', + 'duration': 932, }, }, { # exclusive audio 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html', - 'md5': '4cf22023c285f35e99c24d290ba58cc9', + 'md5': '4bff8f23504df56a0d86ed312d654182', 
'info_dict': { 'id': 'audio-29417-1', 'ext': 'mp3', - 'title': 'Brasilianischer Präsident Bolsonaro unter Druck: Corona-Bericht wird vorgestellt', + 'title': 'EU-Gipfel: Im Verbrennerstreit hat Deutschland maximalen Schaden angerichtet', }, }, { 'url': 'http://www.tagesschau.de/inland/bnd-303.html', - 'md5': '12cfb212d9325b5ba0d52b625f1aa61c', + 'md5': 'f049fa1698d7564e9ca4c3325108f034', 'info_dict': { 'id': 'bnd-303-1', - 'ext': 'mp4', - 'title': 'SPD-Gruppenbild mit Bärbel Bas nach der Fraktionssitzung | dpa', + 'ext': 'mp3', + 'title': 'Das Siegel des Bundesnachrichtendienstes | dpa', }, }, { 'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html', @@ -51,13 +55,24 @@ class TagesschauIE(InfoExtractor): 'id': 'afd-parteitag-135', 'title': 'AfD', }, - 'playlist_count': 20, + 'playlist_mincount': 15, }, { 'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html', 'info_dict': { 'id': 'audio-29417-1', 'ext': 'mp3', - 'title': 'Brasilianischer Präsident Bolsonaro unter Druck: Corona-Bericht wird vorgestellt', + 'title': 'EU-Gipfel: Im Verbrennerstreit hat Deutschland maximalen Schaden angerichtet', + }, + }, { + 'url': 'https://www.tagesschau.de/multimedia/audio/podcast-11km-327.html', + 'info_dict': { + 'id': 'podcast-11km-327', + 'ext': 'mp3', + 'title': 'Gewalt in der Kita – Wenn Erzieher:innen schweigen', + 'upload_date': '20230322', + 'timestamp': 1679482808, + 'thumbnail': 'https://www.tagesschau.de/multimedia/audio/podcast-11km-329~_v-original.jpg', + 'description': 'md5:dad059931fe4b3693e3656e93a249848', }, }, { 'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html', @@ -117,7 +132,7 @@ def _real_extract(self, url): formats = [] if media_url.endswith('master.m3u8'): formats = self._extract_m3u8_formats(media_url, video_id, 'mp4', m3u8_id='hls') - elif media_url.endswith('.hi.mp3') and media_url.startswith('https://download'): + elif media_url.endswith('.mp3'): formats = [{ 'url': media_url, 'vcodec': 'none', @@ -130,20 
+145,19 @@ def _real_extract(self, url): 'duration': int_or_none(try_get(video, lambda x: x['mc']['_duration'])), 'formats': formats }) + + if not entries: + raise UnsupportedError(url) + if len(entries) > 1: return self.playlist_result(entries, display_id, title) - formats = entries[0]['formats'] - video_info = self._search_json_ld(webpage, video_id) - description = video_info.get('description') - thumbnail = self._og_search_thumbnail(webpage) or video_info.get('thumbnail') - timestamp = video_info.get('timestamp') - title = title or video_info.get('description') return { 'id': display_id, 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - 'timestamp': timestamp, - 'description': description, + 'thumbnail': self._og_search_thumbnail(webpage), + 'formats': entries[0]['formats'], + 'timestamp': parse_iso8601(self._html_search_meta('date', webpage)), + 'description': self._og_search_description(webpage), + 'duration': entries[0]['duration'], } diff --git a/plugin/yt-dlp/yt_dlp/extractor/tencent.py b/plugin/yt-dlp/yt_dlp/extractor/tencent.py index 5a80b9a..f697cf8 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/tencent.py +++ b/plugin/yt-dlp/yt_dlp/extractor/tencent.py @@ -163,11 +163,9 @@ class VQQBaseIE(TencentBaseIE): _REFERER = 'v.qq.com' def _get_webpage_metadata(self, webpage, video_id): - return self._parse_json( - self._search_regex( - r'(?s)<script[^>]*>[^<]*window\.__pinia\s*=\s*([^<]+)</script>', - webpage, 'pinia data', fatal=False), - video_id, transform_source=js_to_json, fatal=False) + return self._search_json( + r'<script[^>]*>[^<]*window\.__(?:pinia|PINIA__)\s*=', + webpage, 'pinia data', video_id, transform_source=js_to_json, fatal=False) class VQQVideoIE(VQQBaseIE): @@ -176,7 +174,7 @@ class VQQVideoIE(VQQBaseIE): _TESTS = [{ 'url': 'https://v.qq.com/x/page/q326831cny0.html', - 'md5': '84568b3722e15e9cd023b5594558c4a7', + 'md5': 'b11c9cb781df710d686b950376676e2a', 'info_dict': { 'id': 'q326831cny0', 'ext': 'mp4', @@ -187,7 +185,7 @@ class 
VQQVideoIE(VQQBaseIE): }, }, { 'url': 'https://v.qq.com/x/page/o3013za7cse.html', - 'md5': 'cc431c4f9114a55643893c2c8ebf5592', + 'md5': 'a1bcf42c6d28c189bd2fe2d468abb287', 'info_dict': { 'id': 'o3013za7cse', 'ext': 'mp4', @@ -208,6 +206,7 @@ class VQQVideoIE(VQQBaseIE): 'series': '鸡毛飞上天', 'format_id': r're:^shd', }, + 'skip': '404', }, { 'url': 'https://v.qq.com/x/cover/mzc00200p29k31e/s0043cwsgj0.html', 'md5': 'fadd10bf88aec3420f06f19ee1d24c5b', @@ -220,6 +219,7 @@ class VQQVideoIE(VQQBaseIE): 'series': '青年理工工作者生活研究所', 'format_id': r're:^shd', }, + 'params': {'skip_download': 'm3u8'}, }, { # Geo-restricted to China 'url': 'https://v.qq.com/x/cover/mcv8hkc8zk8lnov/x0036x5qqsr.html', diff --git a/plugin/yt-dlp/yt_dlp/extractor/tiktok.py b/plugin/yt-dlp/yt_dlp/extractor/tiktok.py index ee50467..4da1431 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/tiktok.py +++ b/plugin/yt-dlp/yt_dlp/extractor/tiktok.py @@ -62,7 +62,7 @@ def _call_api_impl(self, ep, query, manifest_app_version, video_id, fatal=True, return self._download_json( 'https://%s/aweme/v1/%s/' % (self._API_HOSTNAME, ep), video_id=video_id, fatal=fatal, note=note, errnote=errnote, headers={ - 'User-Agent': f'com.ss.android.ugc.{self._APP_NAME}/{manifest_app_version} (Linux; U; Android 10; en_US; Pixel 4; Build/QQ3A.200805.001; Cronet/58.0.2991.0)', + 'User-Agent': f'com.ss.android.ugc.{self._APP_NAME}/{manifest_app_version} (Linux; U; Android 13; en_US; Pixel 7; Build/TD1A.220804.031; Cronet/58.0.2991.0)', 'Accept': 'application/json', }, query=query) @@ -79,11 +79,11 @@ def _build_api_query(self, query, app_version, manifest_app_version): '_rticket': int(time.time() * 1000), 'ts': int(time.time()), 'device_brand': 'Google', - 'device_type': 'Pixel 4', + 'device_type': 'Pixel 7', 'device_platform': 'android', - 'resolution': '1080*1920', + 'resolution': '1080*2400', 'dpi': 420, - 'os_version': '10', + 'os_version': '13', 'os_api': '29', 'carrier_region': 'US', 'sys_region': 'US', @@ -218,8 +218,8 @@ def 
mp3_meta(url): def extract_addr(addr, add_meta={}): parsed_meta, res = parse_url_key(addr.get('url_key', '')) if res: - known_resolutions.setdefault(res, {}).setdefault('height', add_meta.get('height')) - known_resolutions[res].setdefault('width', add_meta.get('width')) + known_resolutions.setdefault(res, {}).setdefault('height', add_meta.get('height') or addr.get('height')) + known_resolutions[res].setdefault('width', add_meta.get('width') or addr.get('width')) parsed_meta.update(known_resolutions.get(res, {})) add_meta.setdefault('height', int_or_none(res[:-1])) return [{ @@ -624,6 +624,32 @@ class TikTokIE(TikTokBaseIE): 'thumbnails': 'count:3', }, 'expected_warnings': ['Unable to find video in feed'], + }, { + # 1080p format + 'url': 'https://www.tiktok.com/@tatemcrae/video/7107337212743830830', + 'md5': '982512017a8a917124d5a08c8ae79621', + 'info_dict': { + 'id': '7107337212743830830', + 'ext': 'mp4', + 'title': 'new music video 4 don’t come backkkk🧸🖤 i hope u enjoy !! @musicontiktok', + 'description': 'new music video 4 don’t come backkkk🧸🖤 i hope u enjoy !! 
@musicontiktok', + 'uploader': 'tatemcrae', + 'uploader_id': '86328792343818240', + 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA-0bQT0CqebTRr6I4IkYvMDMKSRSJHLNPBo5HrSklJwyA2psXLSZG5FP-LMNpHnJd', + 'channel_id': 'MS4wLjABAAAA-0bQT0CqebTRr6I4IkYvMDMKSRSJHLNPBo5HrSklJwyA2psXLSZG5FP-LMNpHnJd', + 'creator': 't8', + 'artist': 't8', + 'track': 'original sound', + 'upload_date': '20220609', + 'timestamp': 1654805899, + 'duration': 150, + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + 'thumbnail': r're:^https://.+\.webp', + }, + 'params': {'format': 'bytevc1_1080p_808907-0'}, }, { # Auto-captions available 'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758', diff --git a/plugin/yt-dlp/yt_dlp/extractor/tv4.py b/plugin/yt-dlp/yt_dlp/extractor/tv4.py index 3b28d8b..271f389 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/tv4.py +++ b/plugin/yt-dlp/yt_dlp/extractor/tv4.py @@ -2,8 +2,11 @@ from .common import InfoExtractor from ..utils import ( + bool_or_none, int_or_none, parse_iso8601, + traverse_obj, + url_or_none, ) @@ -20,19 +23,25 @@ class TV4IE(InfoExtractor): sport/| ) )(?P<id>[0-9]+)''' - _GEO_COUNTRIES = ['SE'] + _GEO_BYPASS = False _TESTS = [ { + # not geo-restricted 'url': 'http://www.tv4.se/kalla-fakta/klipp/kalla-fakta-5-english-subtitles-2491650', 'md5': 'cb837212f342d77cec06e6dad190e96d', 'info_dict': { 'id': '2491650', 'ext': 'mp4', 'title': 'Kalla Fakta 5 (english subtitles)', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': int, + 'description': '2491650', + 'series': 'Kalla fakta', + 'duration': 1335, + 'thumbnail': r're:^https?://[^/?#]+/api/v2/img/', + 'timestamp': 1385373240, 'upload_date': '20131125', }, + 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Unable to download f4m manifest'], }, { 'url': 'http://www.tv4play.se/iframe/video/3054113', @@ -46,6 +55,7 @@ class TV4IE(InfoExtractor): 'timestamp': int, 'upload_date': '20150130', }, + 'skip': '404 Not Found', }, { 
'url': 'http://www.tv4play.se/sport/3060959', @@ -69,29 +79,28 @@ class TV4IE(InfoExtractor): } ] + def _call_api(self, endpoint, video_id, headers=None, query={}): + return self._download_json( + f'https://playback2.a2d.tv/{endpoint}/{video_id}', video_id, + f'Downloading {endpoint} API JSON', headers=headers, query={ + 'service': 'tv4', + 'device': 'browser', + 'protocol': 'hls', + **query, + }) + def _real_extract(self, url): video_id = self._match_id(url) - info = self._download_json( - 'https://playback-api.b17g.net/asset/%s' % video_id, - video_id, 'Downloading video info JSON', query={ - 'service': 'tv4', - 'device': 'browser', - 'protocol': 'hls,dash', - 'drm': 'widevine', - })['metadata'] + info = traverse_obj(self._call_api('asset', video_id, query={ + 'protocol': 'hls,dash', + 'drm': 'widevine', + }), ('metadata', {dict})) or {} - title = info['title'] + manifest_url = self._call_api( + 'play', video_id, headers=self.geo_verification_headers())['playbackItem']['manifestUrl'] - manifest_url = self._download_json( - 'https://playback-api.b17g.net/media/' + video_id, - video_id, query={ - 'service': 'tv4', - 'device': 'browser', - 'protocol': 'hls', - })['playbackItem']['manifestUrl'] - formats = [] - subtitles = {} + formats, subtitles = [], {} fmts, subs = self._extract_m3u8_formats_and_subtitles( manifest_url, video_id, 'mp4', @@ -117,20 +126,24 @@ def _real_extract(self, url): subtitles = self._merge_subtitles(subtitles, subs) if not formats and info.get('is_geo_restricted'): - self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True) + self.raise_geo_restricted( + 'This video is not available from your location due to geo-restriction, or not being authenticated', + countries=['SE']) return { 'id': video_id, - 'title': title, 'formats': formats, 'subtitles': subtitles, - 'description': info.get('description'), - 'timestamp': parse_iso8601(info.get('broadcast_date_time')), - 'duration': int_or_none(info.get('duration')), - 
'thumbnail': info.get('image'), - 'is_live': info.get('isLive') is True, - 'series': info.get('seriesTitle'), - 'season_number': int_or_none(info.get('seasonNumber')), - 'episode': info.get('episodeTitle'), - 'episode_number': int_or_none(info.get('episodeNumber')), + **traverse_obj(info, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'timestamp': (('broadcast_date_time', 'broadcastDateTime'), {parse_iso8601}), + 'duration': ('duration', {int_or_none}), + 'thumbnail': ('image', {url_or_none}), + 'is_live': ('isLive', {bool_or_none}), + 'series': ('seriesTitle', {str}), + 'season_number': ('seasonNumber', {int_or_none}), + 'episode': ('episodeTitle', {str}), + 'episode_number': ('episodeNumber', {int_or_none}), + }, get_all=False), } diff --git a/plugin/yt-dlp/yt_dlp/extractor/tvp.py b/plugin/yt-dlp/yt_dlp/extractor/tvp.py index eb4f668..cd566b7 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/tvp.py +++ b/plugin/yt-dlp/yt_dlp/extractor/tvp.py @@ -488,9 +488,9 @@ def _call_api(self, resource, video_id, query={}, **kwargs): f'{self._API_BASE_URL}/{resource}', video_id, query={'lang': 'pl', 'platform': 'BROWSER', **query}, expected_status=lambda x: is_valid(x) or 400 <= x < 500, **kwargs) - if is_valid(urlh.status): + if is_valid(urlh.getcode()): return document - raise ExtractorError(f'Woronicza said: {document.get("code")} (HTTP {urlh.status})') + raise ExtractorError(f'Woronicza said: {document.get("code")} (HTTP {urlh.getcode()})') def _parse_video(self, video, with_url=True): info_dict = traverse_obj(video, { diff --git a/plugin/yt-dlp/yt_dlp/extractor/tvplay.py b/plugin/yt-dlp/yt_dlp/extractor/tvplay.py index a23b546..25c78b5 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/tvplay.py +++ b/plugin/yt-dlp/yt_dlp/extractor/tvplay.py @@ -30,10 +30,7 @@ class TVPlayIE(InfoExtractor): (?: tvplay(?:\.skaties)?\.lv(?:/parraides)?| (?:tv3play|play\.tv3)\.lt(?:/programos)?| - tv3play(?:\.tv3)?\.ee/sisu| - (?:tv(?:3|6|8|10)play)\.se/program| - 
(?:(?:tv3play|viasat4play|tv6play)\.no|(?:tv3play)\.dk)/programmer| - play\.nova(?:tv)?\.bg/programi + tv3play(?:\.tv3)?\.ee/sisu ) /(?:[^/]+/)+ ) @@ -92,117 +89,6 @@ class TVPlayIE(InfoExtractor): 'skip_download': True, }, }, - { - 'url': 'http://www.tv3play.se/program/husraddarna/395385?autostart=true', - 'info_dict': { - 'id': '395385', - 'ext': 'mp4', - 'title': 'Husräddarna S02E07', - 'description': 'md5:f210c6c89f42d4fc39faa551be813777', - 'duration': 2574, - 'timestamp': 1400596321, - 'upload_date': '20140520', - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://www.tv6play.se/program/den-sista-dokusapan/266636?autostart=true', - 'info_dict': { - 'id': '266636', - 'ext': 'mp4', - 'title': 'Den sista dokusåpan S01E08', - 'description': 'md5:295be39c872520221b933830f660b110', - 'duration': 1492, - 'timestamp': 1330522854, - 'upload_date': '20120229', - 'age_limit': 18, - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://www.tv8play.se/program/antikjakten/282756?autostart=true', - 'info_dict': { - 'id': '282756', - 'ext': 'mp4', - 'title': 'Antikjakten S01E10', - 'description': 'md5:1b201169beabd97e20c5ad0ad67b13b8', - 'duration': 2646, - 'timestamp': 1348575868, - 'upload_date': '20120925', - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://www.tv3play.no/programmer/anna-anka-soker-assistent/230898?autostart=true', - 'info_dict': { - 'id': '230898', - 'ext': 'mp4', - 'title': 'Anna Anka søker assistent - Ep. 
8', - 'description': 'md5:f80916bf5bbe1c5f760d127f8dd71474', - 'duration': 2656, - 'timestamp': 1277720005, - 'upload_date': '20100628', - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://www.viasat4play.no/programmer/budbringerne/21873?autostart=true', - 'info_dict': { - 'id': '21873', - 'ext': 'mp4', - 'title': 'Budbringerne program 10', - 'description': 'md5:4db78dc4ec8a85bb04fd322a3ee5092d', - 'duration': 1297, - 'timestamp': 1254205102, - 'upload_date': '20090929', - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://www.tv6play.no/programmer/hotelinspektor-alex-polizzi/361883?autostart=true', - 'info_dict': { - 'id': '361883', - 'ext': 'mp4', - 'title': 'Hotelinspektør Alex Polizzi - Ep. 10', - 'description': 'md5:3ecf808db9ec96c862c8ecb3a7fdaf81', - 'duration': 2594, - 'timestamp': 1393236292, - 'upload_date': '20140224', - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://play.novatv.bg/programi/zdravei-bulgariya/624952?autostart=true', - 'info_dict': { - 'id': '624952', - 'ext': 'flv', - 'title': 'Здравей, България (12.06.2015 г.) ', - 'description': 'md5:99f3700451ac5bb71a260268b8daefd7', - 'duration': 8838, - 'timestamp': 1434100372, - 'upload_date': '20150612', - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, - { - 'url': 'https://play.nova.bg/programi/zdravei-bulgariya/764300?autostart=true', - 'only_matching': True, - }, { 'url': 'http://tvplay.skaties.lv/parraides/vinas-melo-labak/418113?autostart=true', 'only_matching': True, @@ -327,103 +213,6 @@ def _real_extract(self, url): } -class ViafreeIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - (?:www\.)? 
- viafree\.(?P<country>dk|no|se|fi) - /(?P<id>(?:program(?:mer)?|ohjelmat)?/(?:[^/]+/)+[^/?#&]+) - ''' - _TESTS = [{ - 'url': 'http://www.viafree.no/programmer/underholdning/det-beste-vorspielet/sesong-2/episode-1', - 'info_dict': { - 'id': '757786', - 'ext': 'mp4', - 'title': 'Det beste vorspielet - Sesong 2 - Episode 1', - 'description': 'md5:b632cb848331404ccacd8cd03e83b4c3', - 'series': 'Det beste vorspielet', - 'season_number': 2, - 'duration': 1116, - 'timestamp': 1471200600, - 'upload_date': '20160814', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.viafree.dk/programmer/humor/comedy-central-roast-of-charlie-sheen/film/1047660', - 'info_dict': { - 'id': '1047660', - 'ext': 'mp4', - 'title': 'Comedy Central Roast of Charlie Sheen - Comedy Central Roast of Charlie Sheen', - 'description': 'md5:ec956d941ae9fd7c65a48fd64951dc6d', - 'series': 'Comedy Central Roast of Charlie Sheen', - 'season_number': 1, - 'duration': 3747, - 'timestamp': 1608246060, - 'upload_date': '20201217' - }, - 'params': { - 'skip_download': True - } - }, { - # with relatedClips - 'url': 'http://www.viafree.se/program/reality/sommaren-med-youtube-stjarnorna/sasong-1/avsnitt-1', - 'only_matching': True, - }, { - # Different og:image URL schema - 'url': 'http://www.viafree.se/program/reality/sommaren-med-youtube-stjarnorna/sasong-1/avsnitt-2', - 'only_matching': True, - }, { - 'url': 'http://www.viafree.se/program/livsstil/husraddarna/sasong-2/avsnitt-2', - 'only_matching': True, - }, { - 'url': 'http://www.viafree.dk/programmer/reality/paradise-hotel/saeson-7/episode-5', - 'only_matching': True, - }, { - 'url': 'http://www.viafree.se/program/underhallning/i-like-radio-live/sasong-1/676869', - 'only_matching': True, - }, { - 'url': 'https://www.viafree.fi/ohjelmat/entertainment/amazing-makeovers/kausi-7/jakso-2', - 'only_matching': True, - }] - _GEO_BYPASS = False - - def _real_extract(self, url): - country, path = self._match_valid_url(url).groups() - content 
= self._download_json( - 'https://viafree-content.mtg-api.com/viafree-content/v1/%s/path/%s' % (country, path), path) - program = content['_embedded']['viafreeBlocks'][0]['_embedded']['program'] - guid = program['guid'] - meta = content['meta'] - title = meta['title'] - - try: - stream_href = self._download_json( - program['_links']['streamLink']['href'], guid, - headers=self.geo_verification_headers())['embedded']['prioritizedStreams'][0]['links']['stream']['href'] - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - self.raise_geo_restricted(countries=[country]) - raise - - formats, subtitles = self._extract_m3u8_formats_and_subtitles(stream_href, guid, 'mp4') - episode = program.get('episode') or {} - return { - 'id': guid, - 'title': title, - 'thumbnail': meta.get('image'), - 'description': meta.get('description'), - 'series': episode.get('seriesTitle'), - 'subtitles': subtitles, - 'episode_number': int_or_none(episode.get('episodeNumber')), - 'season_number': int_or_none(episode.get('seasonNumber')), - 'duration': int_or_none(try_get(program, lambda x: x['video']['duration']['milliseconds']), 1000), - 'timestamp': parse_iso8601(try_get(program, lambda x: x['availability']['start'])), - 'formats': formats, - } - - class TVPlayHomeIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// diff --git a/plugin/yt-dlp/yt_dlp/extractor/twitch.py b/plugin/yt-dlp/yt_dlp/extractor/twitch.py index 6376ccb..a1f70d1 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/twitch.py +++ b/plugin/yt-dlp/yt_dlp/extractor/twitch.py @@ -41,7 +41,6 @@ class TwitchBaseIE(InfoExtractor): _USHER_BASE = 'https://usher.ttvnw.net' _LOGIN_FORM_URL = 'https://www.twitch.tv/login' _LOGIN_POST_URL = 'https://passport.twitch.tv/login' - _CLIENT_ID = 'kimne78kx3ncx6brgo4mv6wki5h1ko' _NETRC_MACHINE = 'twitch' _OPERATION_HASHES = { @@ -58,6 +57,11 @@ class TwitchBaseIE(InfoExtractor): 'VideoPlayer_VODSeekbarPreviewVideo': 
'07e99e4d56c5a7c67117a154777b0baf85a5ffefa393b213f4bc712ccaf85dd6', } + @property + def _CLIENT_ID(self): + return self._configuration_arg( + 'client_id', ['ue6666qo983tsx6so1t0vnawi233wa'], ie_key='Twitch', casesense=True)[0] + def _perform_login(self, username, password): def fail(message): raise ExtractorError( @@ -194,7 +198,8 @@ class TwitchVodIE(TwitchBaseIE): https?:// (?: (?:(?:www|go|m)\.)?twitch\.tv/(?:[^/]+/v(?:ideo)?|videos)/| - player\.twitch\.tv/\?.*?\bvideo=v? + player\.twitch\.tv/\?.*?\bvideo=v?| + www\.twitch\.tv/[^/]+/schedule\?vodID= ) (?P<id>\d+) ''' @@ -363,6 +368,9 @@ class TwitchVodIE(TwitchBaseIE): 'skip_download': True }, 'expected_warnings': ['Unable to download JSON metadata: HTTP Error 403: Forbidden'] + }, { + 'url': 'https://www.twitch.tv/tangotek/schedule?vodID=1822395420', + 'only_matching': True, }] def _download_info(self, item_id): @@ -1075,7 +1083,7 @@ class TwitchClipsIE(TwitchBaseIE): https?:// (?: clips\.twitch\.tv/(?:embed\?.*?\bclip=|(?:[^/]+/)*)| - (?:(?:www|go|m)\.)?twitch\.tv/[^/]+/clip/ + (?:(?:www|go|m)\.)?twitch\.tv/(?:[^/]+/)?clip/ ) (?P<id>[^/?#&]+) ''' @@ -1111,6 +1119,9 @@ class TwitchClipsIE(TwitchBaseIE): }, { 'url': 'https://go.twitch.tv/rossbroadcast/clip/ConfidentBraveHumanChefFrank', 'only_matching': True, + }, { + 'url': 'https://m.twitch.tv/clip/FaintLightGullWholeWheat', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/plugin/yt-dlp/yt_dlp/extractor/twitter.py b/plugin/yt-dlp/yt_dlp/extractor/twitter.py index 07967ce..29a4b9f 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/twitter.py +++ b/plugin/yt-dlp/yt_dlp/extractor/twitter.py @@ -3,7 +3,6 @@ from .common import InfoExtractor from .periscope import PeriscopeBaseIE, PeriscopeIE -from ..compat import functools # isort: split from ..compat import ( compat_parse_qs, compat_urllib_parse_unquote, @@ -30,11 +29,67 @@ class TwitterBaseIE(InfoExtractor): + _NETRC_MACHINE = 'twitter' _API_BASE = 'https://api.twitter.com/1.1/' _GRAPHQL_API_BASE 
= 'https://twitter.com/i/api/graphql/' _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?(?:twitter\.com|twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid\.onion)/' _AUTH = {'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'} _guest_token = None + _flow_token = None + + _LOGIN_INIT_DATA = json.dumps({ + 'input_flow_data': { + 'flow_context': { + 'debug_overrides': {}, + 'start_location': { + 'location': 'unknown' + } + } + }, + 'subtask_versions': { + 'action_list': 2, + 'alert_dialog': 1, + 'app_download_cta': 1, + 'check_logged_in_account': 1, + 'choice_selection': 3, + 'contacts_live_sync_permission_prompt': 0, + 'cta': 7, + 'email_verification': 2, + 'end_flow': 1, + 'enter_date': 1, + 'enter_email': 2, + 'enter_password': 5, + 'enter_phone': 2, + 'enter_recaptcha': 1, + 'enter_text': 5, + 'enter_username': 2, + 'generic_urt': 3, + 'in_app_notification': 1, + 'interest_picker': 3, + 'js_instrumentation': 1, + 'menu_dialog': 1, + 'notifications_permission_prompt': 2, + 'open_account': 2, + 'open_home_timeline': 1, + 'open_link': 1, + 'phone_verification': 4, + 'privacy_options': 1, + 'security_key': 3, + 'select_avatar': 4, + 'select_banner': 2, + 'settings_list': 7, + 'show_code': 1, + 'sign_up': 2, + 'sign_up_review': 4, + 'tweet_selection_urt': 1, + 'update_users': 1, + 'upload_media': 1, + 'user_recommendations_list': 4, + 'user_recommendations_urt': 1, + 'wait_spinner': 3, + 'web_modal': 1 + } + }, separators=(',', ':')).encode() def _extract_variant_formats(self, variant, video_id): variant_url = variant.get('url') @@ -86,18 +141,151 @@ def _search_dimensions_in_video_url(a_format, video_url): 'height': int(m.group('height')), }) - @functools.cached_property + @property def is_logged_in(self): return bool(self._get_cookies(self._API_BASE).get('auth_token')) - def _call_api(self, path, video_id, query={}, graphql=False): - cookies = self._get_cookies(self._API_BASE) + 
def _fetch_guest_token(self, headers, display_id): + headers.pop('x-guest-token', None) + self._guest_token = traverse_obj(self._download_json( + f'{self._API_BASE}guest/activate.json', display_id, + 'Downloading guest token', data=b'', headers=headers), 'guest_token') + if not self._guest_token: + raise ExtractorError('Could not retrieve guest token') + + def _set_base_headers(self): headers = self._AUTH.copy() + csrf_token = try_call(lambda: self._get_cookies(self._API_BASE)['ct0'].value) + if csrf_token: + headers['x-csrf-token'] = csrf_token + return headers + + def _call_login_api(self, note, headers, query={}, data=None): + response = self._download_json( + f'{self._API_BASE}onboarding/task.json', None, note, + headers=headers, query=query, data=data, expected_status=400) + error = traverse_obj(response, ('errors', 0, 'message', {str})) + if error: + raise ExtractorError(f'Login failed, Twitter API says: {error}', expected=True) + elif traverse_obj(response, 'status') != 'success': + raise ExtractorError('Login was unsuccessful') + + subtask = traverse_obj( + response, ('subtasks', ..., 'subtask_id', {str}), get_all=False) + if not subtask: + raise ExtractorError('Twitter API did not return next login subtask') + + self._flow_token = response['flow_token'] + + return subtask + + def _perform_login(self, username, password): + if self.is_logged_in: + return + + self._request_webpage('https://twitter.com/', None, 'Requesting cookies') + headers = self._set_base_headers() + self._fetch_guest_token(headers, None) + headers.update({ + 'content-type': 'application/json', + 'x-guest-token': self._guest_token, + 'x-twitter-client-language': 'en', + 'x-twitter-active-user': 'yes', + 'Referer': 'https://twitter.com/', + 'Origin': 'https://twitter.com', + }) - csrf_cookie = cookies.get('ct0') - if csrf_cookie: - headers['x-csrf-token'] = csrf_cookie.value + def build_login_json(*subtask_inputs): + return json.dumps({ + 'flow_token': self._flow_token, + 'subtask_inputs': 
subtask_inputs + }, separators=(',', ':')).encode() + def input_dict(subtask_id, text): + return { + 'subtask_id': subtask_id, + 'enter_text': { + 'text': text, + 'link': 'next_link' + } + } + + next_subtask = self._call_login_api( + 'Downloading flow token', headers, query={'flow_name': 'login'}, data=self._LOGIN_INIT_DATA) + + while not self.is_logged_in: + if next_subtask == 'LoginJsInstrumentationSubtask': + next_subtask = self._call_login_api( + 'Submitting JS instrumentation response', headers, data=build_login_json({ + 'subtask_id': next_subtask, + 'js_instrumentation': { + 'response': '{}', + 'link': 'next_link' + } + })) + + elif next_subtask == 'LoginEnterUserIdentifierSSO': + next_subtask = self._call_login_api( + 'Submitting username', headers, data=build_login_json({ + 'subtask_id': next_subtask, + 'settings_list': { + 'setting_responses': [{ + 'key': 'user_identifier', + 'response_data': { + 'text_data': { + 'result': username + } + } + }], + 'link': 'next_link' + } + })) + + elif next_subtask == 'LoginEnterAlternateIdentifierSubtask': + next_subtask = self._call_login_api( + 'Submitting alternate identifier', headers, + data=build_login_json(input_dict(next_subtask, self._get_tfa_info( + 'one of username, phone number or email that was not used as --username')))) + + elif next_subtask == 'LoginEnterPassword': + next_subtask = self._call_login_api( + 'Submitting password', headers, data=build_login_json({ + 'subtask_id': next_subtask, + 'enter_password': { + 'password': password, + 'link': 'next_link' + } + })) + + elif next_subtask == 'AccountDuplicationCheck': + next_subtask = self._call_login_api( + 'Submitting account duplication check', headers, data=build_login_json({ + 'subtask_id': next_subtask, + 'check_logged_in_account': { + 'link': 'AccountDuplicationCheck_false' + } + })) + + elif next_subtask == 'LoginTwoFactorAuthChallenge': + next_subtask = self._call_login_api( + 'Submitting 2FA token', headers, data=build_login_json(input_dict( + 
next_subtask, self._get_tfa_info('two-factor authentication token')))) + + elif next_subtask == 'LoginAcid': + next_subtask = self._call_login_api( + 'Submitting confirmation code', headers, data=build_login_json(input_dict( + next_subtask, self._get_tfa_info('confirmation code sent to your email or phone')))) + + elif next_subtask == 'LoginSuccessSubtask': + raise ExtractorError('Twitter API did not grant auth token cookie') + + else: + raise ExtractorError(f'Unrecognized subtask ID "{next_subtask}"') + + self.report_login() + + def _call_api(self, path, video_id, query={}, graphql=False): + headers = self._set_base_headers() if self.is_logged_in: headers.update({ 'x-twitter-auth-type': 'OAuth2Session', @@ -106,15 +294,10 @@ def _call_api(self, path, video_id, query={}, graphql=False): }) for first_attempt in (True, False): - if not self.is_logged_in and not self._guest_token: - headers.pop('x-guest-token', None) - self._guest_token = traverse_obj(self._download_json( - f'{self._API_BASE}guest/activate.json', video_id, - 'Downloading guest token', data=b'', headers=headers), 'guest_token') - if self._guest_token: + if not self.is_logged_in: + if not self._guest_token: + self._fetch_guest_token(headers, video_id) headers['x-guest-token'] = self._guest_token - elif not self.is_logged_in: - raise ExtractorError('Could not retrieve guest token') allowed_status = {400, 401, 403, 404} if graphql else {403} result = self._download_json( @@ -705,6 +888,7 @@ class TwitterIE(TwitterBaseIE): 'uploader': r're:Monique Camarra.+?', 'uploader_id': 'MoniqueCamarra', 'live_status': 'was_live', + 'release_timestamp': 1658417414, 'description': 'md5:acce559345fd49f129c20dbcda3f1201', 'timestamp': 1658407771464, }, @@ -1327,6 +1511,8 @@ def _real_extract(self, url): 'uploader_id': traverse_obj( metadata, ('creator_results', 'result', 'legacy', 'screen_name')), 'live_status': live_status, + 'release_timestamp': try_call( + lambda: int_or_none(metadata['scheduled_start'], scale=1000)), 
'timestamp': metadata.get('created_at'), 'formats': formats, } diff --git a/plugin/yt-dlp/yt_dlp/extractor/unsupported.py b/plugin/yt-dlp/yt_dlp/extractor/unsupported.py index 1aa3f0f..1da478a 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/unsupported.py +++ b/plugin/yt-dlp/yt_dlp/extractor/unsupported.py @@ -131,8 +131,9 @@ class KnownPiracyIE(UnsupportedInfoExtractor): URLS = ( r'dood\.(?:to|watch|so|pm|wf|re)', # Sites youtube-dl supports, but we won't - r'https://viewsb\.com', - r'https://filemoon\.sx', + r'viewsb\.com', + r'filemoon\.sx', + r'hentai\.animestigma\.com', ) _TESTS = [{ diff --git a/plugin/yt-dlp/yt_dlp/extractor/urplay.py b/plugin/yt-dlp/yt_dlp/extractor/urplay.py index e00bd97..9a3dfea 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/urplay.py +++ b/plugin/yt-dlp/yt_dlp/extractor/urplay.py @@ -112,18 +112,19 @@ def parse_lang_code(code): lang = ISO639Utils.short2long(lang) return lang or None - for k, v in (urplayer_data['streamingInfo'].get('sweComplete') or {}).items(): - if (k in ('sd', 'hd') or not isinstance(v, dict)): - continue - lang, sttl_url = (v.get(kk) for kk in ('language', 'location', )) - if not sttl_url: - continue - lang = parse_lang_code(lang) - if not lang: - continue - sttl = subtitles.get(lang) or [] - sttl.append({'ext': k, 'url': sttl_url, }) - subtitles[lang] = sttl + for stream in urplayer_data['streamingInfo'].values(): + for k, v in stream.items(): + if (k in ('sd', 'hd') or not isinstance(v, dict)): + continue + lang, sttl_url = (v.get(kk) for kk in ('language', 'location', )) + if not sttl_url: + continue + lang = parse_lang_code(lang) + if not lang: + continue + sttl = subtitles.get(lang) or [] + sttl.append({'ext': k, 'url': sttl_url, }) + subtitles[lang] = sttl image = urplayer_data.get('image') or {} thumbnails = [] diff --git a/plugin/yt-dlp/yt_dlp/extractor/vidio.py b/plugin/yt-dlp/yt_dlp/extractor/vidio.py index bc8689f..a49d1e4 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/vidio.py +++ 
b/plugin/yt-dlp/yt_dlp/extractor/vidio.py @@ -39,7 +39,7 @@ def is_logged_in(): login_post, login_post_urlh = self._download_webpage_handle( self._LOGIN_URL, None, 'Logging in', data=urlencode_postdata(login_form), expected_status=[302, 401]) - if login_post_urlh.status == 401: + if login_post_urlh.getcode() == 401: if get_element_by_class('onboarding-content-register-popup__title', login_post): raise ExtractorError( 'Unable to log in: The provided email has not registered yet.', expected=True) diff --git a/plugin/yt-dlp/yt_dlp/extractor/voot.py b/plugin/yt-dlp/yt_dlp/extractor/voot.py index c02c73c..6ab8c9f 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/voot.py +++ b/plugin/yt-dlp/yt_dlp/extractor/voot.py @@ -1,14 +1,86 @@ +import json +import time +import urllib.error +import uuid + from .common import InfoExtractor from ..compat import compat_str from ..utils import ( ExtractorError, + float_or_none, int_or_none, + jwt_decode_hs256, + parse_age_limit, + traverse_obj, + try_call, try_get, - unified_timestamp, + unified_strdate, ) -class VootIE(InfoExtractor): +class VootBaseIE(InfoExtractor): + _NETRC_MACHINE = 'voot' + _GEO_BYPASS = False + _LOGIN_HINT = 'Log in with "-u <email_address> -p <password>", or use "-u token -p <auth_token>" to login with auth token.' 
+ _TOKEN = None + _EXPIRY = 0 + _API_HEADERS = {'Origin': 'https://www.voot.com', 'Referer': 'https://www.voot.com/'} + + def _perform_login(self, username, password): + if self._TOKEN and self._EXPIRY: + return + + if username.lower() == 'token' and try_call(lambda: jwt_decode_hs256(password)): + VootBaseIE._TOKEN = password + VootBaseIE._EXPIRY = jwt_decode_hs256(password)['exp'] + self.report_login() + + # Mobile number as username is not supported + elif not username.isdigit(): + check_username = self._download_json( + 'https://userauth.voot.com/usersV3/v3/checkUser', None, data=json.dumps({ + 'type': 'email', + 'email': username + }, separators=(',', ':')).encode(), headers={ + **self._API_HEADERS, + 'Content-Type': 'application/json;charset=utf-8', + }, note='Checking username', expected_status=403) + if not traverse_obj(check_username, ('isExist', {bool})): + if traverse_obj(check_username, ('status', 'code', {int})) == 9999: + self.raise_geo_restricted(countries=['IN']) + raise ExtractorError('Incorrect username', expected=True) + auth_token = traverse_obj(self._download_json( + 'https://userauth.voot.com/usersV3/v3/login', None, data=json.dumps({ + 'type': 'traditional', + 'deviceId': str(uuid.uuid4()), + 'deviceBrand': 'PC/MAC', + 'data': { + 'email': username, + 'password': password + } + }, separators=(',', ':')).encode(), headers={ + **self._API_HEADERS, + 'Content-Type': 'application/json;charset=utf-8', + }, note='Logging in', expected_status=400), ('data', 'authToken', {dict})) + if not auth_token: + raise ExtractorError('Incorrect password', expected=True) + VootBaseIE._TOKEN = auth_token['accessToken'] + VootBaseIE._EXPIRY = auth_token['expirationTime'] + + else: + raise ExtractorError(self._LOGIN_HINT, expected=True) + + def _check_token_expiry(self): + if int(time.time()) >= self._EXPIRY: + raise ExtractorError('Access token has expired', expected=True) + + def _real_initialize(self): + if not self._TOKEN: + 
self.raise_login_required(self._LOGIN_HINT, method=None) + self._check_token_expiry() + + +class VootIE(VootBaseIE): _VALID_URL = r'''(?x) (?: voot:| @@ -20,27 +92,25 @@ class VootIE(InfoExtractor): ) (?P<id>\d{3,}) ''' - _GEO_COUNTRIES = ['IN'] _TESTS = [{ 'url': 'https://www.voot.com/shows/ishq-ka-rang-safed/1/360558/is-this-the-end-of-kamini-/441353', 'info_dict': { - 'id': '0_8ledb18o', + 'id': '441353', 'ext': 'mp4', - 'title': 'Ishq Ka Rang Safed - Season 01 - Episode 340', + 'title': 'Is this the end of Kamini?', 'description': 'md5:06291fbbbc4dcbe21235c40c262507c1', - 'timestamp': 1472162937, + 'timestamp': 1472103000, 'upload_date': '20160825', 'series': 'Ishq Ka Rang Safed', 'season_number': 1, 'episode': 'Is this the end of Kamini?', 'episode_number': 340, - 'view_count': int, - 'like_count': int, - }, - 'params': { - 'skip_download': True, + 'release_date': '20160825', + 'season': 'Season 1', + 'age_limit': 13, + 'duration': 1146.0, }, - 'expected_warnings': ['Failed to download m3u8 information'], + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://www.voot.com/kids/characters/mighty-cat-masked-niyander-e-/400478/school-bag-disappears/440925', 'only_matching': True, @@ -55,59 +125,50 @@ class VootIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) media_info = self._download_json( - 'https://wapi.voot.com/ws/ott/getMediaInfo.json', video_id, - query={ - 'platform': 'Web', - 'pId': 2, - 'mediaId': video_id, - }) - - status_code = try_get(media_info, lambda x: x['status']['code'], int) - if status_code != 0: - raise ExtractorError(media_info['status']['message'], expected=True) - - media = media_info['assets'] - - entry_id = media['EntryId'] - title = media['MediaName'] - formats = self._extract_m3u8_formats( - 'https://cdnapisec.kaltura.com/p/1982551/playManifest/pt/https/f/applehttp/t/web/e/' + entry_id, - video_id, 'mp4', m3u8_id='hls') - - description, series, season_number, episode, episode_number = [None] * 5 - - 
for meta in try_get(media, lambda x: x['Metas'], list) or []: - key, value = meta.get('Key'), meta.get('Value') - if not key or not value: - continue - if key == 'ContentSynopsis': - description = value - elif key == 'RefSeriesTitle': - series = value - elif key == 'RefSeriesSeason': - season_number = int_or_none(value) - elif key == 'EpisodeMainTitle': - episode = value - elif key == 'EpisodeNo': - episode_number = int_or_none(value) + 'https://psapi.voot.com/jio/voot/v1/voot-web/content/query/asset-details', video_id, + query={'ids': f'include:{video_id}', 'responseType': 'common'}, headers={'accesstoken': self._TOKEN}) + + try: + m3u8_url = self._download_json( + 'https://vootapi.media.jio.com/playback/v1/playbackrights', video_id, + 'Downloading playback JSON', data=b'{}', headers={ + **self.geo_verification_headers(), + **self._API_HEADERS, + 'Content-Type': 'application/json;charset=utf-8', + 'platform': 'androidwebdesktop', + 'vootid': video_id, + 'voottoken': self._TOKEN, + })['m3u8'] + except ExtractorError as e: + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 400: + self._check_token_expiry() + raise + + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls') + self._remove_duplicate_formats(formats) + return { - 'extractor_key': 'Kaltura', - 'id': entry_id, - 'title': title, - 'description': description, - 'series': series, - 'season_number': season_number, - 'episode': episode, - 'episode_number': episode_number, - 'timestamp': unified_timestamp(media.get('CreationDate')), - 'duration': int_or_none(media.get('Duration')), - 'view_count': int_or_none(media.get('ViewCounter')), - 'like_count': int_or_none(media.get('like_counter')), - 'formats': formats, + 'id': video_id, + # '/_definst_/smil:vod/' m3u8 manifests claim to have 720p+ formats but max out at 480p + 'formats': traverse_obj(formats, ( + lambda _, v: '/_definst_/smil:vod/' not in v['url'] or v['height'] <= 480)), + 'http_headers': self._API_HEADERS, + 
**traverse_obj(media_info, ('result', 0, { + 'title': ('fullTitle', {str}), + 'description': ('fullSynopsis', {str}), + 'series': ('showName', {str}), + 'season_number': ('season', {int_or_none}), + 'episode': ('fullTitle', {str}), + 'episode_number': ('episode', {int_or_none}), + 'timestamp': ('uploadTime', {int_or_none}), + 'release_date': ('telecastDate', {unified_strdate}), + 'age_limit': ('ageNemonic', {parse_age_limit}), + 'duration': ('duration', {float_or_none}), + })), } -class VootSeriesIE(InfoExtractor): +class VootSeriesIE(VootBaseIE): _VALID_URL = r'https?://(?:www\.)?voot\.com/shows/[^/]+/(?P<id>\d{3,})' _TESTS = [{ 'url': 'https://www.voot.com/shows/chakravartin-ashoka-samrat/100002', diff --git a/plugin/yt-dlp/yt_dlp/extractor/vrt.py b/plugin/yt-dlp/yt_dlp/extractor/vrt.py index 6ef6f68..c4ea78e 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/vrt.py +++ b/plugin/yt-dlp/yt_dlp/extractor/vrt.py @@ -1,45 +1,137 @@ -from .common import InfoExtractor +import functools +import json +import time +import urllib.error +import urllib.parse + +from .gigya import GigyaBaseIE from ..utils import ( + ExtractorError, + clean_html, extract_attributes, float_or_none, get_element_by_class, + get_element_html_by_class, + int_or_none, + join_nonempty, + jwt_encode_hs256, + make_archive_id, + parse_age_limit, + parse_iso8601, + str_or_none, strip_or_none, - unified_timestamp, + traverse_obj, + url_or_none, + urlencode_postdata, ) -class VRTIE(InfoExtractor): +class VRTBaseIE(GigyaBaseIE): + _GEO_BYPASS = False + _PLAYER_INFO = { + 'platform': 'desktop', + 'app': { + 'type': 'browser', + 'name': 'Chrome', + }, + 'device': 'undefined (undefined)', + 'os': { + 'name': 'Windows', + 'version': 'x86_64' + }, + 'player': { + 'name': 'VRT web player', + 'version': '2.7.4-prod-2023-04-19T06:05:45' + } + } + # From https://player.vrt.be/vrtnws/js/main.js & https://player.vrt.be/ketnet/js/main.fd1de01a40a1e3d842ea.js + _JWT_KEY_ID = '0-0Fp51UZykfaiCJrfTE3+oMI8zvDteYfPtR+2n1R+z8w=' + 
_JWT_SIGNING_KEY = '2a9251d782700769fb856da5725daf38661874ca6f80ae7dc2b05ec1a81a24ae' + + def _extract_formats_and_subtitles(self, data, video_id): + if traverse_obj(data, 'drm'): + self.report_drm(video_id) + + formats, subtitles = [], {} + for target in traverse_obj(data, ('targetUrls', lambda _, v: url_or_none(v['url']) and v['type'])): + format_type = target['type'].upper() + format_url = target['url'] + if format_type in ('HLS', 'HLS_AES'): + fmts, subs = self._extract_m3u8_formats_and_subtitles( + format_url, video_id, 'mp4', m3u8_id=format_type, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + elif format_type == 'HDS': + formats.extend(self._extract_f4m_formats( + format_url, video_id, f4m_id=format_type, fatal=False)) + elif format_type == 'MPEG_DASH': + fmts, subs = self._extract_mpd_formats_and_subtitles( + format_url, video_id, mpd_id=format_type, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + elif format_type == 'HSS': + fmts, subs = self._extract_ism_formats_and_subtitles( + format_url, video_id, ism_id='mss', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + formats.append({ + 'format_id': format_type, + 'url': format_url, + }) + + for sub in traverse_obj(data, ('subtitleUrls', lambda _, v: v['url'] and v['type'] == 'CLOSED')): + subtitles.setdefault('nl', []).append({'url': sub['url']}) + + return formats, subtitles + + def _call_api(self, video_id, client='null', id_token=None, version='v2'): + player_info = {'exp': (round(time.time(), 3) + 900), **self._PLAYER_INFO} + player_token = self._download_json( + 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v2/tokens', + video_id, 'Downloading player token', headers={ + **self.geo_verification_headers(), + 'Content-Type': 'application/json', + }, data=json.dumps({ + 'identityToken': id_token or {}, + 'playerInfo': jwt_encode_hs256(player_info, 
self._JWT_SIGNING_KEY, headers={ + 'kid': self._JWT_KEY_ID + }).decode() + }, separators=(',', ':')).encode())['vrtPlayerToken'] + + return self._download_json( + f'https://media-services-public.vrt.be/media-aggregator/{version}/media-items/{video_id}', + video_id, 'Downloading API JSON', query={ + 'vrtPlayerToken': player_token, + 'client': client, + }, expected_status=400) + + +class VRTIE(VRTBaseIE): IE_DESC = 'VRT NWS, Flanders News, Flandern Info and Sporza' _VALID_URL = r'https?://(?:www\.)?(?P<site>vrt\.be/vrtnws|sporza\.be)/[a-z]{2}/\d{4}/\d{2}/\d{2}/(?P<id>[^/?&#]+)' _TESTS = [{ 'url': 'https://www.vrt.be/vrtnws/nl/2019/05/15/beelden-van-binnenkant-notre-dame-een-maand-na-de-brand/', - 'md5': 'e1663accf5cf13f375f3cd0d10476669', 'info_dict': { 'id': 'pbs-pub-7855fc7b-1448-49bc-b073-316cb60caa71$vid-2ca50305-c38a-4762-9890-65cbd098b7bd', 'ext': 'mp4', 'title': 'Beelden van binnenkant Notre-Dame, één maand na de brand', - 'description': 'Op maandagavond 15 april ging een deel van het dakgebinte van de Parijse kathedraal in vlammen op.', - 'timestamp': 1557924660, - 'upload_date': '20190515', + 'description': 'md5:6fd85f999b2d1841aa5568f4bf02c3ff', 'duration': 31.2, + 'thumbnail': 'https://images.vrt.be/orig/2019/05/15/2d914d61-7710-11e9-abcc-02b7b76bf47f.jpg', }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://sporza.be/nl/2019/05/15/de-belgian-cats-zijn-klaar-voor-het-ek/', - 'md5': '910bba927566e9ab992278f647eb4b75', 'info_dict': { 'id': 'pbs-pub-f2c86a46-8138-413a-a4b9-a0015a16ce2c$vid-1f112b31-e58e-4379-908d-aca6d80f8818', 'ext': 'mp4', - 'title': 'De Belgian Cats zijn klaar voor het EK mét Ann Wauters', - 'timestamp': 1557923760, - 'upload_date': '20190515', + 'title': 'De Belgian Cats zijn klaar voor het EK', + 'description': 'Video: De Belgian Cats zijn klaar voor het EK mét Ann Wauters | basketbal, sport in het journaal', 'duration': 115.17, + 'thumbnail': 'https://images.vrt.be/orig/2019/05/15/11c0dba3-770e-11e9-abcc-02b7b76bf47f.jpg', }, 
- }, { - 'url': 'https://www.vrt.be/vrtnws/en/2019/05/15/belgium_s-eurovision-entry-falls-at-the-first-hurdle/', - 'only_matching': True, - }, { - 'url': 'https://www.vrt.be/vrtnws/de/2019/05/15/aus-fuer-eliott-im-halbfinale-des-eurosongfestivals/', - 'only_matching': True, + 'params': {'skip_download': 'm3u8'}, }] _CLIENT_MAP = { 'vrt.be/vrtnws': 'vrtnieuws', @@ -49,34 +141,285 @@ class VRTIE(InfoExtractor): def _real_extract(self, url): site, display_id = self._match_valid_url(url).groups() webpage = self._download_webpage(url, display_id) - attrs = extract_attributes(self._search_regex( - r'(<[^>]+class="vrtvideo( [^"]*)?"[^>]*>)', webpage, 'vrt video')) + attrs = extract_attributes(get_element_html_by_class('vrtvideo', webpage) or '') - asset_id = attrs['data-video-id'] - publication_id = attrs.get('data-publication-id') + asset_id = attrs.get('data-video-id') or attrs['data-videoid'] + publication_id = traverse_obj(attrs, 'data-publication-id', 'data-publicationid') if publication_id: - asset_id = publication_id + '$' + asset_id - client = attrs.get('data-client-code') or self._CLIENT_MAP[site] + asset_id = f'{publication_id}${asset_id}' + client = traverse_obj(attrs, 'data-client-code', 'data-client') or self._CLIENT_MAP[site] + + data = self._call_api(asset_id, client) + formats, subtitles = self._extract_formats_and_subtitles(data, asset_id) - title = strip_or_none(get_element_by_class( - 'vrt-title', webpage) or self._html_search_meta( - ['og:title', 'twitter:title', 'name'], webpage)) description = self._html_search_meta( ['og:description', 'twitter:description', 'description'], webpage) if description == '…': description = None - timestamp = unified_timestamp(self._html_search_meta( - 'article:published_time', webpage)) return { - '_type': 'url_transparent', 'id': asset_id, - 'display_id': display_id, - 'title': title, + 'formats': formats, + 'subtitles': subtitles, 'description': description, - 'thumbnail': attrs.get('data-posterimage'), - 'timestamp': 
timestamp, + 'thumbnail': url_or_none(attrs.get('data-posterimage')), 'duration': float_or_none(attrs.get('data-duration'), 1000), - 'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (client, asset_id), - 'ie_key': 'Canvas', + '_old_archive_ids': [make_archive_id('Canvas', asset_id)], + **traverse_obj(data, { + 'title': ('title', {str}), + 'description': ('shortDescription', {str}), + 'duration': ('duration', {functools.partial(float_or_none, scale=1000)}), + 'thumbnail': ('posterImageUrl', {url_or_none}), + }), + } + + +class VrtNUIE(VRTBaseIE): + IE_DESC = 'VRT MAX' + _VALID_URL = r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?:[^/]+/){2}(?P<id>[^/?#&]+)' + _TESTS = [{ + # CONTENT_IS_AGE_RESTRICTED + 'url': 'https://www.vrt.be/vrtnu/a-z/de-ideale-wereld/2023-vj/de-ideale-wereld-d20230116/', + 'info_dict': { + 'id': 'pbs-pub-855b00a8-6ce2-4032-ac4f-1fcf3ae78524$vid-d2243aa1-ec46-4e34-a55b-92568459906f', + 'ext': 'mp4', + 'title': 'Tom Waes', + 'description': 'Satirisch actualiteitenmagazine met Ella Leyers. 
Tom Waes is te gast.', + 'timestamp': 1673905125, + 'release_timestamp': 1673905125, + 'series': 'De ideale wereld', + 'season_id': '1672830988794', + 'episode': 'Aflevering 1', + 'episode_number': 1, + 'episode_id': '1672830988861', + 'display_id': 'de-ideale-wereld-d20230116', + 'channel': 'VRT', + 'duration': 1939.0, + 'thumbnail': 'https://images.vrt.be/orig/2023/01/10/1bb39cb3-9115-11ed-b07d-02b7b76bf47f.jpg', + 'release_date': '20230116', + 'upload_date': '20230116', + 'age_limit': 12, + }, + }, { + 'url': 'https://www.vrt.be/vrtnu/a-z/buurman--wat-doet-u-nu-/6/buurman--wat-doet-u-nu--s6-trailer/', + 'info_dict': { + 'id': 'pbs-pub-ad4050eb-d9e5-48c2-9ec8-b6c355032361$vid-0465537a-34a8-4617-8352-4d8d983b4eee', + 'ext': 'mp4', + 'title': 'Trailer seizoen 6 \'Buurman, wat doet u nu?\'', + 'description': 'md5:197424726c61384b4e5c519f16c0cf02', + 'timestamp': 1652940000, + 'release_timestamp': 1652940000, + 'series': 'Buurman, wat doet u nu?', + 'season': 'Seizoen 6', + 'season_number': 6, + 'season_id': '1652344200907', + 'episode': 'Aflevering 0', + 'episode_number': 0, + 'episode_id': '1652951873524', + 'display_id': 'buurman--wat-doet-u-nu--s6-trailer', + 'channel': 'VRT', + 'duration': 33.13, + 'thumbnail': 'https://images.vrt.be/orig/2022/05/23/3c234d21-da83-11ec-b07d-02b7b76bf47f.jpg', + 'release_date': '20220519', + 'upload_date': '20220519', + }, + 'params': {'skip_download': 'm3u8'}, + }] + _NETRC_MACHINE = 'vrtnu' + _authenticated = False + + def _perform_login(self, username, password): + auth_info = self._gigya_login({ + 'APIKey': '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy', + 'targetEnv': 'jssdk', + 'loginID': username, + 'password': password, + 'authMode': 'cookie', + }) + + if auth_info.get('errorDetails'): + raise ExtractorError(f'Unable to login. 
VrtNU said: {auth_info["errorDetails"]}', expected=True) + + # Sometimes authentication fails for no good reason, retry + for retry in self.RetryManager(): + if retry.attempt > 1: + self._sleep(1, None) + try: + self._request_webpage( + 'https://token.vrt.be/vrtnuinitlogin', None, note='Requesting XSRF Token', + errnote='Could not get XSRF Token', query={ + 'provider': 'site', + 'destination': 'https://www.vrt.be/vrtnu/', + }) + self._request_webpage( + 'https://login.vrt.be/perform_login', None, + note='Performing login', errnote='Login failed', + query={'client_id': 'vrtnu-site'}, data=urlencode_postdata({ + 'UID': auth_info['UID'], + 'UIDSignature': auth_info['UIDSignature'], + 'signatureTimestamp': auth_info['signatureTimestamp'], + '_csrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value, + })) + except ExtractorError as e: + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 401: + retry.error = e + continue + raise + + self._authenticated = True + + def _real_extract(self, url): + display_id = self._match_id(url) + parsed_url = urllib.parse.urlparse(url) + details = self._download_json( + f'{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rstrip("/")}.model.json', + display_id, 'Downloading asset JSON', 'Unable to download asset JSON')['details'] + + watch_info = traverse_obj(details, ( + 'actions', lambda _, v: v['type'] == 'watch-episode', {dict}), get_all=False) or {} + video_id = join_nonempty( + 'episodePublicationId', 'episodeVideoId', delim='$', from_dict=watch_info) + if '$' not in video_id: + raise ExtractorError('Unable to extract video ID') + + vrtnutoken = self._download_json( + 'https://token.vrt.be/refreshtoken', video_id, note='Retrieving vrtnutoken', + errnote='Token refresh failed')['vrtnutoken'] if self._authenticated else None + + video_info = self._call_api(video_id, 'vrtnu-web@PROD', vrtnutoken) + + if 'title' not in video_info: + code = video_info.get('code') + if code in 
('AUTHENTICATION_REQUIRED', 'CONTENT_IS_AGE_RESTRICTED'): + self.raise_login_required(code, method='password') + elif code in ('INVALID_LOCATION', 'CONTENT_AVAILABLE_ONLY_IN_BE'): + self.raise_geo_restricted(countries=['BE']) + elif code == 'CONTENT_AVAILABLE_ONLY_FOR_BE_RESIDENTS_AND_EXPATS': + if not self._authenticated: + self.raise_login_required(code, method='password') + self.raise_geo_restricted(countries=['BE']) + raise ExtractorError(code, expected=True) + + formats, subtitles = self._extract_formats_and_subtitles(video_info, video_id) + + return { + **traverse_obj(details, { + 'title': 'title', + 'description': ('description', {clean_html}), + 'timestamp': ('data', 'episode', 'onTime', 'raw', {parse_iso8601}), + 'release_timestamp': ('data', 'episode', 'onTime', 'raw', {parse_iso8601}), + 'series': ('data', 'program', 'title'), + 'season': ('data', 'season', 'title', 'value'), + 'season_number': ('data', 'season', 'title', 'raw', {int_or_none}), + 'season_id': ('data', 'season', 'id', {str_or_none}), + 'episode': ('data', 'episode', 'number', 'value', {str_or_none}), + 'episode_number': ('data', 'episode', 'number', 'raw', {int_or_none}), + 'episode_id': ('data', 'episode', 'id', {str_or_none}), + 'age_limit': ('data', 'episode', 'age', 'raw', {parse_age_limit}), + }), + 'id': video_id, + 'display_id': display_id, + 'channel': 'VRT', + 'formats': formats, + 'duration': float_or_none(video_info.get('duration'), 1000), + 'thumbnail': url_or_none(video_info.get('posterImageUrl')), + 'subtitles': subtitles, + '_old_archive_ids': [make_archive_id('Canvas', video_id)], + } + + +class KetnetIE(VRTBaseIE): + _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?P<id>(?:[^/]+/)*[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.ketnet.be/kijken/m/meisjes/6/meisjes-s6a5', + 'info_dict': { + 'id': 'pbs-pub-39f8351c-a0a0-43e6-8394-205d597d6162$vid-5e306921-a9aa-4fa9-9f39-5b82c8f1028e', + 'ext': 'mp4', + 'title': 'Meisjes', + 'episode': 'Reeks 6: Week 5', + 'season': 'Reeks 6', 
+ 'series': 'Meisjes', + 'timestamp': 1685251800, + 'upload_date': '20230528', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + video = self._download_json( + 'https://senior-bff.ketnet.be/graphql', display_id, query={ + 'query': '''{ + video(id: "content/ketnet/nl/%s.model.json") { + description + episodeNr + imageUrl + mediaReference + programTitle + publicationDate + seasonTitle + subtitleVideodetail + titleVideodetail + } +}''' % display_id, + })['data']['video'] + + video_id = urllib.parse.unquote(video['mediaReference']) + data = self._call_api(video_id, 'ketnet@PROD', version='v1') + formats, subtitles = self._extract_formats_and_subtitles(data, video_id) + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + '_old_archive_ids': [make_archive_id('Canvas', video_id)], + **traverse_obj(video, { + 'title': ('titleVideodetail', {str}), + 'description': ('description', {str}), + 'thumbnail': ('thumbnail', {url_or_none}), + 'timestamp': ('publicationDate', {parse_iso8601}), + 'series': ('programTitle', {str}), + 'season': ('seasonTitle', {str}), + 'episode': ('subtitleVideodetail', {str}), + 'episode_number': ('episodeNr', {int_or_none}), + }), + } + + +class DagelijkseKostIE(VRTBaseIE): + IE_DESC = 'dagelijksekost.een.be' + _VALID_URL = r'https?://dagelijksekost\.een\.be/gerechten/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof', + 'info_dict': { + 'id': 'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa', + 'ext': 'mp4', + 'title': 'Hachis parmentier met witloof', + 'description': 'md5:9960478392d87f63567b5b117688cdc5', + 'display_id': 'hachis-parmentier-met-witloof', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._html_search_regex( + 
r'data-url=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id', group='id') + + data = self._call_api(video_id, 'dako@prod', version='v1') + formats, subtitles = self._extract_formats_and_subtitles(data, video_id) + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'display_id': display_id, + 'title': strip_or_none(get_element_by_class( + 'dish-metadata__title', webpage) or self._html_search_meta('twitter:title', webpage)), + 'description': clean_html(get_element_by_class( + 'dish-description', webpage)) or self._html_search_meta( + ['description', 'twitter:description', 'og:description'], webpage), + '_old_archive_ids': [make_archive_id('Canvas', video_id)], } diff --git a/plugin/yt-dlp/yt_dlp/extractor/weverse.py b/plugin/yt-dlp/yt_dlp/extractor/weverse.py new file mode 100644 index 0000000..8f2a7ee --- /dev/null +++ b/plugin/yt-dlp/yt_dlp/extractor/weverse.py @@ -0,0 +1,607 @@ +import base64 +import hashlib +import hmac +import itertools +import json +import re +import time +import urllib.error +import urllib.parse +import uuid + +from .common import InfoExtractor +from .naver import NaverBaseIE +from .youtube import YoutubeIE +from ..utils import ( + ExtractorError, + UserNotLive, + float_or_none, + int_or_none, + str_or_none, + traverse_obj, + try_call, + update_url_query, + url_or_none, +) + + +class WeverseBaseIE(InfoExtractor): + _NETRC_MACHINE = 'weverse' + _ACCOUNT_API_BASE = 'https://accountapi.weverse.io/web/api/v2' + _API_HEADERS = { + 'Referer': 'https://weverse.io/', + 'WEV-device-Id': str(uuid.uuid4()), + } + + def _perform_login(self, username, password): + if self._API_HEADERS.get('Authorization'): + return + + headers = { + 'x-acc-app-secret': '5419526f1c624b38b10787e5c10b2a7a', + 'x-acc-app-version': '2.2.6', + 'x-acc-language': 'en', + 'x-acc-service-id': 'weverse', + 'x-acc-trace-id': str(uuid.uuid4()), + 'x-clog-user-device-id': str(uuid.uuid4()), + } + check_username = self._download_json( + 
f'{self._ACCOUNT_API_BASE}/signup/email/status', None, + note='Checking username', query={'email': username}, headers=headers) + if not check_username.get('hasPassword'): + raise ExtractorError('Invalid username provided', expected=True) + + headers['content-type'] = 'application/json' + try: + auth = self._download_json( + f'{self._ACCOUNT_API_BASE}/auth/token/by-credentials', None, data=json.dumps({ + 'email': username, + 'password': password, + }, separators=(',', ':')).encode(), headers=headers, note='Logging in') + except ExtractorError as e: + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 401: + raise ExtractorError('Invalid password provided', expected=True) + raise + + WeverseBaseIE._API_HEADERS['Authorization'] = f'Bearer {auth["accessToken"]}' + + def _real_initialize(self): + if self._API_HEADERS.get('Authorization'): + return + + token = try_call(lambda: self._get_cookies('https://weverse.io/')['we2_access_token'].value) + if not token: + self.raise_login_required() + + WeverseBaseIE._API_HEADERS['Authorization'] = f'Bearer {token}' + + def _call_api(self, ep, video_id, data=None, note='Downloading API JSON'): + # Ref: https://ssl.pstatic.net/static/wevweb/2_3_2_11101725/public/static/js/2488.a09b41ff.chunk.js + # From https://ssl.pstatic.net/static/wevweb/2_3_2_11101725/public/static/js/main.e206f7c1.js: + key = b'1b9cb6378d959b45714bec49971ade22e6e24e42' + api_path = update_url_query(ep, { + 'appId': 'be4d79eb8fc7bd008ee82c8ec4ff6fd4', + 'language': 'en', + 'platform': 'WEB', + 'wpf': 'pc', + }) + wmsgpad = int(time.time() * 1000) + wmd = base64.b64encode(hmac.HMAC( + key, f'{api_path[:255]}{wmsgpad}'.encode(), digestmod=hashlib.sha1).digest()).decode() + headers = {'Content-Type': 'application/json'} if data else {} + try: + return self._download_json( + f'https://global.apis.naver.com/weverse/wevweb{api_path}', video_id, note=note, + data=data, headers={**self._API_HEADERS, **headers}, query={ + 'wmsgpad': wmsgpad, + 'wmd': wmd, 
+ }) + except ExtractorError as e: + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 401: + self.raise_login_required( + 'Session token has expired. Log in again or refresh cookies in browser') + elif isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 403: + raise ExtractorError('Your account does not have access to this content', expected=True) + raise + + def _call_post_api(self, video_id): + return self._call_api(f'/post/v1.0/post-{video_id}?fieldSet=postV1', video_id) + + def _get_community_id(self, channel): + return str(self._call_api( + f'/community/v1.0/communityIdUrlPathByUrlPathArtistCode?keyword={channel}', + channel, note='Fetching community ID')['communityId']) + + def _get_formats(self, data, video_id): + formats = traverse_obj(data, ('videos', 'list', lambda _, v: url_or_none(v['source']), { + 'url': 'source', + 'width': ('encodingOption', 'width', {int_or_none}), + 'height': ('encodingOption', 'height', {int_or_none}), + 'vcodec': 'type', + 'vbr': ('bitrate', 'video', {int_or_none}), + 'abr': ('bitrate', 'audio', {int_or_none}), + 'filesize': ('size', {int_or_none}), + 'format_id': ('encodingOption', 'id', {str_or_none}), + })) + + for stream in traverse_obj(data, ('streams', lambda _, v: v['type'] == 'HLS' and url_or_none(v['source']))): + query = {} + for param in traverse_obj(stream, ('keys', lambda _, v: v['type'] == 'param' and v['name'])): + query[param['name']] = param.get('value', '') + fmts = self._extract_m3u8_formats( + stream['source'], video_id, 'mp4', m3u8_id='hls', fatal=False, query=query) + if query: + for fmt in fmts: + fmt['url'] = update_url_query(fmt['url'], query) + fmt['extra_param_to_segment_url'] = urllib.parse.urlencode(query) + formats.extend(fmts) + + return formats + + def _get_subs(self, caption_url): + subs_ext_re = r'\.(?:ttml|vtt)' + replace_ext = lambda x, y: re.sub(subs_ext_re, y, x) + if re.search(subs_ext_re, caption_url): + return [replace_ext(caption_url, '.ttml'), 
replace_ext(caption_url, '.vtt')] + return [caption_url] + + def _parse_post_meta(self, metadata): + return traverse_obj(metadata, { + 'title': ((('extension', 'mediaInfo', 'title'), 'title'), {str}), + 'description': ((('extension', 'mediaInfo', 'body'), 'body'), {str}), + 'uploader': ('author', 'profileName', {str}), + 'uploader_id': ('author', 'memberId', {str}), + 'creator': ('community', 'communityName', {str}), + 'channel_id': (('community', 'author'), 'communityId', {str_or_none}), + 'duration': ('extension', 'video', 'playTime', {float_or_none}), + 'timestamp': ('publishedAt', {lambda x: int_or_none(x, 1000)}), + 'release_timestamp': ('extension', 'video', 'onAirStartAt', {lambda x: int_or_none(x, 1000)}), + 'thumbnail': ('extension', (('mediaInfo', 'thumbnail', 'url'), ('video', 'thumb')), {url_or_none}), + 'view_count': ('extension', 'video', 'playCount', {int_or_none}), + 'like_count': ('extension', 'video', 'likeCount', {int_or_none}), + 'comment_count': ('commentCount', {int_or_none}), + }, get_all=False) + + def _extract_availability(self, data): + return self._availability(**traverse_obj(data, ((('extension', 'video'), None), { + 'needs_premium': 'paid', + 'needs_subscription': 'membershipOnly', + }), get_all=False, expected_type=bool), needs_auth=True) + + def _extract_live_status(self, data): + data = traverse_obj(data, ('extension', 'video', {dict})) or {} + if data.get('type') == 'LIVE': + return traverse_obj({ + 'ONAIR': 'is_live', + 'DONE': 'post_live', + 'STANDBY': 'is_upcoming', + 'DELAY': 'is_upcoming', + }, (data.get('status'), {str})) or 'is_live' + return 'was_live' if data.get('liveToVod') else 'not_live' + + +class WeverseIE(WeverseBaseIE): + _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P<artist>[^/?#]+)/live/(?P<id>[\d-]+)' + _TESTS = [{ + 'url': 'https://weverse.io/billlie/live/0-107323480', + 'md5': '1fa849f00181eef9100d3c8254c47979', + 'info_dict': { + 'id': '0-107323480', + 'ext': 'mp4', + 'title': '행복한 평이루💜', + 
'description': '', + 'uploader': 'Billlie', + 'uploader_id': '5ae14aed7b7cdc65fa87c41fe06cc936', + 'channel': 'billlie', + 'channel_id': '72', + 'channel_url': 'https://weverse.io/billlie', + 'creator': 'Billlie', + 'timestamp': 1666262062, + 'upload_date': '20221020', + 'release_timestamp': 1666262058, + 'release_date': '20221020', + 'duration': 3102, + 'thumbnail': r're:^https?://.*\.jpe?g$', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'availability': 'needs_auth', + 'live_status': 'was_live', + }, + }, { + 'url': 'https://weverse.io/lesserafim/live/2-102331763', + 'md5': 'e46125c08b13a6c8c1f4565035cca987', + 'info_dict': { + 'id': '2-102331763', + 'ext': 'mp4', + 'title': '🎂김채원 생신🎂', + 'description': '🎂김채원 생신🎂', + 'uploader': 'LE SSERAFIM ', + 'uploader_id': 'd26ddc1e258488a0a2b795218d14d59d', + 'channel': 'lesserafim', + 'channel_id': '47', + 'channel_url': 'https://weverse.io/lesserafim', + 'creator': 'LE SSERAFIM', + 'timestamp': 1659353400, + 'upload_date': '20220801', + 'release_timestamp': 1659353400, + 'release_date': '20220801', + 'duration': 3006, + 'thumbnail': r're:^https?://.*\.jpe?g$', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'availability': 'needs_auth', + 'live_status': 'was_live', + 'subtitles': { + 'id_ID': 'count:2', + 'en_US': 'count:2', + 'es_ES': 'count:2', + 'vi_VN': 'count:2', + 'th_TH': 'count:2', + 'zh_CN': 'count:2', + 'zh_TW': 'count:2', + 'ja_JP': 'count:2', + 'ko_KR': 'count:2', + }, + }, + }, { + 'url': 'https://weverse.io/treasure/live/2-117230416', + 'info_dict': { + 'id': '2-117230416', + 'ext': 'mp4', + 'title': r're:스껄도려님 첫 스무살 생파🦋', + 'description': '', + 'uploader': 'TREASURE', + 'uploader_id': '77eabbc449ca37f7970054a136f60082', + 'channel': 'treasure', + 'channel_id': '20', + 'channel_url': 'https://weverse.io/treasure', + 'creator': 'TREASURE', + 'timestamp': 1680667651, + 'upload_date': '20230405', + 'release_timestamp': 1680667639, + 'release_date': '20230405', + 
'thumbnail': r're:^https?://.*\.jpe?g$', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'availability': 'needs_auth', + 'live_status': 'is_live', + }, + 'skip': 'Livestream has ended', + }] + + def _real_extract(self, url): + channel, video_id = self._match_valid_url(url).group('artist', 'id') + post = self._call_post_api(video_id) + api_video_id = post['extension']['video']['videoId'] + availability = self._extract_availability(post) + live_status = self._extract_live_status(post) + video_info, formats = {}, [] + + if live_status == 'is_upcoming': + self.raise_no_formats('Livestream has not yet started', expected=True) + + elif live_status == 'is_live': + video_info = self._call_api( + f'/video/v1.0/lives/{api_video_id}/playInfo?preview.format=json&preview.version=v2', + video_id, note='Downloading live JSON') + playback = self._parse_json(video_info['lipPlayback'], video_id) + m3u8_url = traverse_obj(playback, ( + 'media', lambda _, v: v['protocol'] == 'HLS', 'path', {url_or_none}), get_all=False) + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls', live=True) + + elif live_status == 'post_live': + if availability in ('premium_only', 'subscriber_only'): + self.report_drm(video_id) + self.raise_no_formats( + 'Livestream has ended and downloadable VOD is not available', expected=True) + + else: + infra_video_id = post['extension']['video']['infraVideoId'] + in_key = self._call_api( + f'/video/v1.0/vod/{api_video_id}/inKey?preview=false', video_id, + data=b'{}', note='Downloading VOD API key')['inKey'] + + video_info = self._download_json( + f'https://global.apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/{infra_video_id}', + video_id, note='Downloading VOD JSON', query={ + 'key': in_key, + 'sid': traverse_obj(post, ('extension', 'video', 'serviceId')) or '2070', + 'pid': str(uuid.uuid4()), + 'nonce': int(time.time() * 1000), + 'devt': 'html5_pc', + 'prv': 'Y' if post.get('membershipOnly') else 'N', + 'aup': 'N', + 'stpb': 
'N', + 'cpl': 'en', + 'env': 'prod', + 'lc': 'en', + 'adi': '[{"adSystem":"null"}]', + 'adu': '/', + }) + + formats = self._get_formats(video_info, video_id) + has_drm = traverse_obj(video_info, ('meta', 'provider', 'name', {str.lower})) == 'drm' + if has_drm and formats: + self.report_warning( + 'Requested content is DRM-protected, only a 30-second preview is available', video_id) + elif has_drm and not formats: + self.report_drm(video_id) + + return { + 'id': video_id, + 'channel': channel, + 'channel_url': f'https://weverse.io/{channel}', + 'formats': formats, + 'availability': availability, + 'live_status': live_status, + **self._parse_post_meta(post), + **NaverBaseIE.process_subtitles(video_info, self._get_subs), + } + + +class WeverseMediaIE(WeverseBaseIE): + _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P<artist>[^/?#]+)/media/(?P<id>[\d-]+)' + _TESTS = [{ + 'url': 'https://weverse.io/billlie/media/4-116372884', + 'md5': '8efc9cfd61b2f25209eb1a5326314d28', + 'info_dict': { + 'id': 'e-C9wLSQs6o', + 'ext': 'mp4', + 'title': 'Billlie | \'EUNOIA\' Performance Video (heartbeat ver.)', + 'description': 'md5:6181caaf2a2397bca913ffe368c104e5', + 'channel': 'Billlie', + 'channel_id': 'UCyc9sUCxELTDK9vELO5Fzeg', + 'channel_url': 'https://www.youtube.com/channel/UCyc9sUCxELTDK9vELO5Fzeg', + 'uploader': 'Billlie', + 'uploader_id': '@Billlie', + 'uploader_url': 'http://www.youtube.com/@Billlie', + 'upload_date': '20230403', + 'duration': 211, + 'age_limit': 0, + 'playable_in_embed': True, + 'live_status': 'not_live', + 'availability': 'public', + 'view_count': int, + 'comment_count': int, + 'like_count': int, + 'channel_follower_count': int, + 'thumbnail': 'https://i.ytimg.com/vi/e-C9wLSQs6o/maxresdefault.jpg', + 'categories': ['Entertainment'], + 'tags': 'count:7', + }, + }, { + 'url': 'https://weverse.io/billlie/media/3-102914520', + 'md5': '031551fcbd716bc4f080cb6174a43d8a', + 'info_dict': { + 'id': '3-102914520', + 'ext': 'mp4', + 'title': 'From. 
SUHYEON🌸', + 'description': 'Billlie 멤버별 독점 영상 공개💙💜', + 'uploader': 'Billlie_official', + 'uploader_id': 'f569c6e92f7eaffef0a395037dcaa54f', + 'channel': 'billlie', + 'channel_id': '72', + 'channel_url': 'https://weverse.io/billlie', + 'creator': 'Billlie', + 'timestamp': 1662174000, + 'upload_date': '20220903', + 'release_timestamp': 1662174000, + 'release_date': '20220903', + 'duration': 17.0, + 'thumbnail': r're:^https?://.*\.jpe?g$', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'availability': 'needs_auth', + 'live_status': 'not_live', + }, + }] + + def _real_extract(self, url): + channel, video_id = self._match_valid_url(url).group('artist', 'id') + post = self._call_post_api(video_id) + media_type = traverse_obj(post, ('extension', 'mediaInfo', 'mediaType', {str.lower})) + youtube_id = traverse_obj(post, ('extension', 'youtube', 'youtubeVideoId', {str})) + + if media_type == 'vod': + return self.url_result(f'https://weverse.io/{channel}/live/{video_id}', WeverseIE) + elif media_type == 'youtube' and youtube_id: + return self.url_result(youtube_id, YoutubeIE) + elif media_type == 'image': + self.raise_no_formats('No video content found in webpage', expected=True) + elif media_type: + raise ExtractorError(f'Unsupported media type "{media_type}"') + + self.raise_no_formats('No video content found in webpage') + + +class WeverseMomentIE(WeverseBaseIE): + _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P<artist>[^/?#]+)/moment/(?P<uid>[\da-f]+)/post/(?P<id>[\d-]+)' + _TESTS = [{ + 'url': 'https://weverse.io/secretnumber/moment/66a07e164b56a696ee71c99315ffe27b/post/1-117229444', + 'md5': '87733ac19a54081b7dfc2442036d282b', + 'info_dict': { + 'id': '1-117229444', + 'ext': 'mp4', + 'title': '今日もめっちゃいい天気☀️🌤️', + 'uploader': '레아', + 'uploader_id': '66a07e164b56a696ee71c99315ffe27b', + 'channel': 'secretnumber', + 'channel_id': '56', + 'creator': 'SECRET NUMBER', + 'duration': 10, + 'upload_date': '20230405', + 'timestamp': 1680653968, + 
'thumbnail': r're:^https?://.*\.jpe?g$', + 'like_count': int, + 'comment_count': int, + 'availability': 'needs_auth', + }, + 'skip': 'Moment has expired', + }] + + def _real_extract(self, url): + channel, uploader_id, video_id = self._match_valid_url(url).group('artist', 'uid', 'id') + post = self._call_post_api(video_id) + api_video_id = post['extension']['moment']['video']['videoId'] + video_info = self._call_api( + f'/cvideo/v1.0/cvideo-{api_video_id}/playInfo?videoId={api_video_id}', video_id, + note='Downloading moment JSON')['playInfo'] + + return { + 'id': video_id, + 'channel': channel, + 'uploader_id': uploader_id, + 'formats': self._get_formats(video_info, video_id), + 'availability': self._extract_availability(post), + **traverse_obj(post, { + 'title': ((('extension', 'moment', 'body'), 'body'), {str}), + 'uploader': ('author', 'profileName', {str}), + 'creator': (('community', 'author'), 'communityName', {str}), + 'channel_id': (('community', 'author'), 'communityId', {str_or_none}), + 'duration': ('extension', 'moment', 'video', 'uploadInfo', 'playTime', {float_or_none}), + 'timestamp': ('publishedAt', {lambda x: int_or_none(x, 1000)}), + 'thumbnail': ('extension', 'moment', 'video', 'uploadInfo', 'imageUrl', {url_or_none}), + 'like_count': ('emotionCount', {int_or_none}), + 'comment_count': ('commentCount', {int_or_none}), + }, get_all=False), + **NaverBaseIE.process_subtitles(video_info, self._get_subs), + } + + +class WeverseTabBaseIE(WeverseBaseIE): + _ENDPOINT = None + _PATH = None + _QUERY = {} + _RESULT_IE = None + + def _entries(self, channel_id, channel, first_page): + query = self._QUERY.copy() + + for page in itertools.count(1): + posts = first_page if page == 1 else self._call_api( + update_url_query(self._ENDPOINT % channel_id, query), channel, + note=f'Downloading {self._PATH} tab page {page}') + + for post in traverse_obj(posts, ('data', lambda _, v: v['postId'])): + yield self.url_result( + 
f'https://weverse.io/{channel}/{self._PATH}/{post["postId"]}', + self._RESULT_IE, post['postId'], **self._parse_post_meta(post), + channel=channel, channel_url=f'https://weverse.io/{channel}', + availability=self._extract_availability(post), + live_status=self._extract_live_status(post)) + + query['after'] = traverse_obj(posts, ('paging', 'nextParams', 'after', {str})) + if not query['after']: + break + + def _real_extract(self, url): + channel = self._match_id(url) + channel_id = self._get_community_id(channel) + + first_page = self._call_api( + update_url_query(self._ENDPOINT % channel_id, self._QUERY), channel, + note=f'Downloading {self._PATH} tab page 1') + + return self.playlist_result( + self._entries(channel_id, channel, first_page), f'{channel}-{self._PATH}', + **traverse_obj(first_page, ('data', ..., { + 'playlist_title': ('community', 'communityName', {str}), + 'thumbnail': ('author', 'profileImageUrl', {url_or_none}), + }), get_all=False)) + + +class WeverseLiveTabIE(WeverseTabBaseIE): + _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P<id>[^/?#]+)/live/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://weverse.io/billlie/live/', + 'playlist_mincount': 55, + 'info_dict': { + 'id': 'billlie-live', + 'title': 'Billlie', + 'thumbnail': r're:^https?://.*\.jpe?g$', + }, + }] + + _ENDPOINT = '/post/v1.0/community-%s/liveTabPosts' + _PATH = 'live' + _QUERY = {'fieldSet': 'postsV1'} + _RESULT_IE = WeverseIE + + +class WeverseMediaTabIE(WeverseTabBaseIE): + _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P<id>[^/?#]+)/media(?:/|/all|/new)?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://weverse.io/billlie/media/', + 'playlist_mincount': 231, + 'info_dict': { + 'id': 'billlie-media', + 'title': 'Billlie', + 'thumbnail': r're:^https?://.*\.jpe?g$', + }, + }, { + 'url': 'https://weverse.io/lesserafim/media/all', + 'only_matching': True, + }, { + 'url': 'https://weverse.io/lesserafim/media/new', + 'only_matching': True, + }] + + _ENDPOINT = 
'/media/v1.0/community-%s/more' + _PATH = 'media' + _QUERY = {'fieldSet': 'postsV1', 'filterType': 'RECENT'} + _RESULT_IE = WeverseMediaIE + + +class WeverseLiveIE(WeverseBaseIE): + _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P<id>[^/?#]+)/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://weverse.io/purplekiss', + 'info_dict': { + 'id': '3-116560493', + 'ext': 'mp4', + 'title': r're:모하냥🫶🏻', + 'description': '내일은 금요일~><', + 'uploader': '채인', + 'uploader_id': '1ffb1d9d904d6b3db2783f876eb9229d', + 'channel': 'purplekiss', + 'channel_id': '35', + 'channel_url': 'https://weverse.io/purplekiss', + 'creator': 'PURPLE KISS', + 'timestamp': 1680780892, + 'upload_date': '20230406', + 'release_timestamp': 1680780883, + 'release_date': '20230406', + 'thumbnail': 'https://weverse-live.pstatic.net/v1.0/live/62044/thumb', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'availability': 'needs_auth', + 'live_status': 'is_live', + }, + 'skip': 'Livestream has ended', + }, { + 'url': 'https://weverse.io/billlie/', + 'only_matching': True, + }] + + def _real_extract(self, url): + channel = self._match_id(url) + channel_id = self._get_community_id(channel) + + video_id = traverse_obj( + self._call_api(update_url_query(f'/post/v1.0/community-{channel_id}/liveTab', { + 'debugMessage': 'true', + 'fields': 'onAirLivePosts.fieldSet(postsV1).limit(10),reservedLivePosts.fieldSet(postsV1).limit(10)', + }), channel, note='Downloading live JSON'), ( + ('onAirLivePosts', 'reservedLivePosts'), 'data', + lambda _, v: self._extract_live_status(v) in ('is_live', 'is_upcoming'), 'postId', {str}), + get_all=False) + + if not video_id: + raise UserNotLive(video_id=channel) + + return self.url_result(f'https://weverse.io/{channel}/live/{video_id}', WeverseIE) diff --git a/plugin/yt-dlp/yt_dlp/extractor/weyyak.py b/plugin/yt-dlp/yt_dlp/extractor/weyyak.py new file mode 100644 index 0000000..ef12be8 --- /dev/null +++ b/plugin/yt-dlp/yt_dlp/extractor/weyyak.py @@ -0,0 +1,86 @@ +from 
.common import InfoExtractor +from ..utils import ( + float_or_none, + int_or_none, + parse_age_limit, + traverse_obj, + unified_timestamp, + url_or_none, +) + + +class WeyyakIE(InfoExtractor): + _VALID_URL = r'https?://weyyak\.com/(?P<lang>\w+)/(?:player/)?(?P<type>episode|movie)/(?P<id>\d+)' + _TESTS = [ + { + 'url': 'https://weyyak.com/en/player/episode/1341952/Ribat-Al-Hob-Episode49', + 'md5': '0caf55c1a615531c8fe60f146ae46849', + 'info_dict': { + 'id': '1341952', + 'ext': 'mp4', + 'title': 'Ribat Al Hob', + 'duration': 2771, + 'alt_title': 'رباط الحب', + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Episode 49', + 'episode_number': 49, + 'timestamp': 1485907200, + 'upload_date': '20170201', + 'thumbnail': r're:^https://content\.weyyak\.com/.+/poster-image', + 'categories': ['Drama', 'Thrillers', 'Romance'], + 'tags': 'count:8', + }, + }, + { + 'url': 'https://weyyak.com/en/movie/233255/8-Seconds', + 'md5': 'fe740ae0f63e4d1c8a7fc147a410c564', + 'info_dict': { + 'id': '233255', + 'ext': 'mp4', + 'title': '8 Seconds', + 'duration': 6490, + 'alt_title': '8 ثواني', + 'description': 'md5:45b83a155c30b49950624c7e99600b9d', + 'age_limit': 15, + 'release_year': 2015, + 'timestamp': 1683106031, + 'upload_date': '20230503', + 'thumbnail': r're:^https://content\.weyyak\.com/.+/poster-image', + 'categories': ['Drama', 'Social'], + 'cast': ['Ceylin Adiyaman', 'Esra Inal'], + }, + }, + ] + + def _real_extract(self, url): + video_id, lang, type_ = self._match_valid_url(url).group('id', 'lang', 'type') + + path = 'episode/' if type_ == 'episode' else 'contents/moviedetails?contentkey=' + data = self._download_json( + f'https://msapifo-prod-me.weyyak.z5.com/v1/{lang}/{path}{video_id}', video_id)['data'] + m3u8_url = self._download_json( + f'https://api-weyyak.akamaized.net/get_info/{data["video_id"]}', + video_id, 'Extracting video details')['url_video'] + formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id) + + return { + 'id': 
video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(data, { + 'title': ('title', {str}), + 'alt_title': ('translated_title', {str}), + 'description': ('synopsis', {str}), + 'duration': ('length', {float_or_none}), + 'age_limit': ('age_rating', {parse_age_limit}), + 'season_number': ('season_number', {int_or_none}), + 'episode_number': ('episode_number', {int_or_none}), + 'thumbnail': ('imagery', 'thumbnail', {url_or_none}), + 'categories': ('genres', ..., {str}), + 'tags': ('tags', ..., {str}), + 'cast': (('main_actor', 'main_actress'), {str}), + 'timestamp': ('insertedAt', {unified_timestamp}), + 'release_year': ('production_year', {int_or_none}), + }), + } diff --git a/plugin/yt-dlp/yt_dlp/extractor/wrestleuniverse.py b/plugin/yt-dlp/yt_dlp/extractor/wrestleuniverse.py index a9c0399..43960ae 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/wrestleuniverse.py +++ b/plugin/yt-dlp/yt_dlp/extractor/wrestleuniverse.py @@ -41,7 +41,7 @@ def _TOKEN(self): token = try_call(lambda: self._get_cookies('https://www.wrestle-universe.com/')['token'].value) if not token and not self._REFRESH_TOKEN: self.raise_login_required() - self._REAL_TOKEN = token + self._TOKEN = token if not self._REAL_TOKEN or self._TOKEN_EXPIRY <= int(time.time()): if not self._REFRESH_TOKEN: diff --git a/plugin/yt-dlp/yt_dlp/extractor/wykop.py b/plugin/yt-dlp/yt_dlp/extractor/wykop.py new file mode 100644 index 0000000..0fa6d52 --- /dev/null +++ b/plugin/yt-dlp/yt_dlp/extractor/wykop.py @@ -0,0 +1,268 @@ +import json +import urllib.error + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + format_field, + parse_iso8601, + traverse_obj, + url_or_none, +) + + +class WykopBaseExtractor(InfoExtractor): + def _get_token(self, force_refresh=False): + if not force_refresh: + maybe_cached = self.cache.load('wykop', 'bearer') + if maybe_cached: + return maybe_cached + + new_token = traverse_obj( + self._do_call_api('auth', None, 'Downloading anonymous auth token', 
data={ + # hardcoded in frontend + 'key': 'w53947240748', + 'secret': 'd537d9e0a7adc1510842059ae5316419', + }), ('data', 'token')) + + self.cache.store('wykop', 'bearer', new_token) + return new_token + + def _do_call_api(self, path, video_id, note='Downloading JSON metadata', data=None, headers={}): + if data: + data = json.dumps({'data': data}).encode() + headers['Content-Type'] = 'application/json' + + return self._download_json( + f'https://wykop.pl/api/v3/{path}', video_id, + note=note, data=data, headers=headers) + + def _call_api(self, path, video_id, note='Downloading JSON metadata'): + token = self._get_token() + for retrying in range(2): + try: + return self._do_call_api(path, video_id, note, headers={'Authorization': f'Bearer {token}'}) + except ExtractorError as e: + if not retrying and isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 403: + token = self._get_token(True) + continue + raise + + def _common_data_extract(self, data): + author = traverse_obj(data, ('author', 'username'), expected_type=str) + + return { + '_type': 'url_transparent', + 'display_id': data.get('slug'), + 'url': traverse_obj(data, + ('media', 'embed', 'url'), # what gets an iframe embed + ('source', 'url'), # clickable url (dig only) + expected_type=url_or_none), + 'thumbnail': traverse_obj( + data, ('media', 'photo', 'url'), ('media', 'embed', 'thumbnail'), expected_type=url_or_none), + 'uploader': author, + 'uploader_id': author, + 'uploader_url': format_field(author, None, 'https://wykop.pl/ludzie/%s'), + 'timestamp': parse_iso8601(data.get('created_at'), delimiter=' '), # time it got submitted + 'like_count': traverse_obj(data, ('votes', 'up'), expected_type=int), + 'dislike_count': traverse_obj(data, ('votes', 'down'), expected_type=int), + 'comment_count': traverse_obj(data, ('comments', 'count'), expected_type=int), + 'age_limit': 18 if data.get('adult') else 0, + 'tags': data.get('tags'), + } + + +class WykopDigIE(WykopBaseExtractor): + IE_NAME = 
'wykop:dig' + _VALID_URL = r'https?://(?:www\.)?wykop\.pl/link/(?P<id>\d+)' + + _TESTS = [{ + 'url': 'https://wykop.pl/link/6912923/najbardziej-zrzedliwy-kot-na-swiecie-i-frozen-planet-ii-i-bbc-earth', + 'info_dict': { + 'id': 'rlSTBvViflc', + 'ext': 'mp4', + 'title': 'Najbardziej zrzędliwy kot na świecie I Frozen Planet II I BBC Earth', + 'display_id': 'najbardziej-zrzedliwy-kot-na-swiecie-i-frozen-planet-ii-i-bbc-earth', + 'description': 'md5:ac0f87dea1cdcb6b0c53f3612a095c87', + 'tags': ['zwierzaczki', 'koty', 'smiesznykotek', 'humor', 'rozrywka', 'ciekawostki'], + 'age_limit': 0, + 'timestamp': 1669154480, + 'release_timestamp': 1669194241, + 'release_date': '20221123', + 'uploader': 'starnak', + 'uploader_id': 'starnak', + 'uploader_url': 'https://wykop.pl/ludzie/starnak', + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'thumbnail': r're:https?://wykop\.pl/cdn/.+', + 'view_count': int, + 'channel': 'BBC Earth', + 'channel_id': 'UCwmZiChSryoWQCZMIQezgTg', + 'channel_url': 'https://www.youtube.com/channel/UCwmZiChSryoWQCZMIQezgTg', + 'categories': ['Pets & Animals'], + 'upload_date': '20220923', + 'duration': 191, + 'channel_follower_count': int, + 'availability': 'public', + 'live_status': 'not_live', + 'playable_in_embed': True, + }, + }] + + @classmethod + def suitable(cls, url): + return cls._match_valid_url(url) and not WykopDigCommentIE.suitable(url) + + def _real_extract(self, url): + video_id = self._match_id(url) + data = self._call_api(f'links/{video_id}', video_id)['data'] + + return { + **self._common_data_extract(data), + 'id': video_id, + 'title': data['title'], + 'description': data.get('description'), + # time it got "digged" to the homepage + 'release_timestamp': parse_iso8601(data.get('published_at'), delimiter=' '), + } + + +class WykopDigCommentIE(WykopBaseExtractor): + IE_NAME = 'wykop:dig:comment' + _VALID_URL = r'https?://(?:www\.)?wykop\.pl/link/(?P<dig_id>\d+)/[^/]+/komentarz/(?P<id>\d+)' + + _TESTS = [{ + 'url': 
'https://wykop.pl/link/6992589/strollowal-oszusta-przez-ponad-24-minuty-udawal-naiwniaka-i-nagral-rozmowe/komentarz/114540527/podobna-sytuacja-ponizej-ciekawa-dyskusja-z-oszustem-na-sam-koniec-sam-bylem-w-biurze-swiadkiem-podobnej-rozmowy-niemal-zakonczonej-sukcesem-bandyty-g', + 'info_dict': { + 'id': 'u6tEi2FmKZY', + 'ext': 'mp4', + 'title': 'md5:e7c741c5baa7ed6478000caf72865577', + 'display_id': 'md5:45b2d12bd0e262d09cc7cf7abc8412db', + 'description': 'md5:bcec7983429f9c0630f9deb9d3d1ba5e', + 'timestamp': 1674476945, + 'uploader': 'Bartholomew', + 'uploader_id': 'Bartholomew', + 'uploader_url': 'https://wykop.pl/ludzie/Bartholomew', + 'thumbnail': r're:https?://wykop\.pl/cdn/.+', + 'tags': [], + 'availability': 'public', + 'duration': 1838, + 'upload_date': '20230117', + 'categories': ['Entertainment'], + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'channel_follower_count': int, + 'playable_in_embed': True, + 'live_status': 'not_live', + 'age_limit': 0, + 'chapters': 'count:3', + 'channel': 'Poszukiwacze Okazji', + 'channel_id': 'UCzzvJDZThwv06dR4xmzrZBw', + 'channel_url': 'https://www.youtube.com/channel/UCzzvJDZThwv06dR4xmzrZBw', + }, + }] + + def _real_extract(self, url): + dig_id, comment_id = self._search_regex(self._VALID_URL, url, 'dig and comment ids', group=('dig_id', 'id')) + data = self._call_api(f'links/{dig_id}/comments/{comment_id}', comment_id)['data'] + + return { + **self._common_data_extract(data), + 'id': comment_id, + 'title': f"{traverse_obj(data, ('author', 'username'))} - {data.get('content') or ''}", + 'description': data.get('content'), + } + + +class WykopPostIE(WykopBaseExtractor): + IE_NAME = 'wykop:post' + _VALID_URL = r'https?://(?:www\.)?wykop\.pl/wpis/(?P<id>\d+)' + + _TESTS = [{ + 'url': 'https://wykop.pl/wpis/68893343/kot-koty-smiesznykotek', + 'info_dict': { + 'id': 'PL8JMjiUPHUhwc9ZlKa_5IFeBwBV8Xe7jI', + 'title': 'PawelW124 - #kot #koty #smiesznykotek', + 'description': '#kot #koty 
#smiesznykotek', + 'display_id': 'kot-koty-smiesznykotek', + 'tags': ['kot', 'koty', 'smiesznykotek'], + 'uploader': 'PawelW124', + 'uploader_id': 'PawelW124', + 'uploader_url': 'https://wykop.pl/ludzie/PawelW124', + 'timestamp': 1668938142, + 'age_limit': 0, + 'like_count': int, + 'dislike_count': int, + 'thumbnail': r're:https?://wykop\.pl/cdn/.+', + 'comment_count': int, + 'channel': 'Revan', + 'channel_id': 'UCW9T_-uZoiI7ROARQdTDyOw', + 'channel_url': 'https://www.youtube.com/channel/UCW9T_-uZoiI7ROARQdTDyOw', + 'upload_date': '20221120', + 'modified_date': '20220814', + 'availability': 'public', + 'view_count': int, + }, + 'playlist_mincount': 15, + 'params': { + 'flat_playlist': True, + } + }] + + @classmethod + def suitable(cls, url): + return cls._match_valid_url(url) and not WykopPostCommentIE.suitable(url) + + def _real_extract(self, url): + video_id = self._match_id(url) + data = self._call_api(f'entries/{video_id}', video_id)['data'] + + return { + **self._common_data_extract(data), + 'id': video_id, + 'title': f"{traverse_obj(data, ('author', 'username'))} - {data.get('content') or ''}", + 'description': data.get('content'), + } + + +class WykopPostCommentIE(WykopBaseExtractor): + IE_NAME = 'wykop:post:comment' + _VALID_URL = r'https?://(?:www\.)?wykop\.pl/wpis/(?P<post_id>\d+)/[^/#]+#(?P<id>\d+)' + + _TESTS = [{ + 'url': 'https://wykop.pl/wpis/70084873/test-test-test#249303979', + 'info_dict': { + 'id': 'confusedquickarmyant', + 'ext': 'mp4', + 'title': 'tpap - treść komentarza', + 'display_id': 'tresc-komentarza', + 'description': 'treść komentarza', + 'uploader': 'tpap', + 'uploader_id': 'tpap', + 'uploader_url': 'https://wykop.pl/ludzie/tpap', + 'timestamp': 1675349470, + 'upload_date': '20230202', + 'tags': [], + 'duration': 2.12, + 'age_limit': 0, + 'categories': [], + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'thumbnail': r're:https?://wykop\.pl/cdn/.+', + }, + }] + + def _real_extract(self, url): + post_id, comment_id = 
self._search_regex(self._VALID_URL, url, 'post and comment ids', group=('post_id', 'id')) + data = self._call_api(f'entries/{post_id}/comments/{comment_id}', comment_id)['data'] + + return { + **self._common_data_extract(data), + 'id': comment_id, + 'title': f"{traverse_obj(data, ('author', 'username'))} - {data.get('content') or ''}", + 'description': data.get('content'), + } diff --git a/plugin/yt-dlp/yt_dlp/extractor/ximalaya.py b/plugin/yt-dlp/yt_dlp/extractor/ximalaya.py index acc0871..3008948 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/ximalaya.py +++ b/plugin/yt-dlp/yt_dlp/extractor/ximalaya.py @@ -158,7 +158,7 @@ def _fetch_page(self, playlist_id, page_idx): return self._download_json( 'https://www.ximalaya.com/revision/album/v1/getTracksList', playlist_id, note=f'Downloading tracks list page {page_idx}', - query={'albumId': playlist_id, 'pageNum': page_idx, 'sort': 1})['data'] + query={'albumId': playlist_id, 'pageNum': page_idx})['data'] def _get_entries(self, page_data): for e in page_data['tracks']: diff --git a/plugin/yt-dlp/yt_dlp/extractor/yappy.py b/plugin/yt-dlp/yt_dlp/extractor/yappy.py index b6cc980..2374153 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/yappy.py +++ b/plugin/yt-dlp/yt_dlp/extractor/yappy.py @@ -1,9 +1,10 @@ from .common import InfoExtractor from ..utils import ( + OnDemandPagedList, int_or_none, traverse_obj, unified_timestamp, - url_or_none + url_or_none, ) @@ -97,3 +98,30 @@ def _real_extract(self, url): 'categories': traverse_obj(media_data, ('categories', ..., 'name')) or None, 'repost_count': int_or_none(media_data.get('sharingCount')) } + + +class YappyProfileIE(InfoExtractor): + _VALID_URL = r'https?://yappy\.media/profile/(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://yappy.media/profile/59a0c8c485e5410b9c43474bf4c6a373', + 'info_dict': { + 'id': '59a0c8c485e5410b9c43474bf4c6a373', + }, + 'playlist_mincount': 527, + }] + + def _real_extract(self, url): + profile_id = self._match_id(url) + + def fetch_page(page_num): + 
page_num += 1 + videos = self._download_json( + f'https://yappy.media/api/video/list/{profile_id}?page={page_num}', + profile_id, f'Downloading profile page {page_num} JSON') + + for video in traverse_obj(videos, ('results', lambda _, v: v['uuid'])): + yield self.url_result( + f'https://yappy.media/video/{video["uuid"]}', YappyIE, + video['uuid'], video.get('description')) + + return self.playlist_result(OnDemandPagedList(fetch_page, 15), profile_id) diff --git a/plugin/yt-dlp/yt_dlp/extractor/youtube.py b/plugin/yt-dlp/yt_dlp/extractor/youtube.py index c8b4af6..ca969c5 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/youtube.py +++ b/plugin/yt-dlp/yt_dlp/extractor/youtube.py @@ -66,7 +66,6 @@ variadic, ) - STREAMING_DATA_CLIENT_NAME = '__yt_dlp_client' # any clients starting with _ cannot be explicitly requested by the user INNERTUBE_CLIENTS = { @@ -293,6 +292,7 @@ class BadgeType(enum.Enum): AVAILABILITY_PREMIUM = enum.auto() AVAILABILITY_SUBSCRIPTION = enum.auto() LIVE_NOW = enum.auto() + VERIFIED = enum.auto() class YoutubeBaseInfoExtractor(InfoExtractor): @@ -792,17 +792,26 @@ def _report_alerts(self, alerts, expected=True, fatal=True, only_once=False): def _extract_and_report_alerts(self, data, *args, **kwargs): return self._report_alerts(self._extract_alerts(data), *args, **kwargs) - def _extract_badges(self, renderer: dict): - privacy_icon_map = { + def _extract_badges(self, badge_list: list): + """ + Extract known BadgeType's from a list of badge renderers. 
+ @returns [{'type': BadgeType}] + """ + icon_type_map = { 'PRIVACY_UNLISTED': BadgeType.AVAILABILITY_UNLISTED, 'PRIVACY_PRIVATE': BadgeType.AVAILABILITY_PRIVATE, - 'PRIVACY_PUBLIC': BadgeType.AVAILABILITY_PUBLIC + 'PRIVACY_PUBLIC': BadgeType.AVAILABILITY_PUBLIC, + 'CHECK_CIRCLE_THICK': BadgeType.VERIFIED, + 'OFFICIAL_ARTIST_BADGE': BadgeType.VERIFIED, + 'CHECK': BadgeType.VERIFIED, } badge_style_map = { 'BADGE_STYLE_TYPE_MEMBERS_ONLY': BadgeType.AVAILABILITY_SUBSCRIPTION, 'BADGE_STYLE_TYPE_PREMIUM': BadgeType.AVAILABILITY_PREMIUM, - 'BADGE_STYLE_TYPE_LIVE_NOW': BadgeType.LIVE_NOW + 'BADGE_STYLE_TYPE_LIVE_NOW': BadgeType.LIVE_NOW, + 'BADGE_STYLE_TYPE_VERIFIED': BadgeType.VERIFIED, + 'BADGE_STYLE_TYPE_VERIFIED_ARTIST': BadgeType.VERIFIED, } label_map = { @@ -810,13 +819,15 @@ def _extract_badges(self, renderer: dict): 'private': BadgeType.AVAILABILITY_PRIVATE, 'members only': BadgeType.AVAILABILITY_SUBSCRIPTION, 'live': BadgeType.LIVE_NOW, - 'premium': BadgeType.AVAILABILITY_PREMIUM + 'premium': BadgeType.AVAILABILITY_PREMIUM, + 'verified': BadgeType.VERIFIED, + 'official artist channel': BadgeType.VERIFIED, } badges = [] - for badge in traverse_obj(renderer, ('badges', ..., 'metadataBadgeRenderer')): + for badge in traverse_obj(badge_list, (..., lambda key, _: re.search(r'[bB]adgeRenderer$', key))): badge_type = ( - privacy_icon_map.get(traverse_obj(badge, ('icon', 'iconType'), expected_type=str)) + icon_type_map.get(traverse_obj(badge, ('icon', 'iconType'), expected_type=str)) or badge_style_map.get(traverse_obj(badge, 'style')) ) if badge_type: @@ -824,11 +835,12 @@ def _extract_badges(self, renderer: dict): continue # fallback, won't work in some languages - label = traverse_obj(badge, 'label', expected_type=str, default='') + label = traverse_obj( + badge, 'label', ('accessibilityData', 'label'), 'tooltip', 'iconTooltip', get_all=False, expected_type=str, default='') for match, label_badge_type in label_map.items(): if match in label.lower(): - 
badges.append({'type': badge_type}) - continue + badges.append({'type': label_badge_type}) + break return badges @@ -894,9 +906,16 @@ def _extract_thumbnails(data, *path_list): def extract_relative_time(relative_time_text): """ Extracts a relative time from string and converts to dt object - e.g. 'streamed 6 days ago', '5 seconds ago (edited)', 'updated today' + e.g. 'streamed 6 days ago', '5 seconds ago (edited)', 'updated today', '8 yr ago' """ - mobj = re.search(r'(?P<start>today|yesterday|now)|(?P<time>\d+)\s*(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?\s*ago', relative_time_text) + + # XXX: this could be moved to a general function in utils.py + # The relative time text strings are roughly the same as what + # Javascript's Intl.RelativeTimeFormat function generates. + # See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Intl/RelativeTimeFormat + mobj = re.search( + r'(?P<start>today|yesterday|now)|(?P<time>\d+)\s*(?P<unit>sec(?:ond)?|s|min(?:ute)?|h(?:our|r)?|d(?:ay)?|w(?:eek|k)?|mo(?:nth)?|y(?:ear|r)?)s?\s*ago', + relative_time_text) if mobj: start = mobj.group('start') if start: @@ -1014,8 +1033,8 @@ def _extract_video(self, renderer): overlay_style = traverse_obj( renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'), get_all=False, expected_type=str) - badges = self._extract_badges(renderer) - + badges = self._extract_badges(traverse_obj(renderer, 'badges')) + owner_badges = self._extract_badges(traverse_obj(renderer, 'ownerBadges')) navigation_url = urljoin('https://www.youtube.com/', traverse_obj( renderer, ('navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'), expected_type=str)) or '' @@ -1039,6 +1058,13 @@ def _extract_video(self, renderer): else self._get_count({'simpleText': view_count_text})) view_count_field = 'concurrent_view_count' if live_status in ('is_live', 'is_upcoming') else 'view_count' + channel = (self._get_text(renderer, 'ownerText', 
'shortBylineText') + or self._get_text(reel_header_renderer, 'channelTitleText')) + + channel_handle = traverse_obj(renderer, ( + 'shortBylineText', 'runs', ..., 'navigationEndpoint', + (('commandMetadata', 'webCommandMetadata', 'url'), ('browseEndpoint', 'canonicalBaseUrl'))), + expected_type=self.handle_from_url, get_all=False) return { '_type': 'url', 'ie_key': YoutubeIE.ie_key(), @@ -1048,9 +1074,11 @@ def _extract_video(self, renderer): 'description': description, 'duration': duration, 'channel_id': channel_id, - 'channel': (self._get_text(renderer, 'ownerText', 'shortBylineText') - or self._get_text(reel_header_renderer, 'channelTitleText')), + 'channel': channel, 'channel_url': f'https://www.youtube.com/channel/{channel_id}' if channel_id else None, + 'uploader': channel, + 'uploader_id': channel_handle, + 'uploader_url': format_field(channel_handle, None, 'https://www.youtube.com/%s', default=None), 'thumbnails': self._extract_thumbnails(renderer, 'thumbnail'), 'timestamp': (self._parse_time_text(time_text) if self._configuration_arg('approximate_date', ie_key=YoutubeTabIE) @@ -1064,7 +1092,8 @@ def _extract_video(self, renderer): needs_subscription=self._has_badge(badges, BadgeType.AVAILABILITY_SUBSCRIPTION) or None, is_unlisted=self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED) or None), view_count_field: view_count, - 'live_status': live_status + 'live_status': live_status, + 'channel_is_verified': True if self._has_badge(owner_badges, BadgeType.VERIFIED) else None } @@ -1274,6 +1303,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Philipp Hagemeister', 'uploader_url': 'https://www.youtube.com/@PhilippHagemeister', 'uploader_id': '@PhilippHagemeister', + 'heatmap': 'count:100', } }, { @@ -1316,6 +1346,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Philipp Hagemeister', 'uploader_url': 'https://www.youtube.com/@PhilippHagemeister', 'uploader_id': '@PhilippHagemeister', + 'heatmap': 'count:100', }, 'params': { 
'skip_download': True, @@ -1399,6 +1430,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'The Witcher', 'uploader_url': 'https://www.youtube.com/@thewitcher', 'uploader_id': '@thewitcher', + 'comment_count': int, + 'channel_is_verified': True, + 'heatmap': 'count:100', }, }, { @@ -1427,6 +1461,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'FlyingKitty', 'uploader_url': 'https://www.youtube.com/@FlyingKitty900', 'uploader_id': '@FlyingKitty900', + 'comment_count': int, + 'channel_is_verified': True, }, }, { @@ -1560,6 +1596,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Olympics', 'uploader_url': 'https://www.youtube.com/@Olympics', 'uploader_id': '@Olympics', + 'channel_is_verified': True, }, 'params': { 'skip_download': 'requires avconv', @@ -1877,6 +1914,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Bernie Sanders', 'uploader_url': 'https://www.youtube.com/@BernieSanders', 'uploader_id': '@BernieSanders', + 'channel_is_verified': True, + 'heatmap': 'count:100', }, 'params': { 'skip_download': True, @@ -1938,6 +1977,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Vsauce', 'uploader_url': 'https://www.youtube.com/@Vsauce', 'uploader_id': '@Vsauce', + 'comment_count': int, + 'channel_is_verified': True, }, 'params': { 'skip_download': True, @@ -2130,6 +2171,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'kudvenkat', 'uploader_url': 'https://www.youtube.com/@Csharp-video-tutorialsBlogspot', 'uploader_id': '@Csharp-video-tutorialsBlogspot', + 'channel_is_verified': True, + 'heatmap': 'count:100', }, 'params': { 'skip_download': True, @@ -2210,6 +2253,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'CBS Mornings', 'uploader_url': 'https://www.youtube.com/@CBSMornings', 'uploader_id': '@CBSMornings', + 'comment_count': int, + 'channel_is_verified': True, } }, { @@ -2280,6 +2325,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'colinfurze', 'uploader_url': 
'https://www.youtube.com/@colinfurze', 'uploader_id': '@colinfurze', + 'comment_count': int, + 'channel_is_verified': True, + 'heatmap': 'count:100', }, 'params': { 'format': '17', # 3gp format available on android @@ -2325,6 +2373,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'SciShow', 'uploader_url': 'https://www.youtube.com/@SciShow', 'uploader_id': '@SciShow', + 'comment_count': int, + 'channel_is_verified': True, + 'heatmap': 'count:100', }, 'params': {'format': 'mhtml', 'skip_download': True} }, { # Ensure video upload_date is in UTC timezone (video was uploaded 1641170939) @@ -2353,6 +2404,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Leon Nguyen', 'uploader_url': 'https://www.youtube.com/@LeonNguyen', 'uploader_id': '@LeonNguyen', + 'heatmap': 'count:100', } }, { # Same video as above, but with --compat-opt no-youtube-prefer-utc-upload-date @@ -2381,6 +2433,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Leon Nguyen', 'uploader_url': 'https://www.youtube.com/@LeonNguyen', 'uploader_id': '@LeonNguyen', + 'heatmap': 'count:100', }, 'params': {'compat_opts': ['no-youtube-prefer-utc-upload-date']} }, { @@ -2411,6 +2464,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Quackity', 'uploader_id': '@Quackity', 'uploader_url': 'https://www.youtube.com/@Quackity', + 'comment_count': int, + 'channel_is_verified': True, + 'heatmap': 'count:100', } }, { # continuous livestream. Microformat upload date should be preferred. 
@@ -2577,6 +2633,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'MrBeast', 'uploader_url': 'https://www.youtube.com/@MrBeast', 'uploader_id': '@MrBeast', + 'comment_count': int, + 'channel_is_verified': True, + 'heatmap': 'count:100', }, 'params': {'extractor_args': {'youtube': {'player_client': ['ios']}}, 'format': '233-1'}, }, { @@ -2638,6 +2697,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'さなちゃんねる', 'uploader_url': 'https://www.youtube.com/@sana_natori', 'uploader_id': '@sana_natori', + 'channel_is_verified': True, + 'heatmap': 'count:100', }, }, { @@ -2667,6 +2728,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'thumbnail': r're:^https?://.*\.webp', 'channel_url': 'https://www.youtube.com/channel/UCxzC4EngIsMrPmbm6Nxvb-A', 'playable_in_embed': True, + 'comment_count': int, + 'channel_is_verified': True, + 'heatmap': 'count:100', }, 'params': { 'extractor_args': {'youtube': {'player_client': ['android'], 'player_skip': ['webpage']}}, @@ -2703,6 +2767,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Christopher Sykes', 'uploader_url': 'https://www.youtube.com/@ChristopherSykesDocumentaries', 'uploader_id': '@ChristopherSykesDocumentaries', + 'heatmap': 'count:100', }, 'params': { 'skip_download': True, @@ -2994,17 +3059,14 @@ def _parse_sig_js(self, jscode): r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', r'\bm=(?P<sig>[a-zA-Z0-9$]{2,})\(decodeURIComponent\(h\.s\)\)', r'\bc&&\(c=(?P<sig>[a-zA-Z0-9$]{2,})\(decodeURIComponent\(c\)\)', - r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)', - r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', + r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)(?:;[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\))?', 
r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', # Obsolete patterns - r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', + r'("|\')signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(', r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(', r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('), jscode, 'Initial JS player signature function name', group='sig') @@ -3248,42 +3310,65 @@ def _extract_chapters_from_engagement_panel(self, data, duration): chapter_time, chapter_title, duration) for contents in content_list)), []) + def _extract_heatmap_from_player_overlay(self, data): + content_list = traverse_obj(data, ( + 'playerOverlays', 'playerOverlayRenderer', 'decoratedPlayerBarRenderer', 'decoratedPlayerBarRenderer', 'playerBar', + 'multiMarkersPlayerBarRenderer', 'markersMap', ..., 'value', 'heatmap', 'heatmapRenderer', 'heatMarkers', {list})) + return next(filter(None, ( + traverse_obj(contents, (..., 'heatMarkerRenderer', { + 'start_time': ('timeRangeStartMillis', {functools.partial(float_or_none, scale=1000)}), + 'end_time': {lambda x: (x['timeRangeStartMillis'] + x['markerDurationMillis']) / 1000}, + 'value': ('heatMarkerIntensityScoreNormalized', {float_or_none}), + })) for contents in content_list)), None) + def _extract_comment(self, comment_renderer, parent=None): comment_id = comment_renderer.get('commentId') if not comment_id: return - text = self._get_text(comment_renderer, 'contentText') + info = { + 'id': comment_id, + 'text': self._get_text(comment_renderer, 
'contentText'), + 'like_count': self._get_count(comment_renderer, 'voteCount'), + 'author_id': traverse_obj(comment_renderer, ('authorEndpoint', 'browseEndpoint', 'browseId', {self.ucid_or_none})), + 'author': self._get_text(comment_renderer, 'authorText'), + 'author_thumbnail': traverse_obj(comment_renderer, ('authorThumbnail', 'thumbnails', -1, 'url', {url_or_none})), + 'parent': parent or 'root', + } # Timestamp is an estimate calculated from the current time and time_text time_text = self._get_text(comment_renderer, 'publishedTimeText') or '' timestamp = self._parse_time_text(time_text) - author = self._get_text(comment_renderer, 'authorText') - author_id = try_get(comment_renderer, - lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], str) + info.update({ + # FIXME: non-standard, but we need a way of showing that it is an estimate. + '_time_text': time_text, + 'timestamp': timestamp, + }) + + info['author_url'] = urljoin( + 'https://www.youtube.com', traverse_obj(comment_renderer, ('authorEndpoint', ( + ('browseEndpoint', 'canonicalBaseUrl'), ('commandMetadata', 'webCommandMetadata', 'url'))), + expected_type=str, get_all=False)) - votes = parse_count(try_get(comment_renderer, (lambda x: x['voteCount']['simpleText'], - lambda x: x['likeCount']), str)) or 0 - author_thumbnail = try_get(comment_renderer, - lambda x: x['authorThumbnail']['thumbnails'][-1]['url'], str) + author_is_uploader = traverse_obj(comment_renderer, 'authorIsChannelOwner') + if author_is_uploader is not None: + info['author_is_uploader'] = author_is_uploader - author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool) - is_favorited = 'creatorHeart' in (try_get( - comment_renderer, lambda x: x['actionButtons']['commentActionButtonsRenderer'], dict) or {}) - return { - 'id': comment_id, - 'text': text, - 'timestamp': timestamp, - 'time_text': time_text, - 'like_count': votes, - 'is_favorited': is_favorited, - 'author': author, - 'author_id': author_id, 
- 'author_thumbnail': author_thumbnail, - 'author_is_uploader': author_is_uploader, - 'parent': parent or 'root' - } + comment_abr = traverse_obj( + comment_renderer, ('actionsButtons', 'commentActionButtonsRenderer'), expected_type=dict) + if comment_abr is not None: + info['is_favorited'] = 'creatorHeart' in comment_abr + + badges = self._extract_badges([traverse_obj(comment_renderer, 'authorCommentBadge')]) + if self._has_badge(badges, BadgeType.VERIFIED): + info['author_is_verified'] = True + + is_pinned = traverse_obj(comment_renderer, 'pinnedCommentBadge') + if is_pinned: + info['is_pinned'] = True + + return info def _comment_entries(self, root_continuation_data, ytcfg, video_id, parent=None, tracker=None): @@ -3296,7 +3381,7 @@ def extract_header(contents): expected_comment_count = self._get_count( comments_header_renderer, 'countText', 'commentsCount') - if expected_comment_count: + if expected_comment_count is not None: tracker['est_total'] = expected_comment_count self.to_screen(f'Downloading ~{expected_comment_count} comments') comment_sort_index = int(get_single_config_arg('comment_sort') != 'top') # 1 = new, 0 = top @@ -3331,14 +3416,13 @@ def extract_thread(contents): comment = self._extract_comment(comment_renderer, parent) if not comment: continue - is_pinned = bool(traverse_obj(comment_renderer, 'pinnedCommentBadge')) comment_id = comment['id'] - if is_pinned: + if comment.get('is_pinned'): tracker['pinned_comment_ids'].add(comment_id) # Sometimes YouTube may break and give us infinite looping comments. 
# See: https://github.com/yt-dlp/yt-dlp/issues/6290 if comment_id in tracker['seen_comment_ids']: - if comment_id in tracker['pinned_comment_ids'] and not is_pinned: + if comment_id in tracker['pinned_comment_ids'] and not comment.get('is_pinned'): # Pinned comments may appear a second time in newest first sort # See: https://github.com/yt-dlp/yt-dlp/issues/6712 continue @@ -3367,7 +3451,7 @@ def extract_thread(contents): if not tracker: tracker = dict( running_total=0, - est_total=0, + est_total=None, current_page_thread=0, total_parent_comments=0, total_reply_comments=0, @@ -3400,11 +3484,13 @@ def extract_thread(contents): continuation = self._build_api_continuation_query(self._generate_comment_continuation(video_id)) is_forced_continuation = True + continuation_items_path = ( + 'onResponseReceivedEndpoints', ..., ('reloadContinuationItemsCommand', 'appendContinuationItemsAction'), 'continuationItems') for page_num in itertools.count(0): if not continuation: break headers = self.generate_api_headers(ytcfg=ytcfg, visitor_data=self._extract_visitor_data(response)) - comment_prog_str = f"({tracker['running_total']}/{tracker['est_total']})" + comment_prog_str = f"({tracker['running_total']}/~{tracker['est_total']})" if page_num == 0: if is_first_continuation: note_prefix = 'Downloading comment section API JSON' @@ -3415,11 +3501,18 @@ def extract_thread(contents): note_prefix = '%sDownloading comment%s API JSON page %d %s' % ( ' ' if parent else '', ' replies' if parent else '', page_num, comment_prog_str) + + # Do a deep check for incomplete data as sometimes YouTube may return no comments for a continuation + # Ignore check if YouTube says the comment count is 0. 
+ check_get_keys = None + if not is_forced_continuation and not (tracker['est_total'] == 0 and tracker['running_total'] == 0): + check_get_keys = [[*continuation_items_path, ..., ( + 'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentRenderer'))]] try: response = self._extract_response( item_id=None, query=continuation, ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix, - check_get_keys='onResponseReceivedEndpoints' if not is_forced_continuation else None) + check_get_keys=check_get_keys) except ExtractorError as e: # Ignore incomplete data error for replies if retries didn't work. # This is to allow any other parent comments and comment threads to be downloaded. @@ -3431,15 +3524,8 @@ def extract_thread(contents): else: raise is_forced_continuation = False - continuation_contents = traverse_obj( - response, 'onResponseReceivedEndpoints', expected_type=list, default=[]) - continuation = None - for continuation_section in continuation_contents: - continuation_items = traverse_obj( - continuation_section, - (('reloadContinuationItemsCommand', 'appendContinuationItemsAction'), 'continuationItems'), - get_all=False, expected_type=list) or [] + for continuation_items in traverse_obj(response, continuation_items_path, expected_type=list, default=[]): if is_first_continuation: continuation = extract_header(continuation_items) is_first_continuation = False @@ -3513,7 +3599,7 @@ def _is_agegated(player_response): def _is_unplayable(player_response): return traverse_obj(player_response, ('playabilityStatus', 'status')) == 'UNPLAYABLE' - _STORY_PLAYER_PARAMS = '8AEB' + _PLAYER_PARAMS = 'CgIQBg==' def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr, smuggled_data): @@ -3527,7 +3613,7 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, 'videoId': video_id, } if smuggled_data.get('is_story') or _split_innertube_client(client)[0] == 'android': - 
yt_query['params'] = self._STORY_PLAYER_PARAMS + yt_query['params'] = self._PLAYER_PARAMS yt_query.update(self._generate_player_context(sts)) return self._extract_response( @@ -3762,6 +3848,8 @@ def build_fragments(f): f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True) client_name = fmt.get(STREAMING_DATA_CLIENT_NAME) + name = fmt.get('qualityLabel') or quality.replace('audio_quality_', '') or '' + fps = int_or_none(fmt.get('fps')) or 0 dct = { 'asr': int_or_none(fmt.get('audioSampleRate')), 'filesize': int_or_none(fmt.get('contentLength')), @@ -3769,16 +3857,16 @@ def build_fragments(f): 'format_note': join_nonempty( join_nonempty(audio_track.get('displayName'), language_preference > 0 and ' (default)', delim=''), - fmt.get('qualityLabel') or quality.replace('audio_quality_', ''), - fmt.get('isDrc') and 'DRC', + name, fmt.get('isDrc') and 'DRC', try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()), try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()), throttled and 'THROTTLED', is_damaged and 'DAMAGED', (self.get_param('verbose') or all_formats) and client_name, delim=', '), # Format 22 is likely to be damaged. 
See https://github.com/yt-dlp/yt-dlp/issues/3372 - 'source_preference': -10 if throttled else -5 if itag == '22' else -1, - 'fps': int_or_none(fmt.get('fps')) or None, + 'source_preference': ((-10 if throttled else -5 if itag == '22' else -1) + + (100 if 'Premium' in name else 0)), + 'fps': fps if fps > 1 else None, # For some formats, fps is wrongly returned as 1 'audio_channels': fmt.get('audioChannels'), 'height': height, 'quality': q(quality) - bool(fmt.get('isDrc')) / 2, @@ -3847,8 +3935,10 @@ def process_manifest_format(f, proto, client_name, itag): f['quality'] = q(itag_qualities.get(try_get(f, lambda f: f['format_id'].split('-')[0]), -1)) if f['quality'] == -1 and f.get('height'): f['quality'] = q(res_qualities[min(res_qualities, key=lambda x: abs(x - f['height']))]) - if self.get_param('verbose'): + if self.get_param('verbose') or all_formats: f['format_note'] = join_nonempty(f.get('format_note'), client_name, delim=', ') + if f.get('fps') and f['fps'] <= 1: + del f['fps'] return True subtitles = {} @@ -3921,8 +4011,8 @@ def _download_player_responses(self, url, smuggled_data, video_id, webpage_url): webpage = None if 'webpage' not in self._configuration_arg('player_skip'): query = {'bpctr': '9999999999', 'has_verified': '1'} - if smuggled_data.get('is_story'): - query['pp'] = self._STORY_PLAYER_PARAMS + if smuggled_data.get('is_story'): # XXX: Deprecated + query['pp'] = self._PLAYER_PARAMS webpage = self._download_webpage( webpage_url, video_id, fatal=False, query=query) @@ -4226,9 +4316,13 @@ def process_language(container, base_url, lang_code, sub_name, query): continue trans_code += f'-{lang_code}' trans_name += format_field(lang_name, None, ' from %s') - # Add an "-orig" label to the original language so that it can be distinguished. 
- # The subs are returned without "-orig" as well for compatibility if lang_code == f'a-{orig_trans_code}': + # Set audio language based on original subtitles + for f in formats: + if f.get('acodec') != 'none' and not f.get('language'): + f['language'] = orig_trans_code + # Add an "-orig" label to the original language so that it can be distinguished. + # The subs are returned without "-orig" as well for compatibility process_language( automatic_captions, base_url, f'{trans_code}-orig', f'{trans_name} (Original)', {}) # Setting tlang=lang returns damaged subtitles. @@ -4317,6 +4411,8 @@ def process_language(container, base_url, lang_code, sub_name, query): or self._extract_chapters_from_description(video_description, duration) or None) + info['heatmap'] = self._extract_heatmap_from_player_overlay(initial_data) + contents = traverse_obj( initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents'), expected_type=list, default=[]) @@ -4415,6 +4511,9 @@ def process_language(container, base_url, lang_code, sub_name, query): info['artist'] = mrr_contents_text elif mrr_title == 'Song': info['track'] = mrr_contents_text + owner_badges = self._extract_badges(traverse_obj(vsir, ('owner', 'videoOwnerRenderer', 'badges'))) + if self._has_badge(owner_badges, BadgeType.VERIFIED): + info['channel_is_verified'] = True info.update({ 'uploader': info.get('channel'), @@ -4432,7 +4531,7 @@ def process_language(container, base_url, lang_code, sub_name, query): and 'no-youtube-prefer-utc-upload-date' not in self.get_param('compat_opts', []) ): upload_date = strftime_or_none( - self._parse_time_text(self._get_text(vpir, 'dateText')), '%Y%m%d') or upload_date + self._parse_time_text(self._get_text(vpir, 'dateText'))) or upload_date info['upload_date'] = upload_date for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]: @@ -4440,7 +4539,7 @@ def process_language(container, base_url, lang_code, sub_name, query): if v: info[d_k] = v - badges = 
self._extract_badges(traverse_obj(contents, (..., 'videoPrimaryInfoRenderer'), get_all=False)) + badges = self._extract_badges(traverse_obj(vpir, 'badges')) is_private = (self._has_badge(badges, BadgeType.AVAILABILITY_PRIVATE) or get_first(video_details, 'isPrivate', expected_type=bool)) @@ -4513,13 +4612,14 @@ def _extract_channel_renderer(self, renderer): channel_id = self.ucid_or_none(renderer['channelId']) title = self._get_text(renderer, 'title') channel_url = format_field(channel_id, None, 'https://www.youtube.com/channel/%s', default=None) - # As of 2023-03-01 YouTube doesn't use the channel handles on these renderers yet. - # However we can expect them to change that in the future. channel_handle = self.handle_from_url( traverse_obj(renderer, ( 'navigationEndpoint', (('commandMetadata', 'webCommandMetadata', 'url'), ('browseEndpoint', 'canonicalBaseUrl')), {str}), get_all=False)) + if not channel_handle: + # As of 2023-06-01, YouTube sets subscriberCountText to the handle in search + channel_handle = self.handle_or_none(self._get_text(renderer, 'subscriberCountText')) return { '_type': 'url', 'url': channel_url, @@ -4532,10 +4632,18 @@ def _extract_channel_renderer(self, renderer): 'title': title, 'uploader_id': channel_handle, 'uploader_url': format_field(channel_handle, None, 'https://www.youtube.com/%s', default=None), - 'channel_follower_count': self._get_count(renderer, 'subscriberCountText'), + # See above. YouTube sets videoCountText to the subscriber text in search channel renderers. 
+ # However, in feed/channels this is set correctly to the subscriber count + 'channel_follower_count': traverse_obj( + renderer, 'subscriberCountText', 'videoCountText', expected_type=self._get_count), 'thumbnails': self._extract_thumbnails(renderer, 'thumbnail'), - 'playlist_count': self._get_count(renderer, 'videoCountText'), + 'playlist_count': ( + # videoCountText may be the subscriber count + self._get_count(renderer, 'videoCountText') + if self._get_count(renderer, 'subscriberCountText') is not None else None), 'description': self._get_text(renderer, 'descriptionSnippet'), + 'channel_is_verified': True if self._has_badge( + self._extract_badges(traverse_obj(renderer, 'ownerBadges')), BadgeType.VERIFIED) else None, } def _grid_entries(self, grid_renderer): @@ -4579,8 +4687,11 @@ def _grid_entries(self, grid_renderer): def _music_reponsive_list_entry(self, renderer): video_id = traverse_obj(renderer, ('playlistItemData', 'videoId')) if video_id: + title = traverse_obj(renderer, ( + 'flexColumns', 0, 'musicResponsiveListItemFlexColumnRenderer', + 'text', 'runs', 0, 'text')) return self.url_result(f'https://music.youtube.com/watch?v={video_id}', - ie=YoutubeIE.ie_key(), video_id=video_id) + ie=YoutubeIE.ie_key(), video_id=video_id, title=title) playlist_id = traverse_obj(renderer, ('navigationEndpoint', 'watchEndpoint', 'playlistId')) if playlist_id: video_id = traverse_obj(renderer, ('navigationEndpoint', 'watchEndpoint', 'videoId')) @@ -4639,11 +4750,19 @@ def _playlist_entries(self, video_list_renderer): def _rich_entries(self, rich_grid_renderer): renderer = traverse_obj( - rich_grid_renderer, ('content', ('videoRenderer', 'reelItemRenderer')), get_all=False) or {} + rich_grid_renderer, + ('content', ('videoRenderer', 'reelItemRenderer', 'playlistRenderer')), get_all=False) or {} video_id = renderer.get('videoId') - if not video_id: + if video_id: + yield self._extract_video(renderer) + return + playlist_id = renderer.get('playlistId') + if playlist_id: + 
yield self.url_result( + f'https://www.youtube.com/playlist?list={playlist_id}', + ie=YoutubeTabIE.ie_key(), video_id=playlist_id, + video_title=self._get_text(renderer, 'title')) return - yield self._extract_video(renderer) def _video_entry(self, video_renderer): video_id = video_renderer.get('videoId') @@ -4872,7 +4991,7 @@ def _extract_metadata_from_tabs(self, item_id, data): metadata_renderer = traverse_obj(data, ('metadata', 'channelMetadataRenderer'), expected_type=dict) if metadata_renderer: channel_id = traverse_obj(metadata_renderer, ('externalId', {self.ucid_or_none}), - ('channelUrl', {self.ucid_from_url})) + ('channelUrl', {self.ucid_from_url})) info.update({ 'channel': metadata_renderer.get('title'), 'channel_id': channel_id, @@ -4940,6 +5059,10 @@ def _get_uncropped(url): 'uploader_id': channel_handle, 'uploader_url': format_field(channel_handle, None, 'https://www.youtube.com/%s', default=None), }) + + channel_badges = self._extract_badges(traverse_obj(data, ('header', ..., 'badges'), get_all=False)) + if self._has_badge(channel_badges, BadgeType.VERIFIED): + info['channel_is_verified'] = True # Playlist stats is a text runs array containing [video count, view count, last updated]. # last updated or (view count and last updated) may be missing. 
playlist_stats = get_first( @@ -4948,7 +5071,7 @@ def _get_uncropped(url): last_updated_unix = self._parse_time_text( self._get_text(playlist_stats, 2) # deprecated, remove when old layout discontinued or self._get_text(playlist_header_renderer, ('byline', 1, 'playlistBylineRenderer', 'text'))) - info['modified_date'] = strftime_or_none(last_updated_unix, '%Y%m%d') + info['modified_date'] = strftime_or_none(last_updated_unix) info['view_count'] = self._get_count(playlist_stats, 1) if info['view_count'] is None: # 0 is allowed @@ -5048,7 +5171,7 @@ def _extract_availability(self, data): playlist_header_renderer = traverse_obj(data, ('header', 'playlistHeaderRenderer')) or {} player_header_privacy = playlist_header_renderer.get('privacy') - badges = self._extract_badges(sidebar_renderer) + badges = self._extract_badges(traverse_obj(sidebar_renderer, 'badges')) # Personal playlists, when authenticated, have a dropdown visibility selector instead of a badge privacy_setting_icon = get_first( @@ -5298,7 +5421,8 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@3blue1brown', 'uploader': '3Blue1Brown', 'tags': ['Mathematics'], - 'channel_follower_count': int + 'channel_follower_count': int, + 'channel_is_verified': True, }, }, { 'note': 'playlists, singlepage', @@ -5475,6 +5599,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@3blue1brown', 'uploader_id': '@3blue1brown', 'uploader': '3Blue1Brown', + 'channel_is_verified': True, }, }, { 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA', @@ -5638,7 +5763,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, { 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live', 'info_dict': { - 'id': 'AlTsmyW4auo', # This will keep changing + 'id': 'hGkQjiJLjWQ', # This will keep changing 'ext': 'mp4', 'title': str, 'upload_date': r're:\d{8}', @@ -5662,6 +5787,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 
'uploader_url': 'https://www.youtube.com/@SkyNews', 'uploader_id': '@SkyNews', 'uploader': 'Sky News', + 'channel_is_verified': True, }, 'params': { 'skip_download': True, @@ -5829,7 +5955,25 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader_id': '@colethedj1894', 'uploader': 'colethedj', }, + 'playlist': [{ + 'info_dict': { + 'title': 'youtube-dl test video "\'/\\ä↭𝕐', + 'id': 'BaW_jenozKc', + '_type': 'url', + 'ie_key': 'Youtube', + 'duration': 10, + 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', + 'channel_url': 'https://www.youtube.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', + 'view_count': int, + 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc', + 'channel': 'Philipp Hagemeister', + 'uploader_id': '@PhilippHagemeister', + 'uploader_url': 'https://www.youtube.com/@PhilippHagemeister', + 'uploader': 'Philipp Hagemeister', + } + }], 'playlist_count': 1, + 'params': {'extract_flat': True}, }, { 'note': 'API Fallback: Recommended - redirects to home page. Requires visitorData', 'url': 'https://www.youtube.com/feed/recommended', @@ -6130,6 +6274,10 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_url': str, 'concurrent_view_count': int, 'channel': str, + 'uploader': str, + 'uploader_url': str, + 'uploader_id': str, + 'channel_is_verified': bool, # this will keep changing } }], 'params': {'extract_flat': True, 'playlist_items': '1'}, @@ -6165,6 +6313,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader': 'PewDiePie', 'uploader_url': 'https://www.youtube.com/@PewDiePie', 'uploader_id': '@PewDiePie', + 'channel_is_verified': True, } }], 'params': {'extract_flat': True}, @@ -6183,8 +6332,44 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@3blue1brown', 'uploader_id': '@3blue1brown', 'uploader': '3Blue1Brown', + 'channel_is_verified': True, }, 'playlist_count': 0, + }, { + # Podcasts tab, with rich entry playlistRenderers + 'url': 'https://www.youtube.com/@99percentinvisiblepodcast/podcasts', + 
'info_dict': { + 'id': 'UCVMF2HD4ZgC0QHpU9Yq5Xrw', + 'channel_id': 'UCVMF2HD4ZgC0QHpU9Yq5Xrw', + 'uploader_url': 'https://www.youtube.com/@99percentinvisiblepodcast', + 'description': 'md5:3a0ed38f1ad42a68ef0428c04a15695c', + 'title': '99 Percent Invisible - Podcasts', + 'uploader': '99 Percent Invisible', + 'channel_follower_count': int, + 'channel_url': 'https://www.youtube.com/channel/UCVMF2HD4ZgC0QHpU9Yq5Xrw', + 'tags': [], + 'channel': '99 Percent Invisible', + 'uploader_id': '@99percentinvisiblepodcast', + }, + 'playlist_count': 1, + }, { + # Releases tab, with rich entry playlistRenderers (same as Podcasts tab) + 'url': 'https://www.youtube.com/@AHimitsu/releases', + 'info_dict': { + 'id': 'UCgFwu-j5-xNJml2FtTrrB3A', + 'channel': 'A Himitsu', + 'uploader_url': 'https://www.youtube.com/@AHimitsu', + 'title': 'A Himitsu - Releases', + 'uploader_id': '@AHimitsu', + 'uploader': 'A Himitsu', + 'channel_id': 'UCgFwu-j5-xNJml2FtTrrB3A', + 'tags': 'count:16', + 'description': 'I make music', + 'channel_url': 'https://www.youtube.com/channel/UCgFwu-j5-xNJml2FtTrrB3A', + 'channel_follower_count': int, + 'channel_is_verified': True, + }, + 'playlist_mincount': 10, }] @classmethod @@ -6758,12 +6943,15 @@ class YoutubeSearchURLIE(YoutubeTabBaseInfoExtractor): 'description': 'md5:4ae48dfa9505ffc307dad26342d06bfc', 'title': 'Kurzgesagt – In a Nutshell', 'channel_id': 'UCsXVk37bltHxD1rDPwtNM8Q', - 'playlist_count': int, # XXX: should have a way of saying > 1 + # No longer available for search as it is set to the handle. 
+ # 'playlist_count': int, 'channel_url': 'https://www.youtube.com/channel/UCsXVk37bltHxD1rDPwtNM8Q', 'thumbnails': list, 'uploader_id': '@kurzgesagt', 'uploader_url': 'https://www.youtube.com/@kurzgesagt', 'uploader': 'Kurzgesagt – In a Nutshell', + 'channel_is_verified': True, + 'channel_follower_count': int, } }], 'params': {'extract_flat': True, 'playlist_items': '1'}, @@ -7027,6 +7215,8 @@ class YoutubeClipIE(YoutubeTabBaseInfoExtractor): 'live_status': 'not_live', 'channel_follower_count': int, 'chapters': 'count:20', + 'comment_count': int, + 'heatmap': 'count:100', } }] @@ -7087,6 +7277,8 @@ class YoutubeConsentRedirectIE(YoutubeBaseInfoExtractor): 'channel': 'さなちゃんねる', 'description': 'md5:6aebf95cc4a1d731aebc01ad6cc9806d', 'uploader': 'さなちゃんねる', + 'channel_is_verified': True, + 'heatmap': 'count:100', }, 'add_ie': ['Youtube'], 'params': {'skip_download': 'Youtube'}, diff --git a/plugin/yt-dlp/yt_dlp/extractor/zaiko.py b/plugin/yt-dlp/yt_dlp/extractor/zaiko.py new file mode 100644 index 0000000..84cee44 --- /dev/null +++ b/plugin/yt-dlp/yt_dlp/extractor/zaiko.py @@ -0,0 +1,130 @@ +import base64 + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + extract_attributes, + int_or_none, + str_or_none, + traverse_obj, + try_call, + unescapeHTML, + url_or_none, +) + + +class ZaikoBaseIE(InfoExtractor): + def _download_real_webpage(self, url, video_id): + webpage, urlh = self._download_webpage_handle(url, video_id) + final_url = urlh.geturl() + if 'zaiko.io/login' in final_url: + self.raise_login_required() + elif '/_buy/' in final_url: + raise ExtractorError('Your account does not have tickets to this event', expected=True) + return webpage + + def _parse_vue_element_attr(self, name, string, video_id): + page_elem = self._search_regex(rf'(<{name}[^>]+>)', string, name) + attrs = {} + for key, value in extract_attributes(page_elem).items(): + if key.startswith(':'): + attrs[key[1:]] = self._parse_json( + value, video_id, 
transform_source=unescapeHTML, fatal=False) + return attrs + + +class ZaikoIE(ZaikoBaseIE): + _VALID_URL = r'https?://(?:[\w-]+\.)?zaiko\.io/event/(?P<id>\d+)/stream(?:/\d+)+' + _TESTS = [{ + 'url': 'https://zaiko.io/event/324868/stream/20571/20571', + 'info_dict': { + 'id': '324868', + 'ext': 'mp4', + 'title': 'ZAIKO STREAMING TEST', + 'alt_title': '[VOD] ZAIKO STREAMING TEST_20210603(Do Not Delete)', + 'uploader_id': '454', + 'uploader': 'ZAIKO ZERO', + 'release_timestamp': 1583809200, + 'thumbnail': r're:https://[a-z0-9]+.cloudfront.net/[a-z0-9_]+/[a-z0-9_]+', + 'release_date': '20200310', + 'categories': ['Tech House'], + 'live_status': 'was_live', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_real_webpage(url, video_id) + stream_meta = self._parse_vue_element_attr('stream-page', webpage, video_id) + + player_page = self._download_webpage( + stream_meta['stream-access']['video_source'], video_id, + 'Downloading player page', headers={'referer': 'https://zaiko.io/'}) + player_meta = self._parse_vue_element_attr('player', player_page, video_id) + status = traverse_obj(player_meta, ('initial_event_info', 'status', {str})) + live_status, msg, expected = { + 'vod': ('was_live', 'No VOD stream URL was found', False), + 'archiving': ('post_live', 'Event VOD is still being processed', True), + 'deleting': ('post_live', 'This event has ended', True), + 'deleted': ('post_live', 'This event has ended', True), + 'error': ('post_live', 'This event has ended', True), + 'disconnected': ('post_live', 'Stream has been disconnected', True), + 'live_to_disconnected': ('post_live', 'Stream has been disconnected', True), + 'live': ('is_live', 'No livestream URL found was found', False), + 'waiting': ('is_upcoming', 'Live event has not yet started', True), + 'cancelled': ('not_live', 'Event has been cancelled', True), + }.get(status) or ('not_live', f'Unknown event status "{status}"', 
False) + + stream_url = traverse_obj(player_meta, ('initial_event_info', 'endpoint', {url_or_none})) + formats = self._extract_m3u8_formats( + stream_url, video_id, live=True, fatal=False) if stream_url else [] + if not formats: + self.raise_no_formats(msg, expected=expected) + + return { + 'id': video_id, + 'formats': formats, + 'live_status': live_status, + **traverse_obj(stream_meta, { + 'title': ('event', 'name', {str}), + 'uploader': ('profile', 'name', {str}), + 'uploader_id': ('profile', 'id', {str_or_none}), + 'release_timestamp': ('stream', 'start', 'timestamp', {int_or_none}), + 'categories': ('event', 'genres', ..., {lambda x: x or None}), + }), + **traverse_obj(player_meta, ('initial_event_info', { + 'alt_title': ('title', {str}), + 'thumbnail': ('poster_url', {url_or_none}), + })), + } + + +class ZaikoETicketIE(ZaikoBaseIE): + _VALID_URL = r'https?://(?:www.)?zaiko\.io/account/eticket/(?P<id>[\w=-]{49})' + _TESTS = [{ + 'url': 'https://zaiko.io/account/eticket/TZjMwMzQ2Y2EzMXwyMDIzMDYwNzEyMTMyNXw1MDViOWU2Mw==', + 'playlist_count': 1, + 'info_dict': { + 'id': 'f30346ca31-20230607121325-505b9e63', + 'title': 'ZAIKO STREAMING TEST', + 'thumbnail': 'https://media.zkocdn.net/pf_1/1_3wdyjcjyupseatkwid34u', + }, + 'skip': 'Only available with the ticketholding account', + }] + + def _real_extract(self, url): + ticket_id = self._match_id(url) + ticket_id = try_call( + lambda: base64.urlsafe_b64decode(ticket_id[1:]).decode().replace('|', '-')) or ticket_id + + webpage = self._download_real_webpage(url, ticket_id) + eticket = self._parse_vue_element_attr('eticket', webpage, ticket_id) + + return self.playlist_result( + [self.url_result(stream, ZaikoIE) for stream in traverse_obj(eticket, ('streams', ..., 'url'))], + ticket_id, **traverse_obj(eticket, ('ticket-details', { + 'title': 'event_name', + 'thumbnail': 'event_img_url', + }))) diff --git a/plugin/yt-dlp/yt_dlp/extractor/zdf.py b/plugin/yt-dlp/yt_dlp/extractor/zdf.py index 1de1bd4..9f8f6d9 100644 --- 
a/plugin/yt-dlp/yt_dlp/extractor/zdf.py +++ b/plugin/yt-dlp/yt_dlp/extractor/zdf.py @@ -24,7 +24,7 @@ class ZDFBaseIE(InfoExtractor): _GEO_COUNTRIES = ['DE'] - _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd', 'uhd') + _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd', 'fhd', 'uhd') def _call_api(self, url, video_id, item, api_token=None, referrer=None): headers = {} @@ -61,6 +61,9 @@ def _extract_format(self, video_id, formats, format_urls, meta): elif mime_type == 'application/f4m+xml' or ext == 'f4m': new_formats = self._extract_f4m_formats( update_url_query(format_url, {'hdcore': '3.7.0'}), video_id, f4m_id='hds', fatal=False) + elif ext == 'mpd': + new_formats = self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False) else: f = parse_codecs(meta.get('mimeCodec')) if not f and meta.get('type'): diff --git a/plugin/yt-dlp/yt_dlp/extractor/zee5.py b/plugin/yt-dlp/yt_dlp/extractor/zee5.py index 02c29f5..31298ed 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/zee5.py +++ b/plugin/yt-dlp/yt_dlp/extractor/zee5.py @@ -1,14 +1,16 @@ import json -import random -import string +import time +import uuid from .common import InfoExtractor from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, + jwt_decode_hs256, parse_age_limit, str_or_none, + try_call, try_get, unified_strdate, unified_timestamp, @@ -94,12 +96,12 @@ class Zee5IE(InfoExtractor): 'url': 'https://www.zee5.com/music-videos/details/adhento-gaani-vunnapaatuga-jersey-nani-shraddha-srinath/0-0-56973', 'only_matching': True }] - _DETAIL_API_URL = 'https://spapi.zee5.com/singlePlayback/getDetails/secure?content_id={}&device_id={}&platform_name=desktop_web&country=IN&check_parental_control=false' - _DEVICE_ID = ''.join(random.choices(string.ascii_letters + string.digits, k=20)).ljust(32, '0') + _DEVICE_ID = str(uuid.uuid4()) _USER_TOKEN = None _LOGIN_HINT = 'Use "--username <mobile_number>" to login using otp or "--username token" and "--password 
<user_token>" to login using user token.' _NETRC_MACHINE = 'zee5' _GEO_COUNTRIES = ['IN'] + _USER_COUNTRY = None def _perform_login(self, username, password): if len(username) == 10 and username.isdigit() and self._USER_TOKEN is None: @@ -118,11 +120,16 @@ def _perform_login(self, username, password): self._USER_TOKEN = otp_verify_json.get('token') if not self._USER_TOKEN: raise ExtractorError(otp_request_json['message'], expected=True) - elif username.lower() == 'token' and len(password) > 1198: + elif username.lower() == 'token' and try_call(lambda: jwt_decode_hs256(password)): self._USER_TOKEN = password else: raise ExtractorError(self._LOGIN_HINT, expected=True) + token = jwt_decode_hs256(self._USER_TOKEN) + if token.get('exp', 0) <= int(time.time()): + raise ExtractorError('User token has expired', expected=True) + self._USER_COUNTRY = token.get('current_country') + def _real_extract(self, url): video_id, display_id = self._match_valid_url(url).group('id', 'display_id') access_token_request = self._download_json( @@ -137,8 +144,13 @@ def _real_extract(self, url): data['X-Z5-Guest-Token'] = self._DEVICE_ID json_data = self._download_json( - self._DETAIL_API_URL.format(video_id, self._DEVICE_ID), - video_id, headers={'content-type': 'application/json'}, data=json.dumps(data).encode('utf-8')) + 'https://spapi.zee5.com/singlePlayback/getDetails/secure', video_id, query={ + 'content_id': video_id, + 'device_id': self._DEVICE_ID, + 'platform_name': 'desktop_web', + 'country': self._USER_COUNTRY or self.get_param('geo_bypass_country') or 'IN', + 'check_parental_control': False, + }, headers={'content-type': 'application/json'}, data=json.dumps(data).encode('utf-8')) asset_data = json_data['assetDetails'] show_data = json_data.get('showDetails', {}) if 'premium' in asset_data['business_type']: diff --git a/plugin/yt-dlp/yt_dlp/extractor/zingmp3.py b/plugin/yt-dlp/yt_dlp/extractor/zingmp3.py index d80ec99..082cf18 100644 --- a/plugin/yt-dlp/yt_dlp/extractor/zingmp3.py 
+++ b/plugin/yt-dlp/yt_dlp/extractor/zingmp3.py @@ -1,16 +1,11 @@ -import functools import hashlib import hmac +import itertools import json import urllib.parse from .common import InfoExtractor -from ..utils import ( - OnDemandPagedList, - int_or_none, - traverse_obj, - urljoin, -) +from ..utils import int_or_none, traverse_obj, try_call, urljoin class ZingMp3BaseIE(InfoExtractor): @@ -37,6 +32,7 @@ class ZingMp3BaseIE(InfoExtractor): 'info-artist': '/api/v2/page/get/artist', 'user-list-song': '/api/v2/song/get/list', 'user-list-video': '/api/v2/video/get/list', + 'hub': '/api/v2/page/get/hub-detail', } def _api_url(self, url_type, params): @@ -46,9 +42,9 @@ def _api_url(self, url_type, params): ''.join(f'{k}={v}' for k, v in sorted(params.items())).encode()).hexdigest() data = { **params, - 'apiKey': '88265e23d4284f25963e6eedac8fbfa3', - 'sig': hmac.new( - b'2aa2d1c561e809b267f3638c4a307aab', f'{api_slug}{sha256}'.encode(), hashlib.sha512).hexdigest(), + 'apiKey': 'X5BM3w8N7MKozC0B85o4KMlzLZKhV00y', + 'sig': hmac.new(b'acOrvUS15XRW2o9JksiK1KgQ6Vbds8ZW', + f'{api_slug}{sha256}'.encode(), hashlib.sha512).hexdigest(), } return f'{self._DOMAIN}{api_slug}?{urllib.parse.urlencode(data)}' @@ -67,6 +63,19 @@ def _parse_items(self, items): for url in traverse_obj(items, (..., 'link')) or []: yield self.url_result(urljoin(self._DOMAIN, url)) + def _fetch_page(self, id_, url_type, page): + raise NotImplementedError('This method must be implemented by subclasses') + + def _paged_list(self, _id, url_type): + count = 0 + for page in itertools.count(1): + data = self._fetch_page(_id, url_type, page) + entries = list(self._parse_items(data.get('items'))) + count += len(entries) + yield from entries + if not data.get('hasMore') or try_call(lambda: count > data['total']): + break + class ZingMp3IE(ZingMp3BaseIE): _VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'bai-hat|video-clip|embed' @@ -166,8 +175,11 @@ def _real_extract(self, url): 'height': int_or_none(res), }) - if not formats 
and item.get('msg') == 'Sorry, this content is not available in your country.': - self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True) + if not formats: + if item.get('msg') == 'Sorry, this content is not available in your country.': + self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True) + else: + self.raise_no_formats('The song is only for VIP accounts.') lyric = item.get('lyric') or self._call_api('lyric', {'id': item_id}, fatal=False).get('file') @@ -200,7 +212,7 @@ class ZingMp3AlbumIE(ZingMp3BaseIE): 'id': 'ZWZAEZZD', 'title': 'Những Bài Hát Hay Nhất Của Mr. Siro', }, - 'playlist_mincount': 49, + 'playlist_mincount': 20, }, { 'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html', 'only_matching': True, @@ -305,22 +317,20 @@ class ZingMp3ChartMusicVideoIE(ZingMp3BaseIE): 'id': 'IWZ9Z086', 'title': 'the-loai-video_Khong-Loi', }, - 'playlist_mincount': 10, + 'playlist_mincount': 1, }] def _fetch_page(self, song_id, url_type, page): - return self._parse_items(self._call_api(url_type, { + return self._call_api(url_type, { 'id': song_id, 'type': 'genre', - 'page': page + 1, + 'page': page, 'count': self._PER_PAGE - }).get('items')) + }) def _real_extract(self, url): song_id, regions, url_type = self._match_valid_url(url).group('id', 'regions', 'type') - return self.playlist_result( - OnDemandPagedList(functools.partial(self._fetch_page, song_id, url_type), self._PER_PAGE), - song_id, f'{url_type}_{regions}') + return self.playlist_result(self._paged_list(song_id, url_type), song_id, f'{url_type}_{regions}') class ZingMp3UserIE(ZingMp3BaseIE): @@ -331,7 +341,7 @@ class ZingMp3UserIE(ZingMp3BaseIE): 'info_dict': { 'id': 'IWZ98609', 'title': 'Mr. 
Siro - bai-hat', - 'description': 'md5:85ab29bd7b21725c12bf76fd1d6922e5', + 'description': 'md5:5bdcf45e955dc1b8d7f518f322ffef36', }, 'playlist_mincount': 91, }, { @@ -339,7 +349,7 @@ class ZingMp3UserIE(ZingMp3BaseIE): 'info_dict': { 'id': 'IWZ98609', 'title': 'Mr. Siro - album', - 'description': 'md5:85ab29bd7b21725c12bf76fd1d6922e5', + 'description': 'md5:5bdcf45e955dc1b8d7f518f322ffef36', }, 'playlist_mincount': 3, }, { @@ -347,7 +357,7 @@ class ZingMp3UserIE(ZingMp3BaseIE): 'info_dict': { 'id': 'IWZ98609', 'title': 'Mr. Siro - single', - 'description': 'md5:85ab29bd7b21725c12bf76fd1d6922e5', + 'description': 'md5:5bdcf45e955dc1b8d7f518f322ffef36', }, 'playlist_mincount': 20, }, { @@ -355,19 +365,19 @@ class ZingMp3UserIE(ZingMp3BaseIE): 'info_dict': { 'id': 'IWZ98609', 'title': 'Mr. Siro - video', - 'description': 'md5:85ab29bd7b21725c12bf76fd1d6922e5', + 'description': 'md5:5bdcf45e955dc1b8d7f518f322ffef36', }, 'playlist_mincount': 15, }] def _fetch_page(self, user_id, url_type, page): url_type = 'user-list-song' if url_type == 'bai-hat' else 'user-list-video' - return self._parse_items(self._call_api(url_type, { + return self._call_api(url_type, { 'id': user_id, 'type': 'artist', - 'page': page + 1, + 'page': page, 'count': self._PER_PAGE - }, query={'sort': 'new', 'sectionId': 'aSong'}).get('items')) + }) def _real_extract(self, url): user_alias, url_type = self._match_valid_url(url).group('user', 'type') @@ -376,10 +386,41 @@ def _real_extract(self, url): user_info = self._call_api('info-artist', {}, user_alias, query={'alias': user_alias}) if url_type in ('bai-hat', 'video'): - entries = OnDemandPagedList( - functools.partial(self._fetch_page, user_info['id'], url_type), self._PER_PAGE) + entries = self._paged_list(user_info['id'], url_type) else: entries = self._parse_items(traverse_obj(user_info, ( - 'sections', lambda _, v: v['link'] == f'/{user_alias}/{url_type}', 'items', ...))) + 'sections', + lambda _, v: v['sectionId'] == 'aAlbum' if url_type == 
'album' else v['sectionId'] == 'aSingle', + 'items', ...))) return self.playlist_result( entries, user_info['id'], f'{user_info.get("name")} - {url_type}', user_info.get('biography')) + + +class ZingMp3HubIE(ZingMp3BaseIE): + IE_NAME = 'zingmp3:hub' + _VALID_URL = r'https?://(?:mp3\.zing|zingmp3)\.vn/(?P<type>hub)/(?P<regions>[^/]+)/(?P<id>[^\.]+)' + _TESTS = [{ + 'url': 'https://zingmp3.vn/hub/Nhac-Moi/IWZ9Z0CA.html', + 'info_dict': { + 'id': 'IWZ9Z0CA', + 'title': 'Nhạc Mới', + 'description': 'md5:1cc31b68a6f746427b07b2756c22a558', + }, + 'playlist_mincount': 20, + }, { + 'url': 'https://zingmp3.vn/hub/Nhac-Viet/IWZ9Z087.html', + 'info_dict': { + 'id': 'IWZ9Z087', + 'title': 'Nhạc Việt', + 'description': 'md5:acc976c8bdde64d5c6ee4a92c39f7a77', + }, + 'playlist_mincount': 30, + }] + + def _real_extract(self, url): + song_id, regions, url_type = self._match_valid_url(url).group('id', 'regions', 'type') + hub_detail = self._call_api(url_type, {'id': song_id}) + entries = self._parse_items(traverse_obj(hub_detail, ( + 'sections', lambda _, v: v['sectionId'] == 'hub', 'items', ...))) + return self.playlist_result( + entries, song_id, hub_detail.get('title'), hub_detail.get('description')) diff --git a/plugin/yt-dlp/yt_dlp/jsinterp.py b/plugin/yt-dlp/yt_dlp/jsinterp.py index 8d89454..91f0db9 100644 --- a/plugin/yt-dlp/yt_dlp/jsinterp.py +++ b/plugin/yt-dlp/yt_dlp/jsinterp.py @@ -20,7 +20,12 @@ def _js_bit_op(op): def zeroise(x): - return 0 if x in (None, JS_Undefined) else x + if x in (None, JS_Undefined): + return 0 + with contextlib.suppress(TypeError): + if math.isnan(x): # NB: NaN cannot be checked by membership + return 0 + return x def wrapped(a, b): return op(zeroise(a), zeroise(b)) & 0xffffffff @@ -39,7 +44,7 @@ def wrapped(a, b): def _js_div(a, b): - if JS_Undefined in (a, b) or not (a and b): + if JS_Undefined in (a, b) or not (a or b): return float('nan') return (a or 0) / b if b else float('inf') @@ -243,7 +248,7 @@ def _separate(expr, delim=',', 
max_split=None): return counters = {k: 0 for k in _MATCHING_PARENS.values()} start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1 - in_quote, escaping, after_op, in_regex_char_group, in_unary_op = None, False, True, False, False + in_quote, escaping, after_op, in_regex_char_group = None, False, True, False for idx, char in enumerate(expr): if not in_quote and char in _MATCHING_PARENS: counters[_MATCHING_PARENS[char]] += 1 @@ -347,8 +352,10 @@ def interpret_statement(self, stmt, local_vars, allow_recursion=100): inner, outer = self._separate(expr, expr[0], 1) if expr[0] == '/': flags, outer = self._regex_flags(outer) + # We don't support regex methods yet, so no point compiling it + inner = f'{inner}/{flags}' # Avoid https://github.com/python/cpython/issues/74534 - inner = re.compile(inner[1:].replace('[[', r'[\['), flags=flags) + # inner = re.compile(inner[1:].replace('[[', r'[\['), flags=flags) else: inner = json.loads(js_to_json(f'{inner}{expr[0]}', strict=True)) if not outer: @@ -438,7 +445,7 @@ def dict_item(key, val): err = e pending = (None, False) - m = re.match(r'catch\s*(?P<err>\(\s*{_NAME_RE}\s*\))?\{{'.format(**globals()), expr) + m = re.match(fr'catch\s*(?P<err>\(\s*{_NAME_RE}\s*\))?\{{', expr) if m: sub_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) if err: @@ -772,7 +779,7 @@ def extract_object(self, objname): obj = {} obj_m = re.search( r'''(?x) - (?<!this\.)%s\s*=\s*{\s* + (?<!\.)%s\s*=\s*{\s* (?P<fields>(%s\s*:\s*function\s*\(.*?\)\s*{.*?}(?:,\s*)?)*) }\s*; ''' % (re.escape(objname), _FUNC_NAME_RE), @@ -805,9 +812,9 @@ def extract_function_code(self, funcname): \((?P<args>[^)]*)\)\s* (?P<code>{.+})''' % {'name': re.escape(funcname)}, self.code) - code, _ = self._separate_at_paren(func_m.group('code')) if func_m is None: raise self.Exception(f'Could not find JS function "{funcname}"') + code, _ = self._separate_at_paren(func_m.group('code')) return [x.strip() for x in func_m.group('args').split(',')], code def extract_function(self, 
funcname): diff --git a/plugin/yt-dlp/yt_dlp/options.py b/plugin/yt-dlp/yt_dlp/options.py index ccbe766..9829e72 100644 --- a/plugin/yt-dlp/yt_dlp/options.py +++ b/plugin/yt-dlp/yt_dlp/options.py @@ -34,6 +34,7 @@ join_nonempty, orderedSet_from_options, remove_end, + variadic, write_string, ) from .version import CHANNEL, __version__ @@ -250,7 +251,7 @@ def _dict_from_options_callback( if multiple_args: val = [val, *value[1:]] elif default_key is not None: - keys, val = [default_key], value + keys, val = variadic(default_key), value else: raise optparse.OptionValueError( f'wrong {opt_str} formatting; it should be {option.metavar}, not "{value}"') @@ -323,7 +324,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): help='Print program version and exit') general.add_option( '-U', '--update', - action='store_true', dest='update_self', + action='store_const', dest='update_self', const=CHANNEL, help=format_field( is_non_updateable(), None, 'Check if updates are available. %s', default=f'Update this program to the latest {CHANNEL} version')) @@ -335,9 +336,9 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): '--update-to', action='store', dest='update_self', metavar='[CHANNEL]@[TAG]', help=( - 'Upgrade/downgrade to a specific version. CHANNEL and TAG defaults to ' - f'"{CHANNEL}" and "latest" respectively if omitted; See "UPDATE" for details. ' - f'Supported channels: {", ".join(UPDATE_SOURCES)}')) + 'Upgrade/downgrade to a specific version. CHANNEL can be a repository as well. ' + f'CHANNEL and TAG default to "{CHANNEL.partition("@")[0]}" and "latest" respectively if omitted; ' + f'See "UPDATE" for details. 
Supported channels: {", ".join(UPDATE_SOURCES)}')) general.add_option( '-i', '--ignore-errors', action='store_true', dest='ignoreerrors', @@ -411,7 +412,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): general.add_option( '--no-flat-playlist', action='store_false', dest='extract_flat', - help='Extract the videos of a playlist') + help='Fully extract the videos of a playlist (default)') general.add_option( '--live-from-start', action='store_true', dest='live_from_start', @@ -440,8 +441,25 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): help='Do not mark videos watched (default)') general.add_option( '--no-colors', '--no-colours', - action='store_true', dest='no_color', default=False, - help='Do not emit color codes in output (Alias: --no-colours)') + action='store_const', dest='color', const={ + 'stdout': 'no_color', + 'stderr': 'no_color', + }, + help=optparse.SUPPRESS_HELP) + general.add_option( + '--color', + dest='color', metavar='[STREAM:]POLICY', default={}, type='str', + action='callback', callback=_dict_from_options_callback, + callback_kwargs={ + 'allowed_keys': 'stdout|stderr', + 'default_key': ['stdout', 'stderr'], + 'process': str.strip, + }, help=( + 'Whether to emit color codes in output, optionally prefixed by ' + 'the STREAM (stdout or stderr) to apply the setting to. ' + 'Can be one of "always", "auto" (default), "never", or ' + '"no_color" (use non color terminal sequences). 
' + 'Can be used multiple times')) general.add_option( '--compat-options', metavar='OPTS', dest='compat_opts', default=set(), type='str', @@ -449,15 +467,15 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): callback_kwargs={ 'allowed_values': { 'filename', 'filename-sanitization', 'format-sort', 'abort-on-error', 'format-spec', 'no-playlist-metafiles', - 'multistreams', 'no-live-chat', 'playlist-index', 'list-formats', 'no-direct-merge', + 'multistreams', 'no-live-chat', 'playlist-index', 'list-formats', 'no-direct-merge', 'playlist-match-filter', 'no-attach-info-json', 'embed-thumbnail-atomicparsley', 'no-external-downloader-progress', 'embed-metadata', 'seperate-video-versions', 'no-clean-infojson', 'no-keep-subs', 'no-certifi', 'no-youtube-channel-redirect', 'no-youtube-unavailable-videos', 'no-youtube-prefer-utc-upload-date', }, 'aliases': { - 'youtube-dl': ['all', '-multistreams'], - 'youtube-dlc': ['all', '-no-youtube-channel-redirect', '-no-live-chat'], + 'youtube-dl': ['all', '-multistreams', '-playlist-match-filter'], + 'youtube-dlc': ['all', '-no-youtube-channel-redirect', '-no-live-chat', '-playlist-match-filter'], '2021': ['2022', 'no-certifi', 'filename-sanitization', 'no-youtube-prefer-utc-upload-date'], - '2022': ['no-external-downloader-progress'], + '2022': ['no-external-downloader-progress', 'playlist-match-filter'], } }, help=( 'Options that can help keep compatibility with youtube-dl or youtube-dlc ' @@ -521,11 +539,11 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): help=optparse.SUPPRESS_HELP) geo.add_option( '--xff', metavar='VALUE', - dest='geo_bypass', default="default", + dest='geo_bypass', default='default', help=( 'How to fake X-Forwarded-For HTTP header to try bypassing geographic restriction. 
' - 'One of "default" (Only when known to be useful), "never", ' - 'a two-letter ISO 3166-2 country code, or an IP block in CIDR notation')) + 'One of "default" (only when known to be useful), "never", ' + 'an IP block in CIDR notation, or a two-letter ISO 3166-2 country code')) geo.add_option( '--geo-bypass', action='store_const', dest='geo_bypass', const='default', @@ -617,7 +635,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): 'that contains the phrase "cats & dogs" (caseless). ' 'Use "--match-filter -" to interactively ask whether to download each video')) selection.add_option( - '--no-match-filter', + '--no-match-filters', dest='match_filter', action='store_const', const=None, help='Do not use any --match-filter (default)') selection.add_option( @@ -702,6 +720,10 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): '--netrc-location', dest='netrc_location', metavar='PATH', help='Location of .netrc authentication data; either the path or its containing directory. Defaults to ~/.netrc') + authentication.add_option( + '--netrc-cmd', + dest='netrc_cmd', metavar='NETRC_CMD', + help='Command to execute to get the credentials for an extractor.') authentication.add_option( '--video-password', dest='videopassword', metavar='PASSWORD', @@ -1392,8 +1414,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): '--clean-info-json', '--clean-infojson', action='store_true', dest='clean_infojson', default=None, help=( - 'Remove some private fields such as filenames from the infojson. ' - 'Note that it could still contain some personal information (default)')) + 'Remove some internal metadata such as filenames from the infojson (default)')) filesystem.add_option( '--no-clean-info-json', '--no-clean-infojson', action='store_false', dest='clean_infojson', @@ -1656,8 +1677,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): 'Execute a command, optionally prefixed with when to execute it, separated by a ":". 
' 'Supported values of "WHEN" are the same as that of --use-postprocessor (default: after_move). ' 'Same syntax as the output template can be used to pass any field as arguments to the command. ' - 'After download, an additional field "filepath" that contains the final path of the downloaded file ' - 'is also available, and if no fields are passed, %(filepath,_filename|)q is appended to the end of the command. ' + 'If no fields are passed, %(filepath,_filename|)q is appended to the end of the command. ' 'This option can be used multiple times')) postproc.add_option( '--no-exec', diff --git a/plugin/yt-dlp/yt_dlp/postprocessor/common.py b/plugin/yt-dlp/yt_dlp/postprocessor/common.py index aa8dbff..cfef843 100644 --- a/plugin/yt-dlp/yt_dlp/postprocessor/common.py +++ b/plugin/yt-dlp/yt_dlp/postprocessor/common.py @@ -187,7 +187,7 @@ def report_progress(self, s): tmpl = progress_template.get('postprocess') if tmpl: self._downloader.to_screen( - self._downloader.evaluate_outtmpl(tmpl, progress_dict), skip_eol=True, quiet=False) + self._downloader.evaluate_outtmpl(tmpl, progress_dict), quiet=False) self._downloader.to_console_title(self._downloader.evaluate_outtmpl( progress_template.get('postprocess-title') or 'yt-dlp %(progress._default_template)s', diff --git a/plugin/yt-dlp/yt_dlp/update.py b/plugin/yt-dlp/yt_dlp/update.py index 1be47e7..2d4b1a4 100644 --- a/plugin/yt-dlp/yt_dlp/update.py +++ b/plugin/yt-dlp/yt_dlp/update.py @@ -16,6 +16,7 @@ Popen, cached_method, deprecation_warning, + network_exceptions, remove_end, remove_start, sanitized_Request, @@ -128,27 +129,36 @@ def __init__(self, ydl, target=None): self.ydl = ydl self.target_channel, sep, self.target_tag = (target or CHANNEL).rpartition('@') - if not sep and self.target_tag in UPDATE_SOURCES: # stable => stable@latest - self.target_channel, self.target_tag = self.target_tag, None + # stable => stable@latest + if not sep and ('/' in self.target_tag or self.target_tag in UPDATE_SOURCES): + 
self.target_channel = self.target_tag + self.target_tag = None elif not self.target_channel: - self.target_channel = CHANNEL + self.target_channel = CHANNEL.partition('@')[0] if not self.target_tag: - self.target_tag, self._exact = 'latest', False + self.target_tag = 'latest' + self._exact = False elif self.target_tag != 'latest': self.target_tag = f'tags/{self.target_tag}' - @property - def _target_repo(self): - try: - return UPDATE_SOURCES[self.target_channel] - except KeyError: - return self._report_error( - f'Invalid update channel {self.target_channel!r} requested. ' - f'Valid channels are {", ".join(UPDATE_SOURCES)}', True) + if '/' in self.target_channel: + self._target_repo = self.target_channel + if self.target_channel not in (CHANNEL, *UPDATE_SOURCES.values()): + self.ydl.report_warning( + f'You are switching to an {self.ydl._format_err("unofficial", "red")} executable ' + f'from {self.ydl._format_err(self._target_repo, self.ydl.Styles.EMPHASIS)}. ' + f'Run {self.ydl._format_err("at your own risk", "light red")}') + self._block_restart('Automatically restarting into custom builds is disabled for security reasons') + else: + self._target_repo = UPDATE_SOURCES.get(self.target_channel) + if not self._target_repo: + self._report_error( + f'Invalid update channel {self.target_channel!r} requested. 
' + f'Valid channels are {", ".join(UPDATE_SOURCES)}', True) def _version_compare(self, a, b, channel=CHANNEL): - if channel != self.target_channel: + if self._exact and channel != self.target_channel: return False if _VERSION_RE.fullmatch(f'{a}.{b}'): @@ -258,8 +268,8 @@ def check_update(self): self.ydl.to_screen(( f'Available version: {self._label(self.target_channel, self.latest_version)}, ' if self.target_tag == 'latest' else '' ) + f'Current version: {self._label(CHANNEL, self.current_version)}') - except Exception: - return self._report_network_error('obtain version info', delim='; Please try again later or') + except network_exceptions as e: + return self._report_network_error(f'obtain version info ({e})', delim='; Please try again later or') if not is_non_updateable(): self.ydl.to_screen(f'Current Build Hash: {_sha256_file(self.filename)}') @@ -284,6 +294,7 @@ def update(self): if (_VERSION_RE.fullmatch(self.target_tag[5:]) and version_tuple(self.target_tag[5:]) < (2023, 3, 2)): self.ydl.report_warning('You are downgrading to a version without --update-to') + self._block_restart('Cannot automatically restart to a version without --update-to') directory = os.path.dirname(self.filename) if not os.access(self.filename, os.W_OK): @@ -303,7 +314,7 @@ def update(self): try: newcontent = self._download(self.release_name, self._tag) - except Exception as e: + except network_exceptions as e: if isinstance(e, urllib.error.HTTPError) and e.code == 404: return self._report_error( f'The requested tag {self._label(self.target_channel, self.target_tag)} does not exist', True) @@ -371,6 +382,12 @@ def restart(self): _, _, returncode = Popen.run(self.cmd) return returncode + def _block_restart(self, msg): + def wrapper(): + self._report_error(f'{msg}. 
Restart yt-dlp to use the updated version', expected=True) + return self.ydl._download_retcode + self.restart = wrapper + def run_update(ydl): """Update the program file with the latest version from the repository diff --git a/plugin/yt-dlp/yt_dlp/utils/__init__.py b/plugin/yt-dlp/yt_dlp/utils/__init__.py new file mode 100644 index 0000000..74b39e2 --- /dev/null +++ b/plugin/yt-dlp/yt_dlp/utils/__init__.py @@ -0,0 +1,14 @@ +import warnings + +from ..compat.compat_utils import passthrough_module + +# XXX: Implement this the same way as other DeprecationWarnings without circular import +passthrough_module(__name__, '._legacy', callback=lambda attr: warnings.warn( + DeprecationWarning(f'{__name__}.{attr} is deprecated'), stacklevel=5)) +del passthrough_module + +# isort: off +from .traversal import * +from ._utils import * +from ._utils import _configuration_args, _get_exe_version_output +from ._deprecated import * diff --git a/plugin/yt-dlp/yt_dlp/utils/_deprecated.py b/plugin/yt-dlp/yt_dlp/utils/_deprecated.py new file mode 100644 index 0000000..4454d84 --- /dev/null +++ b/plugin/yt-dlp/yt_dlp/utils/_deprecated.py @@ -0,0 +1,30 @@ +"""Deprecated - New code should avoid these""" + +from ._utils import preferredencoding + + +def encodeFilename(s, for_subprocess=False): + assert isinstance(s, str) + return s + + +def decodeFilename(b, for_subprocess=False): + return b + + +def decodeArgument(b): + return b + + +def decodeOption(optval): + if optval is None: + return optval + if isinstance(optval, bytes): + optval = optval.decode(preferredencoding()) + + assert isinstance(optval, str) + return optval + + +def error_to_compat_str(err): + return str(err) diff --git a/plugin/yt-dlp/yt_dlp/utils/_legacy.py b/plugin/yt-dlp/yt_dlp/utils/_legacy.py new file mode 100644 index 0000000..96ac468 --- /dev/null +++ b/plugin/yt-dlp/yt_dlp/utils/_legacy.py @@ -0,0 +1,180 @@ +"""No longer used and new code should not use. 
Exists only for API compat.""" + +import platform +import struct +import sys +import urllib.parse +import zlib + +from ._utils import Popen, decode_base_n, preferredencoding +from .traversal import traverse_obj +from ..dependencies import certifi, websockets + +# isort: split +from ..cookies import YoutubeDLCookieJar # noqa: F401 + +has_certifi = bool(certifi) +has_websockets = bool(websockets) + + +def load_plugins(name, suffix, namespace): + from ..plugins import load_plugins + ret = load_plugins(name, suffix) + namespace.update(ret) + return ret + + +def traverse_dict(dictn, keys, casesense=True): + return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True) + + +def decode_base(value, digits): + return decode_base_n(value, table=digits) + + +def platform_name(): + """ Returns the platform name as a str """ + return platform.platform() + + +def get_subprocess_encoding(): + if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: + # For subprocess calls, encode with locale encoding + # Refer to http://stackoverflow.com/a/9951851/35070 + encoding = preferredencoding() + else: + encoding = sys.getfilesystemencoding() + if encoding is None: + encoding = 'utf-8' + return encoding + + +# UNUSED +# Based on png2str() written by @gdkchan and improved by @yokrysty +# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706 +def decode_png(png_data): + # Reference: https://www.w3.org/TR/PNG/ + header = png_data[8:] + + if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR': + raise OSError('Not a valid PNG file.') + + int_map = {1: '>B', 2: '>H', 4: '>I'} + unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0] + + chunks = [] + + while header: + length = unpack_integer(header[:4]) + header = header[4:] + + chunk_type = header[:4] + header = header[4:] + + chunk_data = header[:length] + header = header[length:] + + header = header[4:] # Skip CRC + + chunks.append({ + 'type': chunk_type, + 
'length': length, + 'data': chunk_data + }) + + ihdr = chunks[0]['data'] + + width = unpack_integer(ihdr[:4]) + height = unpack_integer(ihdr[4:8]) + + idat = b'' + + for chunk in chunks: + if chunk['type'] == b'IDAT': + idat += chunk['data'] + + if not idat: + raise OSError('Unable to read PNG data.') + + decompressed_data = bytearray(zlib.decompress(idat)) + + stride = width * 3 + pixels = [] + + def _get_pixel(idx): + x = idx % stride + y = idx // stride + return pixels[y][x] + + for y in range(height): + basePos = y * (1 + stride) + filter_type = decompressed_data[basePos] + + current_row = [] + + pixels.append(current_row) + + for x in range(stride): + color = decompressed_data[1 + basePos + x] + basex = y * stride + x + left = 0 + up = 0 + + if x > 2: + left = _get_pixel(basex - 3) + if y > 0: + up = _get_pixel(basex - stride) + + if filter_type == 1: # Sub + color = (color + left) & 0xff + elif filter_type == 2: # Up + color = (color + up) & 0xff + elif filter_type == 3: # Average + color = (color + ((left + up) >> 1)) & 0xff + elif filter_type == 4: # Paeth + a = left + b = up + c = 0 + + if x > 2 and y > 0: + c = _get_pixel(basex - stride - 3) + + p = a + b - c + + pa = abs(p - a) + pb = abs(p - b) + pc = abs(p - c) + + if pa <= pb and pa <= pc: + color = (color + a) & 0xff + elif pb <= pc: + color = (color + b) & 0xff + else: + color = (color + c) & 0xff + + current_row.append(color) + + return width, height, pixels + + +def register_socks_protocols(): + # "Register" SOCKS protocols + # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904 + # URLs with protocols not in urlparse.uses_netloc are not handled correctly + for scheme in ('socks', 'socks4', 'socks4a', 'socks5'): + if scheme not in urllib.parse.uses_netloc: + urllib.parse.uses_netloc.append(scheme) + + +def handle_youtubedl_headers(headers): + filtered_headers = headers + + if 'Youtubedl-no-compression' in filtered_headers: + filtered_headers = {k: v for k, v in 
filtered_headers.items() if k.lower() != 'accept-encoding'} + del filtered_headers['Youtubedl-no-compression'] + + return filtered_headers + + +def process_communicate_or_kill(p, *args, **kwargs): + return Popen.communicate_or_kill(p, *args, **kwargs) diff --git a/plugin/yt-dlp/yt_dlp/utils.py b/plugin/yt-dlp/yt_dlp/utils/_utils.py similarity index 85% rename from plugin/yt-dlp/yt_dlp/utils.py rename to plugin/yt-dlp/yt_dlp/utils/_utils.py index 0ae2492..bc1bc91 100644 --- a/plugin/yt-dlp/yt_dlp/utils.py +++ b/plugin/yt-dlp/yt_dlp/utils/_utils.py @@ -1,6528 +1,5971 @@ -import asyncio -import atexit -import base64 -import binascii -import calendar -import codecs -import collections -import collections.abc -import contextlib -import datetime -import email.header -import email.utils -import errno -import gzip -import hashlib -import hmac -import html.entities -import html.parser -import http.client -import http.cookiejar -import inspect -import io -import itertools -import json -import locale -import math -import mimetypes -import operator -import os -import platform -import random -import re -import shlex -import socket -import ssl -import struct -import subprocess -import sys -import tempfile -import time -import traceback -import types -import unicodedata -import urllib.error -import urllib.parse -import urllib.request -import xml.etree.ElementTree -import zlib - -from .compat import functools # isort: split -from .compat import ( - compat_etree_fromstring, - compat_expanduser, - compat_HTMLParseError, - compat_os_name, - compat_shlex_quote, -) -from .dependencies import brotli, certifi, websockets, xattr -from .socks import ProxyType, sockssocket - - -def register_socks_protocols(): - # "Register" SOCKS protocols - # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904 - # URLs with protocols not in urlparse.uses_netloc are not handled correctly - for scheme in ('socks', 'socks4', 'socks4a', 'socks5'): - if scheme not in 
urllib.parse.uses_netloc: - urllib.parse.uses_netloc.append(scheme) - - -# This is not clearly defined otherwise -compiled_regex_type = type(re.compile('')) - - -def random_user_agent(): - _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36' - _CHROME_VERSIONS = ( - '90.0.4430.212', - '90.0.4430.24', - '90.0.4430.70', - '90.0.4430.72', - '90.0.4430.85', - '90.0.4430.93', - '91.0.4472.101', - '91.0.4472.106', - '91.0.4472.114', - '91.0.4472.124', - '91.0.4472.164', - '91.0.4472.19', - '91.0.4472.77', - '92.0.4515.107', - '92.0.4515.115', - '92.0.4515.131', - '92.0.4515.159', - '92.0.4515.43', - '93.0.4556.0', - '93.0.4577.15', - '93.0.4577.63', - '93.0.4577.82', - '94.0.4606.41', - '94.0.4606.54', - '94.0.4606.61', - '94.0.4606.71', - '94.0.4606.81', - '94.0.4606.85', - '95.0.4638.17', - '95.0.4638.50', - '95.0.4638.54', - '95.0.4638.69', - '95.0.4638.74', - '96.0.4664.18', - '96.0.4664.45', - '96.0.4664.55', - '96.0.4664.93', - '97.0.4692.20', - ) - return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS) - - -SUPPORTED_ENCODINGS = [ - 'gzip', 'deflate' -] -if brotli: - SUPPORTED_ENCODINGS.append('br') - -std_headers = { - 'User-Agent': random_user_agent(), - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - 'Accept-Language': 'en-us,en;q=0.5', - 'Sec-Fetch-Mode': 'navigate', -} - - -USER_AGENTS = { - 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27', -} - - -NO_DEFAULT = object() -IDENTITY = lambda x: x - -ENGLISH_MONTH_NAMES = [ - 'January', 'February', 'March', 'April', 'May', 'June', - 'July', 'August', 'September', 'October', 'November', 'December'] - -MONTH_NAMES = { - 'en': ENGLISH_MONTH_NAMES, - 'fr': [ - 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', - 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'], - # these follow the genitive grammatical case 
(dopełniacz) - # some websites might be using nominative, which will require another month list - # https://en.wikibooks.org/wiki/Polish/Noun_cases - 'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca', - 'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'], -} - -# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42 -TIMEZONE_NAMES = { - 'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0, - 'AST': -4, 'ADT': -3, # Atlantic (used in Canada) - 'EST': -5, 'EDT': -4, # Eastern - 'CST': -6, 'CDT': -5, # Central - 'MST': -7, 'MDT': -6, # Mountain - 'PST': -8, 'PDT': -7 # Pacific -} - -# needed for sanitizing filenames in restricted mode -ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ', - itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'], - 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y'))) - -DATE_FORMATS = ( - '%d %B %Y', - '%d %b %Y', - '%B %d %Y', - '%B %dst %Y', - '%B %dnd %Y', - '%B %drd %Y', - '%B %dth %Y', - '%b %d %Y', - '%b %dst %Y', - '%b %dnd %Y', - '%b %drd %Y', - '%b %dth %Y', - '%b %dst %Y %I:%M', - '%b %dnd %Y %I:%M', - '%b %drd %Y %I:%M', - '%b %dth %Y %I:%M', - '%Y %m %d', - '%Y-%m-%d', - '%Y.%m.%d.', - '%Y/%m/%d', - '%Y/%m/%d %H:%M', - '%Y/%m/%d %H:%M:%S', - '%Y%m%d%H%M', - '%Y%m%d%H%M%S', - '%Y%m%d', - '%Y-%m-%d %H:%M', - '%Y-%m-%d %H:%M:%S', - '%Y-%m-%d %H:%M:%S.%f', - '%Y-%m-%d %H:%M:%S:%f', - '%d.%m.%Y %H:%M', - '%d.%m.%Y %H.%M', - '%Y-%m-%dT%H:%M:%SZ', - '%Y-%m-%dT%H:%M:%S.%fZ', - '%Y-%m-%dT%H:%M:%S.%f0Z', - '%Y-%m-%dT%H:%M:%S', - '%Y-%m-%dT%H:%M:%S.%f', - '%Y-%m-%dT%H:%M', - '%b %d %Y at %H:%M', - '%b %d %Y at %H:%M:%S', - '%B %d %Y at %H:%M', - '%B %d %Y at %H:%M:%S', - '%H:%M %d-%b-%Y', -) - -DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS) -DATE_FORMATS_DAY_FIRST.extend([ - '%d-%m-%Y', - '%d.%m.%Y', - '%d.%m.%y', - '%d/%m/%Y', - '%d/%m/%y', - '%d/%m/%Y %H:%M:%S', - '%d-%m-%Y %H:%M', -]) - 
-DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS) -DATE_FORMATS_MONTH_FIRST.extend([ - '%m-%d-%Y', - '%m.%d.%Y', - '%m/%d/%Y', - '%m/%d/%y', - '%m/%d/%Y %H:%M:%S', -]) - -PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)" -JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>' - -NUMBER_RE = r'\d+(?:\.\d+)?' - - -@functools.cache -def preferredencoding(): - """Get preferred encoding. - - Returns the best encoding scheme for the system, based on - locale.getpreferredencoding() and some further tweaks. - """ - try: - pref = locale.getpreferredencoding() - 'TEST'.encode(pref) - except Exception: - pref = 'UTF-8' - - return pref - - -def write_json_file(obj, fn): - """ Encode obj as JSON and write it to fn, atomically if possible """ - - tf = tempfile.NamedTemporaryFile( - prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn), - suffix='.tmp', delete=False, mode='w', encoding='utf-8') - - try: - with tf: - json.dump(obj, tf, ensure_ascii=False) - if sys.platform == 'win32': - # Need to remove existing file on Windows, else os.rename raises - # WindowsError or FileExistsError. 
- with contextlib.suppress(OSError): - os.unlink(fn) - with contextlib.suppress(OSError): - mask = os.umask(0) - os.umask(mask) - os.chmod(tf.name, 0o666 & ~mask) - os.rename(tf.name, fn) - except Exception: - with contextlib.suppress(OSError): - os.remove(tf.name) - raise - - -def find_xpath_attr(node, xpath, key, val=None): - """ Find the xpath xpath[@key=val] """ - assert re.match(r'^[a-zA-Z_-]+$', key) - expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']") - return node.find(expr) - -# On python2.6 the xml.etree.ElementTree.Element methods don't support -# the namespace parameter - - -def xpath_with_ns(path, ns_map): - components = [c.split(':') for c in path.split('/')] - replaced = [] - for c in components: - if len(c) == 1: - replaced.append(c[0]) - else: - ns, tag = c - replaced.append('{%s}%s' % (ns_map[ns], tag)) - return '/'.join(replaced) - - -def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT): - def _find_xpath(xpath): - return node.find(xpath) - - if isinstance(xpath, str): - n = _find_xpath(xpath) - else: - for xp in xpath: - n = _find_xpath(xp) - if n is not None: - break - - if n is None: - if default is not NO_DEFAULT: - return default - elif fatal: - name = xpath if name is None else name - raise ExtractorError('Could not find XML element %s' % name) - else: - return None - return n - - -def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT): - n = xpath_element(node, xpath, name, fatal=fatal, default=default) - if n is None or n == default: - return n - if n.text is None: - if default is not NO_DEFAULT: - return default - elif fatal: - name = xpath if name is None else name - raise ExtractorError('Could not find XML element\'s text %s' % name) - else: - return None - return n.text - - -def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT): - n = find_xpath_attr(node, xpath, key) - if n is None: - if default is not NO_DEFAULT: - return default - elif fatal: - name = 
f'{xpath}[@{key}]' if name is None else name - raise ExtractorError('Could not find XML attribute %s' % name) - else: - return None - return n.attrib[key] - - -def get_element_by_id(id, html, **kwargs): - """Return the content of the tag with the specified ID in the passed HTML document""" - return get_element_by_attribute('id', id, html, **kwargs) - - -def get_element_html_by_id(id, html, **kwargs): - """Return the html of the tag with the specified ID in the passed HTML document""" - return get_element_html_by_attribute('id', id, html, **kwargs) - - -def get_element_by_class(class_name, html): - """Return the content of the first tag with the specified class in the passed HTML document""" - retval = get_elements_by_class(class_name, html) - return retval[0] if retval else None - - -def get_element_html_by_class(class_name, html): - """Return the html of the first tag with the specified class in the passed HTML document""" - retval = get_elements_html_by_class(class_name, html) - return retval[0] if retval else None - - -def get_element_by_attribute(attribute, value, html, **kwargs): - retval = get_elements_by_attribute(attribute, value, html, **kwargs) - return retval[0] if retval else None - - -def get_element_html_by_attribute(attribute, value, html, **kargs): - retval = get_elements_html_by_attribute(attribute, value, html, **kargs) - return retval[0] if retval else None - - -def get_elements_by_class(class_name, html, **kargs): - """Return the content of all tags with the specified class in the passed HTML document as a list""" - return get_elements_by_attribute( - 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name), - html, escape_value=False) - - -def get_elements_html_by_class(class_name, html): - """Return the html of all tags with the specified class in the passed HTML document as a list""" - return get_elements_html_by_attribute( - 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name), - html, 
escape_value=False) - - -def get_elements_by_attribute(*args, **kwargs): - """Return the content of the tag with the specified attribute in the passed HTML document""" - return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)] - - -def get_elements_html_by_attribute(*args, **kwargs): - """Return the html of the tag with the specified attribute in the passed HTML document""" - return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)] - - -def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True): - """ - Return the text (content) and the html (whole) of the tag with the specified - attribute in the passed HTML document - """ - if not value: - return - - quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?' - - value = re.escape(value) if escape_value else value - - partial_element_re = rf'''(?x) - <(?P<tag>{tag}) - (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)? - \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q) - ''' - - for m in re.finditer(partial_element_re, html): - content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():]) - - yield ( - unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)), - whole - ) - - -class HTMLBreakOnClosingTagParser(html.parser.HTMLParser): - """ - HTML parser which raises HTMLBreakOnClosingTagException upon reaching the - closing tag for the first opening tag it has encountered, and can be used - as a context manager - """ - - class HTMLBreakOnClosingTagException(Exception): - pass - - def __init__(self): - self.tagstack = collections.deque() - html.parser.HTMLParser.__init__(self) - - def __enter__(self): - return self - - def __exit__(self, *_): - self.close() - - def close(self): - # handle_endtag does not return upon raising HTMLBreakOnClosingTagException, - # so data remains buffered; we no longer have any interest in it, thus - # 
override this method to discard it - pass - - def handle_starttag(self, tag, _): - self.tagstack.append(tag) - - def handle_endtag(self, tag): - if not self.tagstack: - raise compat_HTMLParseError('no tags in the stack') - while self.tagstack: - inner_tag = self.tagstack.pop() - if inner_tag == tag: - break - else: - raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found') - if not self.tagstack: - raise self.HTMLBreakOnClosingTagException() - - -# XXX: This should be far less strict -def get_element_text_and_html_by_tag(tag, html): - """ - For the first element with the specified tag in the passed HTML document - return its' content (text) and the whole element (html) - """ - def find_or_raise(haystack, needle, exc): - try: - return haystack.index(needle) - except ValueError: - raise exc - closing_tag = f'</{tag}>' - whole_start = find_or_raise( - html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found')) - content_start = find_or_raise( - html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag')) - content_start += whole_start + 1 - with HTMLBreakOnClosingTagParser() as parser: - parser.feed(html[whole_start:content_start]) - if not parser.tagstack or parser.tagstack[0] != tag: - raise compat_HTMLParseError(f'parser did not match opening {tag} tag') - offset = content_start - while offset < len(html): - next_closing_tag_start = find_or_raise( - html[offset:], closing_tag, - compat_HTMLParseError(f'closing {tag} tag not found')) - next_closing_tag_end = next_closing_tag_start + len(closing_tag) - try: - parser.feed(html[offset:offset + next_closing_tag_end]) - offset += next_closing_tag_end - except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException: - return html[content_start:offset + next_closing_tag_start], \ - html[whole_start:offset + next_closing_tag_end] - raise compat_HTMLParseError('unexpected end of html') - - -class HTMLAttributeParser(html.parser.HTMLParser): - """Trivial HTML 
parser to gather the attributes for a single element""" - - def __init__(self): - self.attrs = {} - html.parser.HTMLParser.__init__(self) - - def handle_starttag(self, tag, attrs): - self.attrs = dict(attrs) - raise compat_HTMLParseError('done') - - -class HTMLListAttrsParser(html.parser.HTMLParser): - """HTML parser to gather the attributes for the elements of a list""" - - def __init__(self): - html.parser.HTMLParser.__init__(self) - self.items = [] - self._level = 0 - - def handle_starttag(self, tag, attrs): - if tag == 'li' and self._level == 0: - self.items.append(dict(attrs)) - self._level += 1 - - def handle_endtag(self, tag): - self._level -= 1 - - -def extract_attributes(html_element): - """Given a string for an HTML element such as - <el - a="foo" B="bar" c="&98;az" d=boz - empty= noval entity="&" - sq='"' dq="'" - > - Decode and return a dictionary of attributes. - { - 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz', - 'empty': '', 'noval': None, 'entity': '&', - 'sq': '"', 'dq': '\'' - }. - """ - parser = HTMLAttributeParser() - with contextlib.suppress(compat_HTMLParseError): - parser.feed(html_element) - parser.close() - return parser.attrs - - -def parse_list(webpage): - """Given a string for an series of HTML <li> elements, - return a dictionary of their attributes""" - parser = HTMLListAttrsParser() - parser.feed(webpage) - parser.close() - return parser.items - - -def clean_html(html): - """Clean an HTML snippet into a readable string""" - - if html is None: # Convenience for sanitizing descriptions etc. 
- return html - - html = re.sub(r'\s+', ' ', html) - html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html) - html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html) - # Strip html tags - html = re.sub('<.*?>', '', html) - # Replace html entities - html = unescapeHTML(html) - return html.strip() - - -class LenientJSONDecoder(json.JSONDecoder): - # TODO: Write tests - def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs): - self.transform_source, self.ignore_extra = transform_source, ignore_extra - self._close_attempts = 2 * close_objects - super().__init__(*args, **kwargs) - - @staticmethod - def _close_object(err): - doc = err.doc[:err.pos] - # We need to add comma first to get the correct error message - if err.msg.startswith('Expecting \',\''): - return doc + ',' - elif not doc.endswith(','): - return - - if err.msg.startswith('Expecting property name'): - return doc[:-1] + '}' - elif err.msg.startswith('Expecting value'): - return doc[:-1] + ']' - - def decode(self, s): - if self.transform_source: - s = self.transform_source(s) - for attempt in range(self._close_attempts + 1): - try: - if self.ignore_extra: - return self.raw_decode(s.lstrip())[0] - return super().decode(s) - except json.JSONDecodeError as e: - if e.pos is None: - raise - elif attempt < self._close_attempts: - s = self._close_object(e) - if s is not None: - continue - raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos) - assert False, 'Too many attempts to decode JSON' - - -def sanitize_open(filename, open_mode): - """Try to open the given filename, and slightly tweak it if this fails. - - Attempts to open the given filename. If this fails, it tries to change - the filename slightly, step by step, until it's either able to open it - or it fails and raises a final exception, like the standard open() - function. - - It returns the tuple (stream, definitive_file_name). 
- """ - if filename == '-': - if sys.platform == 'win32': - import msvcrt - - # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout - with contextlib.suppress(io.UnsupportedOperation): - msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY) - return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename) - - for attempt in range(2): - try: - try: - if sys.platform == 'win32': - # FIXME: An exclusive lock also locks the file from being read. - # Since windows locks are mandatory, don't lock the file on windows (for now). - # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124 - raise LockingUnsupportedError() - stream = locked_file(filename, open_mode, block=False).__enter__() - except OSError: - stream = open(filename, open_mode) - return stream, filename - except OSError as err: - if attempt or err.errno in (errno.EACCES,): - raise - old_filename, filename = filename, sanitize_path(filename) - if old_filename == filename: - raise - - -def timeconvert(timestr): - """Convert RFC 2822 defined time string into system timestamp""" - timestamp = None - timetuple = email.utils.parsedate_tz(timestr) - if timetuple is not None: - timestamp = email.utils.mktime_tz(timetuple) - return timestamp - - -def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT): - """Sanitizes a string so it could be used as part of a filename. - @param restricted Use a stricter subset of allowed characters - @param is_id Whether this is an ID that should be kept unchanged if possible. - If unset, yt-dlp's new sanitization rules are in effect - """ - if s == '': - return '' - - def replace_insane(char): - if restricted and char in ACCENT_CHARS: - return ACCENT_CHARS[char] - elif not restricted and char == '\n': - return '\0 ' - elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\': - # Replace with their full-width unicode counterparts - return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0)) - elif char == '?' 
or ord(char) < 32 or ord(char) == 127: - return '' - elif char == '"': - return '' if restricted else '\'' - elif char == ':': - return '\0_\0-' if restricted else '\0 \0-' - elif char in '\\/|*<>': - return '\0_' - if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127): - return '\0_' - return char - - # Replace look-alike Unicode glyphs - if restricted and (is_id is NO_DEFAULT or not is_id): - s = unicodedata.normalize('NFKC', s) - s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) # Handle timestamps - result = ''.join(map(replace_insane, s)) - if is_id is NO_DEFAULT: - result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result) # Remove repeated substitute chars - STRIP_RE = r'(?:\0.|[ _-])*' - result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result) # Remove substitute chars from start/end - result = result.replace('\0', '') or '_' - - if not is_id: - while '__' in result: - result = result.replace('__', '_') - result = result.strip('_') - # Common case of "Foreign band name - English song title" - if restricted and result.startswith('-_'): - result = result[2:] - if result.startswith('-'): - result = '_' + result[len('-'):] - result = result.lstrip('.') - if not result: - result = '_' - return result - - -def sanitize_path(s, force=False): - """Sanitizes and normalizes path on Windows""" - if sys.platform == 'win32': - force = False - drive_or_unc, _ = os.path.splitdrive(s) - elif force: - drive_or_unc = '' - else: - return s - - norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep) - if drive_or_unc: - norm_path.pop(0) - sanitized_path = [ - path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part) - for path_part in norm_path] - if drive_or_unc: - sanitized_path.insert(0, drive_or_unc + os.path.sep) - elif force and s and s[0] == os.path.sep: - sanitized_path.insert(0, os.path.sep) - return os.path.join(*sanitized_path) - - -def 
sanitize_url(url, *, scheme='http'): - # Prepend protocol-less URLs with `http:` scheme in order to mitigate - # the number of unwanted failures due to missing protocol - if url is None: - return - elif url.startswith('//'): - return f'{scheme}:{url}' - # Fix some common typos seen so far - COMMON_TYPOS = ( - # https://github.com/ytdl-org/youtube-dl/issues/15649 - (r'^httpss://', r'https://'), - # https://bx1.be/lives/direct-tv/ - (r'^rmtp([es]?)://', r'rtmp\1://'), - ) - for mistake, fixup in COMMON_TYPOS: - if re.match(mistake, url): - return re.sub(mistake, fixup, url) - return url - - -def extract_basic_auth(url): - parts = urllib.parse.urlsplit(url) - if parts.username is None: - return url, None - url = urllib.parse.urlunsplit(parts._replace(netloc=( - parts.hostname if parts.port is None - else '%s:%d' % (parts.hostname, parts.port)))) - auth_payload = base64.b64encode( - ('%s:%s' % (parts.username, parts.password or '')).encode()) - return url, f'Basic {auth_payload.decode()}' - - -def sanitized_Request(url, *args, **kwargs): - url, auth_header = extract_basic_auth(escape_url(sanitize_url(url))) - if auth_header is not None: - headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {}) - headers['Authorization'] = auth_header - return urllib.request.Request(url, *args, **kwargs) - - -def expand_path(s): - """Expand shell variables and ~""" - return os.path.expandvars(compat_expanduser(s)) - - -def orderedSet(iterable, *, lazy=False): - """Remove all duplicates from the input iterable""" - def _iter(): - seen = [] # Do not use set since the items can be unhashable - for x in iterable: - if x not in seen: - seen.append(x) - yield x - - return _iter() if lazy else list(_iter()) - - -def _htmlentity_transform(entity_with_semicolon): - """Transforms an HTML entity to a character.""" - entity = entity_with_semicolon[:-1] - - # Known non-numeric HTML entity - if entity in html.entities.name2codepoint: - return 
chr(html.entities.name2codepoint[entity]) - - # TODO: HTML5 allows entities without a semicolon. - # E.g. 'Éric' should be decoded as 'Éric'. - if entity_with_semicolon in html.entities.html5: - return html.entities.html5[entity_with_semicolon] - - mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity) - if mobj is not None: - numstr = mobj.group(1) - if numstr.startswith('x'): - base = 16 - numstr = '0%s' % numstr - else: - base = 10 - # See https://github.com/ytdl-org/youtube-dl/issues/7518 - with contextlib.suppress(ValueError): - return chr(int(numstr, base)) - - # Unknown entity in name, return its literal representation - return '&%s;' % entity - - -def unescapeHTML(s): - if s is None: - return None - assert isinstance(s, str) - - return re.sub( - r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s) - - -def escapeHTML(text): - return ( - text - .replace('&', '&') - .replace('<', '<') - .replace('>', '>') - .replace('"', '"') - .replace("'", ''') - ) - - -def process_communicate_or_kill(p, *args, **kwargs): - deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed ' - f'in a future version. 
Use "{__name__}.Popen.communicate_or_kill" instead') - return Popen.communicate_or_kill(p, *args, **kwargs) - - -class Popen(subprocess.Popen): - if sys.platform == 'win32': - _startupinfo = subprocess.STARTUPINFO() - _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW - else: - _startupinfo = None - - @staticmethod - def _fix_pyinstaller_ld_path(env): - """Restore LD_LIBRARY_PATH when using PyInstaller - Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations - https://github.com/yt-dlp/yt-dlp/issues/4573 - """ - if not hasattr(sys, '_MEIPASS'): - return - - def _fix(key): - orig = env.get(f'{key}_ORIG') - if orig is None: - env.pop(key, None) - else: - env[key] = orig - - _fix('LD_LIBRARY_PATH') # Linux - _fix('DYLD_LIBRARY_PATH') # macOS - - def __init__(self, *args, env=None, text=False, **kwargs): - if env is None: - env = os.environ.copy() - self._fix_pyinstaller_ld_path(env) - - self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines') - if text is True: - kwargs['universal_newlines'] = True # For 3.6 compatibility - kwargs.setdefault('encoding', 'utf-8') - kwargs.setdefault('errors', 'replace') - super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo) - - def communicate_or_kill(self, *args, **kwargs): - try: - return self.communicate(*args, **kwargs) - except BaseException: # Including KeyboardInterrupt - self.kill(timeout=None) - raise - - def kill(self, *, timeout=0): - super().kill() - if timeout != 0: - self.wait(timeout=timeout) - - @classmethod - def run(cls, *args, timeout=None, **kwargs): - with cls(*args, **kwargs) as proc: - default = '' if proc.__text_mode else b'' - stdout, stderr = proc.communicate_or_kill(timeout=timeout) - return stdout or default, stderr or default, proc.returncode - - -def get_subprocess_encoding(): - if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: - # For 
subprocess calls, encode with locale encoding - # Refer to http://stackoverflow.com/a/9951851/35070 - encoding = preferredencoding() - else: - encoding = sys.getfilesystemencoding() - if encoding is None: - encoding = 'utf-8' - return encoding - - -def encodeFilename(s, for_subprocess=False): - assert isinstance(s, str) - return s - - -def decodeFilename(b, for_subprocess=False): - return b - - -def encodeArgument(s): - # Legacy code that uses byte strings - # Uncomment the following line after fixing all post processors - # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s)) - return s if isinstance(s, str) else s.decode('ascii') - - -def decodeArgument(b): - return b - - -def decodeOption(optval): - if optval is None: - return optval - if isinstance(optval, bytes): - optval = optval.decode(preferredencoding()) - - assert isinstance(optval, str) - return optval - - -_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds')) - - -def timetuple_from_msec(msec): - secs, msec = divmod(msec, 1000) - mins, secs = divmod(secs, 60) - hrs, mins = divmod(mins, 60) - return _timetuple(hrs, mins, secs, msec) - - -def formatSeconds(secs, delim=':', msec=False): - time = timetuple_from_msec(secs * 1000) - if time.hours: - ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds) - elif time.minutes: - ret = '%d%s%02d' % (time.minutes, delim, time.seconds) - else: - ret = '%d' % time.seconds - return '%s.%03d' % (ret, time.milliseconds) if msec else ret - - -def _ssl_load_windows_store_certs(ssl_context, storename): - # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py - try: - certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename) - if encoding == 'x509_asn' and ( - trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)] - except PermissionError: - return - for cert in certs: - with 
contextlib.suppress(ssl.SSLError): - ssl_context.load_verify_locations(cadata=cert) - - -def make_HTTPS_handler(params, **kwargs): - opts_check_certificate = not params.get('nocheckcertificate') - context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) - context.check_hostname = opts_check_certificate - if params.get('legacyserverconnect'): - context.options |= 4 # SSL_OP_LEGACY_SERVER_CONNECT - # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998 - context.set_ciphers('DEFAULT') - elif ( - sys.version_info < (3, 10) - and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1) - and not ssl.OPENSSL_VERSION.startswith('LibreSSL') - ): - # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1]. - # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting - # in some situations [2][3]. - # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely - # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe. - # LibreSSL is excluded until further investigation due to cipher support issues [5][6]. - # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536 - # 2. https://github.com/yt-dlp/yt-dlp/issues/4627 - # 3. https://github.com/yt-dlp/yt-dlp/pull/5294 - # 4. https://peps.python.org/pep-0644/ - # 5. https://peps.python.org/pep-0644/#libressl-support - # 6. 
https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368 - context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM') - context.minimum_version = ssl.TLSVersion.TLSv1_2 - - context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE - if opts_check_certificate: - if has_certifi and 'no-certifi' not in params.get('compat_opts', []): - context.load_verify_locations(cafile=certifi.where()) - else: - try: - context.load_default_certs() - # Work around the issue in load_default_certs when there are bad certificates. See: - # https://github.com/yt-dlp/yt-dlp/issues/1060, - # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312 - except ssl.SSLError: - # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151 - if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'): - for storename in ('CA', 'ROOT'): - _ssl_load_windows_store_certs(context, storename) - context.set_default_verify_paths() - - client_certfile = params.get('client_certificate') - if client_certfile: - try: - context.load_cert_chain( - client_certfile, keyfile=params.get('client_certificate_key'), - password=params.get('client_certificate_password')) - except ssl.SSLError: - raise YoutubeDLError('Unable to load client certificate') - - # Some servers may reject requests if ALPN extension is not sent. See: - # https://github.com/python/cpython/issues/85140 - # https://github.com/yt-dlp/yt-dlp/issues/3878 - with contextlib.suppress(NotImplementedError): - context.set_alpn_protocols(['http/1.1']) - - return YoutubeDLHTTPSHandler(params, context=context, **kwargs) - - -def bug_reports_message(before=';'): - from .update import REPOSITORY - - msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , ' - 'filling out the appropriate issue template. 
Confirm you are on the latest version using yt-dlp -U') - - before = before.rstrip() - if not before or before.endswith(('.', '!', '?')): - msg = msg[0].title() + msg[1:] - - return (before + ' ' if before else '') + msg - - -class YoutubeDLError(Exception): - """Base exception for YoutubeDL errors.""" - msg = None - - def __init__(self, msg=None): - if msg is not None: - self.msg = msg - elif self.msg is None: - self.msg = type(self).__name__ - super().__init__(self.msg) - - -network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error] -if hasattr(ssl, 'CertificateError'): - network_exceptions.append(ssl.CertificateError) -network_exceptions = tuple(network_exceptions) - - -class ExtractorError(YoutubeDLError): - """Error during info extraction.""" - - def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None): - """ tb, if given, is the original traceback (so that it can be printed out). - If expected is set, this is a normal error message and most likely not a bug in yt-dlp. 
- """ - if sys.exc_info()[0] in network_exceptions: - expected = True - - self.orig_msg = str(msg) - self.traceback = tb - self.expected = expected - self.cause = cause - self.video_id = video_id - self.ie = ie - self.exc_info = sys.exc_info() # preserve original exception - if isinstance(self.exc_info[1], ExtractorError): - self.exc_info = self.exc_info[1].exc_info - super().__init__(self.__msg) - - @property - def __msg(self): - return ''.join(( - format_field(self.ie, None, '[%s] '), - format_field(self.video_id, None, '%s: '), - self.orig_msg, - format_field(self.cause, None, ' (caused by %r)'), - '' if self.expected else bug_reports_message())) - - def format_traceback(self): - return join_nonempty( - self.traceback and ''.join(traceback.format_tb(self.traceback)), - self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]), - delim='\n') or None - - def __setattr__(self, name, value): - super().__setattr__(name, value) - if getattr(self, 'msg', None) and name not in ('msg', 'args'): - self.msg = self.__msg or type(self).__name__ - self.args = (self.msg, ) # Cannot be property - - -class UnsupportedError(ExtractorError): - def __init__(self, url): - super().__init__( - 'Unsupported URL: %s' % url, expected=True) - self.url = url - - -class RegexNotFoundError(ExtractorError): - """Error when a regex didn't match""" - pass - - -class GeoRestrictedError(ExtractorError): - """Geographic restriction Error exception. - - This exception may be thrown when a video is not available from your - geographic location due to geographic restrictions imposed by a website. 
- """ - - def __init__(self, msg, countries=None, **kwargs): - kwargs['expected'] = True - super().__init__(msg, **kwargs) - self.countries = countries - - -class UserNotLive(ExtractorError): - """Error when a channel/user is not live""" - - def __init__(self, msg=None, **kwargs): - kwargs['expected'] = True - super().__init__(msg or 'The channel is not currently live', **kwargs) - - -class DownloadError(YoutubeDLError): - """Download Error exception. - - This exception may be thrown by FileDownloader objects if they are not - configured to continue on errors. They will contain the appropriate - error message. - """ - - def __init__(self, msg, exc_info=None): - """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """ - super().__init__(msg) - self.exc_info = exc_info - - -class EntryNotInPlaylist(YoutubeDLError): - """Entry not in playlist exception. - - This exception will be thrown by YoutubeDL when a requested entry - is not found in the playlist info_dict - """ - msg = 'Entry not found in info' - - -class SameFileError(YoutubeDLError): - """Same File exception. - - This exception will be thrown by FileDownloader objects if they detect - multiple files would have to be downloaded to the same file on disk. - """ - msg = 'Fixed output name but more than one file to download' - - def __init__(self, filename=None): - if filename is not None: - self.msg += f': {filename}' - super().__init__(self.msg) - - -class PostProcessingError(YoutubeDLError): - """Post Processing exception. - - This exception may be raised by PostProcessor's .run() method to - indicate an error in the postprocessing task. 
- """ - - -class DownloadCancelled(YoutubeDLError): - """ Exception raised when the download queue should be interrupted """ - msg = 'The download was cancelled' - - -class ExistingVideoReached(DownloadCancelled): - """ --break-on-existing triggered """ - msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing' - - -class RejectedVideoReached(DownloadCancelled): - """ --break-match-filter triggered """ - msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter' - - -class MaxDownloadsReached(DownloadCancelled): - """ --max-downloads limit has been reached. """ - msg = 'Maximum number of downloads reached, stopping due to --max-downloads' - - -class ReExtractInfo(YoutubeDLError): - """ Video info needs to be re-extracted. """ - - def __init__(self, msg, expected=False): - super().__init__(msg) - self.expected = expected - - -class ThrottledDownload(ReExtractInfo): - """ Download speed below --throttled-rate. """ - msg = 'The download speed is below throttle limit' - - def __init__(self): - super().__init__(self.msg, expected=False) - - -class UnavailableVideoError(YoutubeDLError): - """Unavailable Format exception. - - This exception will be thrown when a video is requested - in a format that is not available for that video. - """ - msg = 'Unable to download video' - - def __init__(self, err=None): - if err is not None: - self.msg += f': {err}' - super().__init__(self.msg) - - -class ContentTooShortError(YoutubeDLError): - """Content Too Short exception. - - This exception may be raised by FileDownloader objects when a file they - download is too small for what the server announced first, indicating - the connection was probably interrupted. 
- """ - - def __init__(self, downloaded, expected): - super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes') - # Both in bytes - self.downloaded = downloaded - self.expected = expected - - -class XAttrMetadataError(YoutubeDLError): - def __init__(self, code=None, msg='Unknown error'): - super().__init__(msg) - self.code = code - self.msg = msg - - # Parsing code and msg - if (self.code in (errno.ENOSPC, errno.EDQUOT) - or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg): - self.reason = 'NO_SPACE' - elif self.code == errno.E2BIG or 'Argument list too long' in self.msg: - self.reason = 'VALUE_TOO_LONG' - else: - self.reason = 'NOT_SUPPORTED' - - -class XAttrUnavailableError(YoutubeDLError): - pass - - -def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): - hc = http_class(*args, **kwargs) - source_address = ydl_handler._params.get('source_address') - - if source_address is not None: - # This is to workaround _create_connection() from socket where it will try all - # address data from getaddrinfo() including IPv6. This filters the result from - # getaddrinfo() based on the source_address value. - # This is based on the cpython socket.create_connection() function. - # https://github.com/python/cpython/blob/master/Lib/socket.py#L691 - def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None): - host, port = address - err = None - addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM) - af = socket.AF_INET if '.' 
in source_address[0] else socket.AF_INET6 - ip_addrs = [addr for addr in addrs if addr[0] == af] - if addrs and not ip_addrs: - ip_version = 'v4' if af == socket.AF_INET else 'v6' - raise OSError( - "No remote IP%s addresses available for connect, can't use '%s' as source address" - % (ip_version, source_address[0])) - for res in ip_addrs: - af, socktype, proto, canonname, sa = res - sock = None - try: - sock = socket.socket(af, socktype, proto) - if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT: - sock.settimeout(timeout) - sock.bind(source_address) - sock.connect(sa) - err = None # Explicitly break reference cycle - return sock - except OSError as _: - err = _ - if sock is not None: - sock.close() - if err is not None: - raise err - else: - raise OSError('getaddrinfo returns an empty list') - if hasattr(hc, '_create_connection'): - hc._create_connection = _create_connection - hc.source_address = (source_address, 0) - - return hc - - -def handle_youtubedl_headers(headers): - filtered_headers = headers - - if 'Youtubedl-no-compression' in filtered_headers: - filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'} - del filtered_headers['Youtubedl-no-compression'] - - return filtered_headers - - -class YoutubeDLHandler(urllib.request.HTTPHandler): - """Handler for HTTP requests and responses. - - This class, when installed with an OpenerDirector, automatically adds - the standard headers to every HTTP request and handles gzipped and - deflated responses from web servers. If compression is to be avoided in - a particular request, the original request in the program code only has - to include the HTTP header "Youtubedl-no-compression", which will be - removed before making the real request. - - Part of this code was copied from: - - http://techknack.net/python-urllib2-handlers/ - - Andrew Rowls, the author of that code, agreed to release it to the - public domain. 
- """ - - def __init__(self, params, *args, **kwargs): - urllib.request.HTTPHandler.__init__(self, *args, **kwargs) - self._params = params - - def http_open(self, req): - conn_class = http.client.HTTPConnection - - socks_proxy = req.headers.get('Ytdl-socks-proxy') - if socks_proxy: - conn_class = make_socks_conn_class(conn_class, socks_proxy) - del req.headers['Ytdl-socks-proxy'] - - return self.do_open(functools.partial( - _create_http_connection, self, conn_class, False), - req) - - @staticmethod - def deflate(data): - if not data: - return data - try: - return zlib.decompress(data, -zlib.MAX_WBITS) - except zlib.error: - return zlib.decompress(data) - - @staticmethod - def brotli(data): - if not data: - return data - return brotli.decompress(data) - - def http_request(self, req): - # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not - # always respected by websites, some tend to give out URLs with non percent-encoded - # non-ASCII characters (see telemb.py, ard.py [#3412]) - # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991) - # To work around aforementioned issue we will replace request's original URL with - # percent-encoded one - # Since redirects are also affected (e.g. 
http://www.southpark.de/alle-episoden/s18e09) - # the code of this workaround has been moved here from YoutubeDL.urlopen() - url = req.get_full_url() - url_escaped = escape_url(url) - - # Substitute URL if any change after escaping - if url != url_escaped: - req = update_Request(req, url=url_escaped) - - for h, v in self._params.get('http_headers', std_headers).items(): - # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275 - # The dict keys are capitalized because of this bug by urllib - if h.capitalize() not in req.headers: - req.add_header(h, v) - - if 'Accept-encoding' not in req.headers: - req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS)) - - req.headers = handle_youtubedl_headers(req.headers) - - return super().do_request_(req) - - def http_response(self, req, resp): - old_resp = resp - # gzip - if resp.headers.get('Content-encoding', '') == 'gzip': - content = resp.read() - gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb') - try: - uncompressed = io.BytesIO(gz.read()) - except OSError as original_ioerror: - # There may be junk add the end of the file - # See http://stackoverflow.com/q/4928560/35070 for details - for i in range(1, 1024): - try: - gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb') - uncompressed = io.BytesIO(gz.read()) - except OSError: - continue - break - else: - raise original_ioerror - resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code) - resp.msg = old_resp.msg - # deflate - if resp.headers.get('Content-encoding', '') == 'deflate': - gz = io.BytesIO(self.deflate(resp.read())) - resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code) - resp.msg = old_resp.msg - # brotli - if resp.headers.get('Content-encoding', '') == 'br': - resp = urllib.request.addinfourl( - io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code) - resp.msg = old_resp.msg - # Percent-encode redirect URL 
of Location HTTP header to satisfy RFC 3986 (see - # https://github.com/ytdl-org/youtube-dl/issues/6457). - if 300 <= resp.code < 400: - location = resp.headers.get('Location') - if location: - # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3 - location = location.encode('iso-8859-1').decode() - location_escaped = escape_url(location) - if location != location_escaped: - del resp.headers['Location'] - resp.headers['Location'] = location_escaped - return resp - - https_request = http_request - https_response = http_response - - -def make_socks_conn_class(base_class, socks_proxy): - assert issubclass(base_class, ( - http.client.HTTPConnection, http.client.HTTPSConnection)) - - url_components = urllib.parse.urlparse(socks_proxy) - if url_components.scheme.lower() == 'socks5': - socks_type = ProxyType.SOCKS5 - elif url_components.scheme.lower() in ('socks', 'socks4'): - socks_type = ProxyType.SOCKS4 - elif url_components.scheme.lower() == 'socks4a': - socks_type = ProxyType.SOCKS4A - - def unquote_if_non_empty(s): - if not s: - return s - return urllib.parse.unquote_plus(s) - - proxy_args = ( - socks_type, - url_components.hostname, url_components.port or 1080, - True, # Remote DNS - unquote_if_non_empty(url_components.username), - unquote_if_non_empty(url_components.password), - ) - - class SocksConnection(base_class): - def connect(self): - self.sock = sockssocket() - self.sock.setproxy(*proxy_args) - if isinstance(self.timeout, (int, float)): - self.sock.settimeout(self.timeout) - self.sock.connect((self.host, self.port)) - - if isinstance(self, http.client.HTTPSConnection): - if hasattr(self, '_context'): # Python > 2.6 - self.sock = self._context.wrap_socket( - self.sock, server_hostname=self.host) - else: - self.sock = ssl.wrap_socket(self.sock) - - return SocksConnection - - -class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler): - def __init__(self, params, https_conn_class=None, *args, **kwargs): - 
urllib.request.HTTPSHandler.__init__(self, *args, **kwargs) - self._https_conn_class = https_conn_class or http.client.HTTPSConnection - self._params = params - - def https_open(self, req): - kwargs = {} - conn_class = self._https_conn_class - - if hasattr(self, '_context'): # python > 2.6 - kwargs['context'] = self._context - if hasattr(self, '_check_hostname'): # python 3.x - kwargs['check_hostname'] = self._check_hostname - - socks_proxy = req.headers.get('Ytdl-socks-proxy') - if socks_proxy: - conn_class = make_socks_conn_class(conn_class, socks_proxy) - del req.headers['Ytdl-socks-proxy'] - - try: - return self.do_open( - functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs) - except urllib.error.URLError as e: - if (isinstance(e.reason, ssl.SSLError) - and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'): - raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect') - raise - - -def is_path_like(f): - return isinstance(f, (str, bytes, os.PathLike)) - - -class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar): - """ - See [1] for cookie file format. - - 1. https://curl.haxx.se/docs/http-cookies.html - """ - _HTTPONLY_PREFIX = '#HttpOnly_' - _ENTRY_LEN = 7 - _HEADER = '''# Netscape HTTP Cookie File -# This file is generated by yt-dlp. Do not edit. 
- -''' - _CookieFileEntry = collections.namedtuple( - 'CookieFileEntry', - ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value')) - - def __init__(self, filename=None, *args, **kwargs): - super().__init__(None, *args, **kwargs) - if is_path_like(filename): - filename = os.fspath(filename) - self.filename = filename - - @staticmethod - def _true_or_false(cndn): - return 'TRUE' if cndn else 'FALSE' - - @contextlib.contextmanager - def open(self, file, *, write=False): - if is_path_like(file): - with open(file, 'w' if write else 'r', encoding='utf-8') as f: - yield f - else: - if write: - file.truncate(0) - yield file - - def _really_save(self, f, ignore_discard=False, ignore_expires=False): - now = time.time() - for cookie in self: - if (not ignore_discard and cookie.discard - or not ignore_expires and cookie.is_expired(now)): - continue - name, value = cookie.name, cookie.value - if value is None: - # cookies.txt regards 'Set-Cookie: foo' as a cookie - # with no name, whereas http.cookiejar regards it as a - # cookie with no value. - name, value = '', name - f.write('%s\n' % '\t'.join(( - cookie.domain, - self._true_or_false(cookie.domain.startswith('.')), - cookie.path, - self._true_or_false(cookie.secure), - str_or_none(cookie.expires, default=''), - name, value - ))) - - def save(self, filename=None, *args, **kwargs): - """ - Save cookies to a file. 
- Code is taken from CPython 3.6 - https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """ - - if filename is None: - if self.filename is not None: - filename = self.filename - else: - raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT) - - # Store session cookies with `expires` set to 0 instead of an empty string - for cookie in self: - if cookie.expires is None: - cookie.expires = 0 - - with self.open(filename, write=True) as f: - f.write(self._HEADER) - self._really_save(f, *args, **kwargs) - - def load(self, filename=None, ignore_discard=False, ignore_expires=False): - """Load cookies from a file.""" - if filename is None: - if self.filename is not None: - filename = self.filename - else: - raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT) - - def prepare_line(line): - if line.startswith(self._HTTPONLY_PREFIX): - line = line[len(self._HTTPONLY_PREFIX):] - # comments and empty lines are fine - if line.startswith('#') or not line.strip(): - return line - cookie_list = line.split('\t') - if len(cookie_list) != self._ENTRY_LEN: - raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list)) - cookie = self._CookieFileEntry(*cookie_list) - if cookie.expires_at and not cookie.expires_at.isdigit(): - raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at) - return line - - cf = io.StringIO() - with self.open(filename) as f: - for line in f: - try: - cf.write(prepare_line(line)) - except http.cookiejar.LoadError as e: - if f'{line.strip()} '[0] in '[{"': - raise http.cookiejar.LoadError( - 'Cookies file must be Netscape formatted, not JSON. 
See ' - 'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp') - write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n') - continue - cf.seek(0) - self._really_load(cf, filename, ignore_discard, ignore_expires) - # Session cookies are denoted by either `expires` field set to - # an empty string or 0. MozillaCookieJar only recognizes the former - # (see [1]). So we need force the latter to be recognized as session - # cookies on our own. - # Session cookies may be important for cookies-based authentication, - # e.g. usually, when user does not check 'Remember me' check box while - # logging in on a site, some important cookies are stored as session - # cookies so that not recognizing them will result in failed login. - # 1. https://bugs.python.org/issue17164 - for cookie in self: - # Treat `expires=0` cookies as session cookies - if cookie.expires == 0: - cookie.expires = None - cookie.discard = True - - -class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor): - def __init__(self, cookiejar=None): - urllib.request.HTTPCookieProcessor.__init__(self, cookiejar) - - def http_response(self, request, response): - return urllib.request.HTTPCookieProcessor.http_response(self, request, response) - - https_request = urllib.request.HTTPCookieProcessor.http_request - https_response = http_response - - -class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler): - """YoutubeDL redirect handler - - The code is based on HTTPRedirectHandler implementation from CPython [1]. - - This redirect handler solves two issues: - - ensures redirect URL is always unicode under python 2 - - introduces support for experimental HTTP response status code - 308 Permanent Redirect [2] used by some sites [3] - - 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py - 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308 - 3. 
https://github.com/ytdl-org/youtube-dl/issues/28768 - """ - - http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302 - - def redirect_request(self, req, fp, code, msg, headers, newurl): - """Return a Request or None in response to a redirect. - - This is called by the http_error_30x methods when a - redirection response is received. If a redirection should - take place, return a new Request to allow http_error_30x to - perform the redirect. Otherwise, raise HTTPError if no-one - else should try to handle this url. Return None if you can't - but another Handler might. - """ - m = req.get_method() - if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD") - or code in (301, 302, 303) and m == "POST")): - raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp) - # Strictly (according to RFC 2616), 301 or 302 in response to - # a POST MUST NOT cause a redirection without confirmation - # from the user (of urllib.request, in this case). In practice, - # essentially all clients do redirect in this case, so we do - # the same. - - # Be conciliant with URIs containing a space. This is mainly - # redundant with the more complete encoding done in http_error_302(), - # but it is kept for compatibility with other callers. - newurl = newurl.replace(' ', '%20') - - CONTENT_HEADERS = ("content-length", "content-type") - # NB: don't use dict comprehension for python 2.6 compatibility - newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS} - - # A 303 must either use GET or HEAD for subsequent request - # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4 - if code == 303 and m != 'HEAD': - m = 'GET' - # 301 and 302 redirects are commonly turned into a GET from a POST - # for subsequent requests by browsers, so we'll do the same. 
- # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2 - # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3 - if code in (301, 302) and m == 'POST': - m = 'GET' - - return urllib.request.Request( - newurl, headers=newheaders, origin_req_host=req.origin_req_host, - unverifiable=True, method=m) - - -def extract_timezone(date_str): - m = re.search( - r'''(?x) - ^.{8,}? # >=8 char non-TZ prefix, if present - (?P<tz>Z| # just the UTC Z, or - (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or - (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits - [ ]? # optional space - (?P<sign>\+|-) # +/- - (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm - $) - ''', date_str) - if not m: - m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str) - timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip()) - if timezone is not None: - date_str = date_str[:-len(m.group('tz'))] - timezone = datetime.timedelta(hours=timezone or 0) - else: - date_str = date_str[:-len(m.group('tz'))] - if not m.group('sign'): - timezone = datetime.timedelta() - else: - sign = 1 if m.group('sign') == '+' else -1 - timezone = datetime.timedelta( - hours=sign * int(m.group('hours')), - minutes=sign * int(m.group('minutes'))) - return timezone, date_str - - -def parse_iso8601(date_str, delimiter='T', timezone=None): - """ Return a UNIX timestamp from the given date """ - - if date_str is None: - return None - - date_str = re.sub(r'\.[0-9]+', '', date_str) - - if timezone is None: - timezone, date_str = extract_timezone(date_str) - - with contextlib.suppress(ValueError): - date_format = f'%Y-%m-%d{delimiter}%H:%M:%S' - dt = datetime.datetime.strptime(date_str, date_format) - timezone - return calendar.timegm(dt.timetuple()) - - -def date_formats(day_first=True): - return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST - - -def unified_strdate(date_str, day_first=True): - """Return a string with 
the date in the format YYYYMMDD""" - - if date_str is None: - return None - upload_date = None - # Replace commas - date_str = date_str.replace(',', ' ') - # Remove AM/PM + timezone - date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) - _, date_str = extract_timezone(date_str) - - for expression in date_formats(day_first): - with contextlib.suppress(ValueError): - upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') - if upload_date is None: - timetuple = email.utils.parsedate_tz(date_str) - if timetuple: - with contextlib.suppress(ValueError): - upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d') - if upload_date is not None: - return str(upload_date) - - -def unified_timestamp(date_str, day_first=True): - if date_str is None: - return None - - date_str = re.sub(r'\s+', ' ', re.sub( - r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str)) - - pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0 - timezone, date_str = extract_timezone(date_str) - - # Remove AM/PM + timezone - date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) - - # Remove unrecognized timezones from ISO 8601 alike timestamps - m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str) - if m: - date_str = date_str[:-len(m.group('tz'))] - - # Python only supports microseconds, so remove nanoseconds - m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str) - if m: - date_str = m.group(1) - - for expression in date_formats(day_first): - with contextlib.suppress(ValueError): - dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta) - return calendar.timegm(dt.timetuple()) - - timetuple = email.utils.parsedate_tz(date_str) - if timetuple: - return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds() - - -def determine_ext(url, default_ext='unknown_video'): - if url is 
None or '.' not in url: - return default_ext - guess = url.partition('?')[0].rpartition('.')[2] - if re.match(r'^[A-Za-z0-9]+$', guess): - return guess - # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download - elif guess.rstrip('/') in KNOWN_EXTENSIONS: - return guess.rstrip('/') - else: - return default_ext - - -def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None): - return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext) - - -def datetime_from_str(date_str, precision='auto', format='%Y%m%d'): - R""" - Return a datetime object from a string. - Supported format: - (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)? - - @param format strftime format of DATE - @param precision Round the datetime object: auto|microsecond|second|minute|hour|day - auto: round to the unit provided in date_str (if applicable). - """ - auto_precision = False - if precision == 'auto': - auto_precision = True - precision = 'microsecond' - today = datetime_round(datetime.datetime.utcnow(), precision) - if date_str in ('now', 'today'): - return today - if date_str == 'yesterday': - return today - datetime.timedelta(days=1) - match = re.match( - r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?', - date_str) - if match is not None: - start_time = datetime_from_str(match.group('start'), precision, format) - time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1) - unit = match.group('unit') - if unit == 'month' or unit == 'year': - new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time) - unit = 'day' - else: - if unit == 'week': - unit = 'day' - time *= 7 - delta = datetime.timedelta(**{unit + 's': time}) - new_date = start_time + delta - if auto_precision: - return datetime_round(new_date, unit) - return new_date - - return datetime_round(datetime.datetime.strptime(date_str, format), 
precision) - - -def date_from_str(date_str, format='%Y%m%d', strict=False): - R""" - Return a date object from a string using datetime_from_str - - @param strict Restrict allowed patterns to "YYYYMMDD" and - (now|today|yesterday)(-\d+(day|week|month|year)s?)? - """ - if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str): - raise ValueError(f'Invalid date format "{date_str}"') - return datetime_from_str(date_str, precision='microsecond', format=format).date() - - -def datetime_add_months(dt, months): - """Increment/Decrement a datetime object by months.""" - month = dt.month + months - 1 - year = dt.year + month // 12 - month = month % 12 + 1 - day = min(dt.day, calendar.monthrange(year, month)[1]) - return dt.replace(year, month, day) - - -def datetime_round(dt, precision='day'): - """ - Round a datetime object's time to a specific precision - """ - if precision == 'microsecond': - return dt - - unit_seconds = { - 'day': 86400, - 'hour': 3600, - 'minute': 60, - 'second': 1, - } - roundto = lambda x, n: ((x + n / 2) // n) * n - timestamp = calendar.timegm(dt.timetuple()) - return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision])) - - -def hyphenate_date(date_str): - """ - Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format""" - match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str) - if match is not None: - return '-'.join(match.groups()) - else: - return date_str - - -class DateRange: - """Represents a time interval between two dates""" - - def __init__(self, start=None, end=None): - """start and end must be strings in the format accepted by date""" - if start is not None: - self.start = date_from_str(start, strict=True) - else: - self.start = datetime.datetime.min.date() - if end is not None: - self.end = date_from_str(end, strict=True) - else: - self.end = datetime.datetime.max.date() - if self.start > self.end: - raise ValueError('Date range: "%s" , the start date must be 
before the end date' % self) - - @classmethod - def day(cls, day): - """Returns a range that only contains the given day""" - return cls(day, day) - - def __contains__(self, date): - """Check if the date is in the range""" - if not isinstance(date, datetime.date): - date = date_from_str(date) - return self.start <= date <= self.end - - def __str__(self): - return f'{self.start.isoformat()} - {self.end.isoformat()}' - - def __eq__(self, other): - return (isinstance(other, DateRange) - and self.start == other.start and self.end == other.end) - - -def platform_name(): - """ Returns the platform name as a str """ - deprecation_warning(f'"{__name__}.platform_name" is deprecated, use "platform.platform" instead') - return platform.platform() - - -@functools.cache -def system_identifier(): - python_implementation = platform.python_implementation() - if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'): - python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3] - libc_ver = [] - with contextlib.suppress(OSError): # We may not have access to the executable - libc_ver = platform.libc_ver() - - return 'Python %s (%s %s %s) - %s (%s%s)' % ( - platform.python_version(), - python_implementation, - platform.machine(), - platform.architecture()[0], - platform.platform(), - ssl.OPENSSL_VERSION, - format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'), - ) - - -@functools.cache -def get_windows_version(): - ''' Get Windows version. 
returns () if it's not running on Windows ''' - if compat_os_name == 'nt': - return version_tuple(platform.win32_ver()[1]) - else: - return () - - -def write_string(s, out=None, encoding=None): - assert isinstance(s, str) - out = out or sys.stderr - # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217) - if not out: - return - - if compat_os_name == 'nt' and supports_terminal_sequences(out): - s = re.sub(r'([\r\n]+)', r' \1', s) - - enc, buffer = None, out - if 'b' in getattr(out, 'mode', ''): - enc = encoding or preferredencoding() - elif hasattr(out, 'buffer'): - buffer = out.buffer - enc = encoding or getattr(out, 'encoding', None) or preferredencoding() - - buffer.write(s.encode(enc, 'ignore') if enc else s) - out.flush() - - -def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs): - from . import _IN_CLI - if _IN_CLI: - if msg in deprecation_warning._cache: - return - deprecation_warning._cache.add(msg) - if printer: - return printer(f'{msg}{bug_reports_message()}', **kwargs) - return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs) - else: - import warnings - warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3) - - -deprecation_warning._cache = set() - - -def bytes_to_intlist(bs): - if not bs: - return [] - if isinstance(bs[0], int): # Python 3 - return list(bs) - else: - return [ord(c) for c in bs] - - -def intlist_to_bytes(xs): - if not xs: - return b'' - return struct.pack('%dB' % len(xs), *xs) - - -class LockingUnsupportedError(OSError): - msg = 'File locking is not supported' - - def __init__(self): - super().__init__(self.msg) - - -# Cross-platform file locking -if sys.platform == 'win32': - import ctypes - import ctypes.wintypes - import msvcrt - - class OVERLAPPED(ctypes.Structure): - _fields_ = [ - ('Internal', ctypes.wintypes.LPVOID), - ('InternalHigh', ctypes.wintypes.LPVOID), - ('Offset', ctypes.wintypes.DWORD), - ('OffsetHigh', ctypes.wintypes.DWORD), - ('hEvent', 
ctypes.wintypes.HANDLE), - ] - - kernel32 = ctypes.WinDLL('kernel32') - LockFileEx = kernel32.LockFileEx - LockFileEx.argtypes = [ - ctypes.wintypes.HANDLE, # hFile - ctypes.wintypes.DWORD, # dwFlags - ctypes.wintypes.DWORD, # dwReserved - ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow - ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh - ctypes.POINTER(OVERLAPPED) # Overlapped - ] - LockFileEx.restype = ctypes.wintypes.BOOL - UnlockFileEx = kernel32.UnlockFileEx - UnlockFileEx.argtypes = [ - ctypes.wintypes.HANDLE, # hFile - ctypes.wintypes.DWORD, # dwReserved - ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow - ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh - ctypes.POINTER(OVERLAPPED) # Overlapped - ] - UnlockFileEx.restype = ctypes.wintypes.BOOL - whole_low = 0xffffffff - whole_high = 0x7fffffff - - def _lock_file(f, exclusive, block): - overlapped = OVERLAPPED() - overlapped.Offset = 0 - overlapped.OffsetHigh = 0 - overlapped.hEvent = 0 - f._lock_file_overlapped_p = ctypes.pointer(overlapped) - - if not LockFileEx(msvcrt.get_osfhandle(f.fileno()), - (0x2 if exclusive else 0x0) | (0x0 if block else 0x1), - 0, whole_low, whole_high, f._lock_file_overlapped_p): - # NB: No argument form of "ctypes.FormatError" does not work on PyPy - raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}') - - def _unlock_file(f): - assert f._lock_file_overlapped_p - handle = msvcrt.get_osfhandle(f.fileno()) - if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p): - raise OSError('Unlocking file failed: %r' % ctypes.FormatError()) - -else: - try: - import fcntl - - def _lock_file(f, exclusive, block): - flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH - if not block: - flags |= fcntl.LOCK_NB - try: - fcntl.flock(f, flags) - except BlockingIOError: - raise - except OSError: # AOSP does not have flock() - fcntl.lockf(f, flags) - - def _unlock_file(f): - with contextlib.suppress(OSError): - return fcntl.flock(f, 
fcntl.LOCK_UN) - with contextlib.suppress(OSError): - return fcntl.lockf(f, fcntl.LOCK_UN) # AOSP does not have flock() - return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB) # virtiofs needs LOCK_NB on unlocking - - except ImportError: - - def _lock_file(f, exclusive, block): - raise LockingUnsupportedError() - - def _unlock_file(f): - raise LockingUnsupportedError() - - -class locked_file: - locked = False - - def __init__(self, filename, mode, block=True, encoding=None): - if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}: - raise NotImplementedError(mode) - self.mode, self.block = mode, block - - writable = any(f in mode for f in 'wax+') - readable = any(f in mode for f in 'r+') - flags = functools.reduce(operator.ior, ( - getattr(os, 'O_CLOEXEC', 0), # UNIX only - getattr(os, 'O_BINARY', 0), # Windows only - getattr(os, 'O_NOINHERIT', 0), # Windows only - os.O_CREAT if writable else 0, # O_TRUNC only after locking - os.O_APPEND if 'a' in mode else 0, - os.O_EXCL if 'x' in mode else 0, - os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY, - )) - - self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding) - - def __enter__(self): - exclusive = 'r' not in self.mode - try: - _lock_file(self.f, exclusive, self.block) - self.locked = True - except OSError: - self.f.close() - raise - if 'w' in self.mode: - try: - self.f.truncate() - except OSError as e: - if e.errno not in ( - errno.ESPIPE, # Illegal seek - expected for FIFO - errno.EINVAL, # Invalid argument - expected for /dev/null - ): - raise - return self - - def unlock(self): - if not self.locked: - return - try: - _unlock_file(self.f) - finally: - self.locked = False - - def __exit__(self, *_): - try: - self.unlock() - finally: - self.f.close() - - open = __enter__ - close = __exit__ - - def __getattr__(self, attr): - return getattr(self.f, attr) - - def __iter__(self): - return iter(self.f) - - -@functools.cache -def get_filesystem_encoding(): - encoding = 
sys.getfilesystemencoding() - return encoding if encoding is not None else 'utf-8' - - -def shell_quote(args): - quoted_args = [] - encoding = get_filesystem_encoding() - for a in args: - if isinstance(a, bytes): - # We may get a filename encoded with 'encodeFilename' - a = a.decode(encoding) - quoted_args.append(compat_shlex_quote(a)) - return ' '.join(quoted_args) - - -def smuggle_url(url, data): - """ Pass additional data in a URL for internal use. """ - - url, idata = unsmuggle_url(url, {}) - data.update(idata) - sdata = urllib.parse.urlencode( - {'__youtubedl_smuggle': json.dumps(data)}) - return url + '#' + sdata - - -def unsmuggle_url(smug_url, default=None): - if '#__youtubedl_smuggle' not in smug_url: - return smug_url, default - url, _, sdata = smug_url.rpartition('#') - jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0] - data = json.loads(jsond) - return url, data - - -def format_decimal_suffix(num, fmt='%d%s', *, factor=1000): - """ Formats numbers with decimal sufixes like K, M, etc """ - num, factor = float_or_none(num), float(factor) - if num is None or num < 0: - return None - POSSIBLE_SUFFIXES = 'kMGTPEZY' - exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES)) - suffix = ['', *POSSIBLE_SUFFIXES][exponent] - if factor == 1024: - suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i') - converted = num / (factor ** exponent) - return fmt % (converted, suffix) - - -def format_bytes(bytes): - return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A' - - -def lookup_unit_table(unit_table, s, strict=False): - num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]') - units_re = '|'.join(re.escape(u) for u in unit_table) - m = (re.fullmatch if strict else re.match)( - rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s) - if not m: - return None - - num = float(m.group('num').replace(',', '.')) - mult = unit_table[m.group('unit')] - return round(num * mult) - - -def parse_bytes(s): - """Parse 
a string indicating a byte quantity into an integer""" - return lookup_unit_table( - {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])}, - s.upper(), strict=True) - - -def parse_filesize(s): - if s is None: - return None - - # The lower-case forms are of course incorrect and unofficial, - # but we support those too - _UNIT_TABLE = { - 'B': 1, - 'b': 1, - 'bytes': 1, - 'KiB': 1024, - 'KB': 1000, - 'kB': 1024, - 'Kb': 1000, - 'kb': 1000, - 'kilobytes': 1000, - 'kibibytes': 1024, - 'MiB': 1024 ** 2, - 'MB': 1000 ** 2, - 'mB': 1024 ** 2, - 'Mb': 1000 ** 2, - 'mb': 1000 ** 2, - 'megabytes': 1000 ** 2, - 'mebibytes': 1024 ** 2, - 'GiB': 1024 ** 3, - 'GB': 1000 ** 3, - 'gB': 1024 ** 3, - 'Gb': 1000 ** 3, - 'gb': 1000 ** 3, - 'gigabytes': 1000 ** 3, - 'gibibytes': 1024 ** 3, - 'TiB': 1024 ** 4, - 'TB': 1000 ** 4, - 'tB': 1024 ** 4, - 'Tb': 1000 ** 4, - 'tb': 1000 ** 4, - 'terabytes': 1000 ** 4, - 'tebibytes': 1024 ** 4, - 'PiB': 1024 ** 5, - 'PB': 1000 ** 5, - 'pB': 1024 ** 5, - 'Pb': 1000 ** 5, - 'pb': 1000 ** 5, - 'petabytes': 1000 ** 5, - 'pebibytes': 1024 ** 5, - 'EiB': 1024 ** 6, - 'EB': 1000 ** 6, - 'eB': 1024 ** 6, - 'Eb': 1000 ** 6, - 'eb': 1000 ** 6, - 'exabytes': 1000 ** 6, - 'exbibytes': 1024 ** 6, - 'ZiB': 1024 ** 7, - 'ZB': 1000 ** 7, - 'zB': 1024 ** 7, - 'Zb': 1000 ** 7, - 'zb': 1000 ** 7, - 'zettabytes': 1000 ** 7, - 'zebibytes': 1024 ** 7, - 'YiB': 1024 ** 8, - 'YB': 1000 ** 8, - 'yB': 1024 ** 8, - 'Yb': 1000 ** 8, - 'yb': 1000 ** 8, - 'yottabytes': 1000 ** 8, - 'yobibytes': 1024 ** 8, - } - - return lookup_unit_table(_UNIT_TABLE, s) - - -def parse_count(s): - if s is None: - return None - - s = re.sub(r'^[^\d]+\s', '', s).strip() - - if re.match(r'^[\d,.]+$', s): - return str_to_int(s) - - _UNIT_TABLE = { - 'k': 1000, - 'K': 1000, - 'm': 1000 ** 2, - 'M': 1000 ** 2, - 'kk': 1000 ** 2, - 'KK': 1000 ** 2, - 'b': 1000 ** 3, - 'B': 1000 ** 3, - } - - ret = lookup_unit_table(_UNIT_TABLE, s) - if ret is not None: - return ret - - mobj = 
re.match(r'([\d,.]+)(?:$|\s)', s) - if mobj: - return str_to_int(mobj.group(1)) - - -def parse_resolution(s, *, lenient=False): - if s is None: - return {} - - if lenient: - mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s) - else: - mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s) - if mobj: - return { - 'width': int(mobj.group('w')), - 'height': int(mobj.group('h')), - } - - mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s) - if mobj: - return {'height': int(mobj.group(1))} - - mobj = re.search(r'\b([48])[kK]\b', s) - if mobj: - return {'height': int(mobj.group(1)) * 540} - - return {} - - -def parse_bitrate(s): - if not isinstance(s, str): - return - mobj = re.search(r'\b(\d+)\s*kbps', s) - if mobj: - return int(mobj.group(1)) - - -def month_by_name(name, lang='en'): - """ Return the number of a month by (locale-independently) English name """ - - month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en']) - - try: - return month_names.index(name) + 1 - except ValueError: - return None - - -def month_by_abbreviation(abbrev): - """ Return the number of a month by (locale-independently) English - abbreviations """ - - try: - return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1 - except ValueError: - return None - - -def fix_xml_ampersands(xml_str): - """Replace all the '&' by '&' in XML""" - return re.sub( - r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)', - '&', - xml_str) - - -def setproctitle(title): - assert isinstance(title, str) - - # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541 - try: - import ctypes - except ImportError: - return - - try: - libc = ctypes.cdll.LoadLibrary('libc.so.6') - except OSError: - return - except TypeError: - # LoadLibrary in Windows Python 2.7.13 only expects - # a bytestring, but since unicode_literals turns - # every string into a unicode string, it fails. 
- return - title_bytes = title.encode() - buf = ctypes.create_string_buffer(len(title_bytes)) - buf.value = title_bytes - try: - libc.prctl(15, buf, 0, 0, 0) - except AttributeError: - return # Strange libc, just skip this - - -def remove_start(s, start): - return s[len(start):] if s is not None and s.startswith(start) else s - - -def remove_end(s, end): - return s[:-len(end)] if s is not None and s.endswith(end) else s - - -def remove_quotes(s): - if s is None or len(s) < 2: - return s - for quote in ('"', "'", ): - if s[0] == quote and s[-1] == quote: - return s[1:-1] - return s - - -def get_domain(url): - """ - This implementation is inconsistent, but is kept for compatibility. - Use this only for "webpage_url_domain" - """ - return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None - - -def url_basename(url): - path = urllib.parse.urlparse(url).path - return path.strip('/').split('/')[-1] - - -def base_url(url): - return re.match(r'https?://[^?#]+/', url).group() - - -def urljoin(base, path): - if isinstance(path, bytes): - path = path.decode() - if not isinstance(path, str) or not path: - return None - if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path): - return path - if isinstance(base, bytes): - base = base.decode() - if not isinstance(base, str) or not re.match( - r'^(?:https?:)?//', base): - return None - return urllib.parse.urljoin(base, path) - - -class HEADRequest(urllib.request.Request): - def get_method(self): - return 'HEAD' - - -class PUTRequest(urllib.request.Request): - def get_method(self): - return 'PUT' - - -def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): - if get_attr and v is not None: - v = getattr(v, get_attr, None) - try: - return int(v) * invscale // scale - except (ValueError, TypeError, OverflowError): - return default - - -def str_or_none(v, default=None): - return default if v is None else str(v) - - -def str_to_int(int_str): - """ A more relaxed version of int_or_none """ - if 
isinstance(int_str, int): - return int_str - elif isinstance(int_str, str): - int_str = re.sub(r'[,\.\+]', '', int_str) - return int_or_none(int_str) - - -def float_or_none(v, scale=1, invscale=1, default=None): - if v is None: - return default - try: - return float(v) * invscale / scale - except (ValueError, TypeError): - return default - - -def bool_or_none(v, default=None): - return v if isinstance(v, bool) else default - - -def strip_or_none(v, default=None): - return v.strip() if isinstance(v, str) else default - - -def url_or_none(url): - if not url or not isinstance(url, str): - return None - url = url.strip() - return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None - - -def request_to_url(req): - if isinstance(req, urllib.request.Request): - return req.get_full_url() - else: - return req - - -def strftime_or_none(timestamp, date_format, default=None): - datetime_object = None - try: - if isinstance(timestamp, (int, float)): # unix timestamp - # Using naive datetime here can break timestamp() in Windows - # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414 - datetime_object = datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc) - elif isinstance(timestamp, str): # assume YYYYMMDD - datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d') - date_format = re.sub( # Support %s on windows - r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format) - return datetime_object.strftime(date_format) - except (ValueError, TypeError, AttributeError): - return default - - -def parse_duration(s): - if not isinstance(s, str): - return None - s = s.strip() - if not s: - return None - - days, hours, mins, secs, ms = [None] * 5 - m = re.match(r'''(?x) - (?P<before_secs> - (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)? 
- (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+)) - (?P<ms>[.:][0-9]+)?Z?$ - ''', s) - if m: - days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms') - else: - m = re.match( - r'''(?ix)(?:P? - (?: - [0-9]+\s*y(?:ears?)?,?\s* - )? - (?: - [0-9]+\s*m(?:onths?)?,?\s* - )? - (?: - [0-9]+\s*w(?:eeks?)?,?\s* - )? - (?: - (?P<days>[0-9]+)\s*d(?:ays?)?,?\s* - )? - T)? - (?: - (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s* - )? - (?: - (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s* - )? - (?: - (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s* - )?Z?$''', s) - if m: - days, hours, mins, secs, ms = m.groups() - else: - m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s) - if m: - hours, mins = m.groups() - else: - return None - - if ms: - ms = ms.replace(':', '.') - return sum(float(part or 0) * mult for part, mult in ( - (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1))) - - -def prepend_extension(filename, ext, expected_real_ext=None): - name, real_ext = os.path.splitext(filename) - return ( - f'{name}.{ext}{real_ext}' - if not expected_real_ext or real_ext[1:] == expected_real_ext - else f'{filename}.{ext}') - - -def replace_extension(filename, ext, expected_real_ext=None): - name, real_ext = os.path.splitext(filename) - return '{}.{}'.format( - name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename, - ext) - - -def check_executable(exe, args=[]): - """ Checks if the given binary is installed somewhere in PATH, and returns its name. - args can be a list of arguments for a short output (like -version) """ - try: - Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - except OSError: - return False - return exe - - -def _get_exe_version_output(exe, args): - try: - # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers - # SIGTTOU if yt-dlp is run in the background. 
- # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656 - stdout, _, ret = Popen.run([encodeArgument(exe)] + args, text=True, - stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - if ret: - return None - except OSError: - return False - return stdout - - -def detect_exe_version(output, version_re=None, unrecognized='present'): - assert isinstance(output, str) - if version_re is None: - version_re = r'version\s+([-0-9._a-zA-Z]+)' - m = re.search(version_re, output) - if m: - return m.group(1) - else: - return unrecognized - - -def get_exe_version(exe, args=['--version'], - version_re=None, unrecognized=('present', 'broken')): - """ Returns the version of the specified executable, - or False if the executable is not present """ - unrecognized = variadic(unrecognized) - assert len(unrecognized) in (1, 2) - out = _get_exe_version_output(exe, args) - if out is None: - return unrecognized[-1] - return out and detect_exe_version(out, version_re, unrecognized[0]) - - -def frange(start=0, stop=None, step=1): - """Float range""" - if stop is None: - start, stop = 0, start - sign = [-1, 1][step > 0] if step else 0 - while sign * start < sign * stop: - yield start - start += step - - -class LazyList(collections.abc.Sequence): - """Lazy immutable list from an iterable - Note that slices of a LazyList are lists and not LazyList""" - - class IndexError(IndexError): - pass - - def __init__(self, iterable, *, reverse=False, _cache=None): - self._iterable = iter(iterable) - self._cache = [] if _cache is None else _cache - self._reversed = reverse - - def __iter__(self): - if self._reversed: - # We need to consume the entire iterable to iterate in reverse - yield from self.exhaust() - return - yield from self._cache - for item in self._iterable: - self._cache.append(item) - yield item - - def _exhaust(self): - self._cache.extend(self._iterable) - self._iterable = [] # Discard the emptied iterable to make it pickle-able - return 
self._cache - - def exhaust(self): - """Evaluate the entire iterable""" - return self._exhaust()[::-1 if self._reversed else 1] - - @staticmethod - def _reverse_index(x): - return None if x is None else ~x - - def __getitem__(self, idx): - if isinstance(idx, slice): - if self._reversed: - idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1)) - start, stop, step = idx.start, idx.stop, idx.step or 1 - elif isinstance(idx, int): - if self._reversed: - idx = self._reverse_index(idx) - start, stop, step = idx, idx, 0 - else: - raise TypeError('indices must be integers or slices') - if ((start or 0) < 0 or (stop or 0) < 0 - or (start is None and step < 0) - or (stop is None and step > 0)): - # We need to consume the entire iterable to be able to slice from the end - # Obviously, never use this with infinite iterables - self._exhaust() - try: - return self._cache[idx] - except IndexError as e: - raise self.IndexError(e) from e - n = max(start or 0, stop or 0) - len(self._cache) + 1 - if n > 0: - self._cache.extend(itertools.islice(self._iterable, n)) - try: - return self._cache[idx] - except IndexError as e: - raise self.IndexError(e) from e - - def __bool__(self): - try: - self[-1] if self._reversed else self[0] - except self.IndexError: - return False - return True - - def __len__(self): - self._exhaust() - return len(self._cache) - - def __reversed__(self): - return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache) - - def __copy__(self): - return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache) - - def __repr__(self): - # repr and str should mimic a list. 
So we exhaust the iterable - return repr(self.exhaust()) - - def __str__(self): - return repr(self.exhaust()) - - -class PagedList: - - class IndexError(IndexError): - pass - - def __len__(self): - # This is only useful for tests - return len(self.getslice()) - - def __init__(self, pagefunc, pagesize, use_cache=True): - self._pagefunc = pagefunc - self._pagesize = pagesize - self._pagecount = float('inf') - self._use_cache = use_cache - self._cache = {} - - def getpage(self, pagenum): - page_results = self._cache.get(pagenum) - if page_results is None: - page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum)) - if self._use_cache: - self._cache[pagenum] = page_results - return page_results - - def getslice(self, start=0, end=None): - return list(self._getslice(start, end)) - - def _getslice(self, start, end): - raise NotImplementedError('This method must be implemented by subclasses') - - def __getitem__(self, idx): - assert self._use_cache, 'Indexing PagedList requires cache' - if not isinstance(idx, int) or idx < 0: - raise TypeError('indices must be non-negative integers') - entries = self.getslice(idx, idx + 1) - if not entries: - raise self.IndexError() - return entries[0] - - -class OnDemandPagedList(PagedList): - """Download pages until a page with less than maximum results""" - - def _getslice(self, start, end): - for pagenum in itertools.count(start // self._pagesize): - firstid = pagenum * self._pagesize - nextfirstid = pagenum * self._pagesize + self._pagesize - if start >= nextfirstid: - continue - - startv = ( - start % self._pagesize - if firstid <= start < nextfirstid - else 0) - endv = ( - ((end - 1) % self._pagesize) + 1 - if (end is not None and firstid <= end <= nextfirstid) - else None) - - try: - page_results = self.getpage(pagenum) - except Exception: - self._pagecount = pagenum - 1 - raise - if startv != 0 or endv is not None: - page_results = page_results[startv:endv] - yield from page_results - - # A little 
optimization - if current page is not "full", ie. does - # not contain page_size videos then we can assume that this page - # is the last one - there are no more ids on further pages - - # i.e. no need to query again. - if len(page_results) + startv < self._pagesize: - break - - # If we got the whole page, but the next page is not interesting, - # break out early as well - if end == nextfirstid: - break - - -class InAdvancePagedList(PagedList): - """PagedList with total number of pages known in advance""" - - def __init__(self, pagefunc, pagecount, pagesize): - PagedList.__init__(self, pagefunc, pagesize, True) - self._pagecount = pagecount - - def _getslice(self, start, end): - start_page = start // self._pagesize - end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1) - skip_elems = start - start_page * self._pagesize - only_more = None if end is None else end - start - for pagenum in range(start_page, end_page): - page_results = self.getpage(pagenum) - if skip_elems: - page_results = page_results[skip_elems:] - skip_elems = None - if only_more is not None: - if len(page_results) < only_more: - only_more -= len(page_results) - else: - yield from page_results[:only_more] - break - yield from page_results - - -class PlaylistEntries: - MissingEntry = object() - is_exhausted = False - - def __init__(self, ydl, info_dict): - self.ydl = ydl - - # _entries must be assigned now since infodict can change during iteration - entries = info_dict.get('entries') - if entries is None: - raise EntryNotInPlaylist('There are no entries') - elif isinstance(entries, list): - self.is_exhausted = True - - requested_entries = info_dict.get('requested_entries') - self.is_incomplete = requested_entries is not None - if self.is_incomplete: - assert self.is_exhausted - self._entries = [self.MissingEntry] * max(requested_entries or [0]) - for i, entry in zip(requested_entries, entries): - self._entries[i - 1] = entry - elif isinstance(entries, (list, 
PagedList, LazyList)): - self._entries = entries - else: - self._entries = LazyList(entries) - - PLAYLIST_ITEMS_RE = re.compile(r'''(?x) - (?P<start>[+-]?\d+)? - (?P<range>[:-] - (?P<end>[+-]?\d+|inf(?:inite)?)? - (?::(?P<step>[+-]?\d+))? - )?''') - - @classmethod - def parse_playlist_items(cls, string): - for segment in string.split(','): - if not segment: - raise ValueError('There is two or more consecutive commas') - mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment) - if not mobj: - raise ValueError(f'{segment!r} is not a valid specification') - start, end, step, has_range = mobj.group('start', 'end', 'step', 'range') - if int_or_none(step) == 0: - raise ValueError(f'Step in {segment!r} cannot be zero') - yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start) - - def get_requested_items(self): - playlist_items = self.ydl.params.get('playlist_items') - playlist_start = self.ydl.params.get('playliststart', 1) - playlist_end = self.ydl.params.get('playlistend') - # For backwards compatibility, interpret -1 as whole list - if playlist_end in (-1, None): - playlist_end = '' - if not playlist_items: - playlist_items = f'{playlist_start}:{playlist_end}' - elif playlist_start != 1 or playlist_end: - self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True) - - for index in self.parse_playlist_items(playlist_items): - for i, entry in self[index]: - yield i, entry - if not entry: - continue - try: - # The item may have just been added to archive. 
Don't break due to it - if not self.ydl.params.get('lazy_playlist'): - # TODO: Add auto-generated fields - self.ydl._match_entry(entry, incomplete=True, silent=True) - except (ExistingVideoReached, RejectedVideoReached): - return - - def get_full_count(self): - if self.is_exhausted and not self.is_incomplete: - return len(self) - elif isinstance(self._entries, InAdvancePagedList): - if self._entries._pagesize == 1: - return self._entries._pagecount - - @functools.cached_property - def _getter(self): - if isinstance(self._entries, list): - def get_entry(i): - try: - entry = self._entries[i] - except IndexError: - entry = self.MissingEntry - if not self.is_incomplete: - raise self.IndexError() - if entry is self.MissingEntry: - raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found') - return entry - else: - def get_entry(i): - try: - return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i) - except (LazyList.IndexError, PagedList.IndexError): - raise self.IndexError() - return get_entry - - def __getitem__(self, idx): - if isinstance(idx, int): - idx = slice(idx, idx) - - # NB: PlaylistEntries[1:10] => (0, 1, ... 
9) - step = 1 if idx.step is None else idx.step - if idx.start is None: - start = 0 if step > 0 else len(self) - 1 - else: - start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start - - # NB: Do not call len(self) when idx == [:] - if idx.stop is None: - stop = 0 if step < 0 else float('inf') - else: - stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop - stop += [-1, 1][step > 0] - - for i in frange(start, stop, step): - if i < 0: - continue - try: - entry = self._getter(i) - except self.IndexError: - self.is_exhausted = True - if step > 0: - break - continue - yield i + 1, entry - - def __len__(self): - return len(tuple(self[:])) - - class IndexError(IndexError): - pass - - -def uppercase_escape(s): - unicode_escape = codecs.getdecoder('unicode_escape') - return re.sub( - r'\\U[0-9a-fA-F]{8}', - lambda m: unicode_escape(m.group(0))[0], - s) - - -def lowercase_escape(s): - unicode_escape = codecs.getdecoder('unicode_escape') - return re.sub( - r'\\u[0-9a-fA-F]{4}', - lambda m: unicode_escape(m.group(0))[0], - s) - - -def escape_rfc3986(s): - """Escape non-ASCII characters as suggested by RFC 3986""" - return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]") - - -def escape_url(url): - """Escape URL as suggested by RFC 3986""" - url_parsed = urllib.parse.urlparse(url) - return url_parsed._replace( - netloc=url_parsed.netloc.encode('idna').decode('ascii'), - path=escape_rfc3986(url_parsed.path), - params=escape_rfc3986(url_parsed.params), - query=escape_rfc3986(url_parsed.query), - fragment=escape_rfc3986(url_parsed.fragment) - ).geturl() - - -def parse_qs(url, **kwargs): - return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs) - - -def read_batch_urls(batch_fd): - def fixup(url): - if not isinstance(url, str): - url = url.decode('utf-8', 'replace') - BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff') - for bom in BOM_UTF8: - if url.startswith(bom): - url = url[len(bom):] - url = url.lstrip() - if not url or url.startswith(('#', ';', ']')): - 
return False - # "#" cannot be stripped out since it is part of the URI - # However, it can be safely stripped out if following a whitespace - return re.split(r'\s#', url, 1)[0].rstrip() - - with contextlib.closing(batch_fd) as fd: - return [url for url in map(fixup, fd) if url] - - -def urlencode_postdata(*args, **kargs): - return urllib.parse.urlencode(*args, **kargs).encode('ascii') - - -def update_url(url, *, query_update=None, **kwargs): - """Replace URL components specified by kwargs - @param url str or parse url tuple - @param query_update update query - @returns str - """ - if isinstance(url, str): - if not kwargs and not query_update: - return url - else: - url = urllib.parse.urlparse(url) - if query_update: - assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time' - kwargs['query'] = urllib.parse.urlencode({ - **urllib.parse.parse_qs(url.query), - **query_update - }, True) - return urllib.parse.urlunparse(url._replace(**kwargs)) - - -def update_url_query(url, query): - return update_url(url, query_update=query) - - -def update_Request(req, url=None, data=None, headers=None, query=None): - req_headers = req.headers.copy() - req_headers.update(headers or {}) - req_data = data or req.data - req_url = update_url_query(url or req.get_full_url(), query) - req_get_method = req.get_method() - if req_get_method == 'HEAD': - req_type = HEADRequest - elif req_get_method == 'PUT': - req_type = PUTRequest - else: - req_type = urllib.request.Request - new_req = req_type( - req_url, data=req_data, headers=req_headers, - origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) - if hasattr(req, 'timeout'): - new_req.timeout = req.timeout - return new_req - - -def _multipart_encode_impl(data, boundary): - content_type = 'multipart/form-data; boundary=%s' % boundary - - out = b'' - for k, v in data.items(): - out += b'--' + boundary.encode('ascii') + b'\r\n' - if isinstance(k, str): - k = k.encode() - if isinstance(v, str): - 
v = v.encode() - # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578 - # suggests sending UTF-8 directly. Firefox sends UTF-8, too - content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n' - if boundary.encode('ascii') in content: - raise ValueError('Boundary overlaps with data') - out += content - - out += b'--' + boundary.encode('ascii') + b'--\r\n' - - return out, content_type - - -def multipart_encode(data, boundary=None): - ''' - Encode a dict to RFC 7578-compliant form-data - - data: - A dict where keys and values can be either Unicode or bytes-like - objects. - boundary: - If specified a Unicode object, it's used as the boundary. Otherwise - a random boundary is generated. - - Reference: https://tools.ietf.org/html/rfc7578 - ''' - has_specified_boundary = boundary is not None - - while True: - if boundary is None: - boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff)) - - try: - out, content_type = _multipart_encode_impl(data, boundary) - break - except ValueError: - if has_specified_boundary: - raise - boundary = None - - return out, content_type - - -def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT): - if blocked_types is NO_DEFAULT: - blocked_types = (str, bytes, collections.abc.Mapping) - return isinstance(x, allowed_types) and not isinstance(x, blocked_types) - - -def variadic(x, allowed_types=NO_DEFAULT): - return x if is_iterable_like(x, blocked_types=allowed_types) else (x,) - - -def dict_get(d, key_or_keys, default=None, skip_false_values=True): - for val in map(d.get, variadic(key_or_keys)): - if val is not None and (val or not skip_false_values): - return val - return default - - -def try_call(*funcs, expected_type=None, args=[], kwargs={}): - for f in funcs: - try: - val = f(*args, **kwargs) - except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError): - pass - else: - if expected_type is None or 
isinstance(val, expected_type): - return val - - -def try_get(src, getter, expected_type=None): - return try_call(*variadic(getter), args=(src,), expected_type=expected_type) - - -def filter_dict(dct, cndn=lambda _, v: v is not None): - return {k: v for k, v in dct.items() if cndn(k, v)} - - -def merge_dicts(*dicts): - merged = {} - for a_dict in dicts: - for k, v in a_dict.items(): - if (v is not None and k not in merged - or isinstance(v, str) and merged[k] == ''): - merged[k] = v - return merged - - -def encode_compat_str(string, encoding=preferredencoding(), errors='strict'): - return string if isinstance(string, str) else str(string, encoding, errors) - - -US_RATINGS = { - 'G': 0, - 'PG': 10, - 'PG-13': 13, - 'R': 16, - 'NC': 18, -} - - -TV_PARENTAL_GUIDELINES = { - 'TV-Y': 0, - 'TV-Y7': 7, - 'TV-G': 0, - 'TV-PG': 0, - 'TV-14': 14, - 'TV-MA': 17, -} - - -def parse_age_limit(s): - # isinstance(False, int) is True. So type() must be used instead - if type(s) is int: # noqa: E721 - return s if 0 <= s <= 21 else None - elif not isinstance(s, str): - return None - m = re.match(r'^(?P<age>\d{1,2})\+?$', s) - if m: - return int(m.group('age')) - s = s.upper() - if s in US_RATINGS: - return US_RATINGS[s] - m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s) - if m: - return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)] - return None - - -def strip_jsonp(code): - return re.sub( - r'''(?sx)^ - (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*) - (?:\s*&&\s*(?P=func_name))? - \s*\(\s*(?P<callback_data>.*)\);? 
- \s*?(?://[^\n]*)*$''', - r'\g<callback_data>', code) - - -def js_to_json(code, vars={}, *, strict=False): - # vars is a dict of var, val pairs to substitute - STRING_QUOTES = '\'"`' - STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES) - COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n' - SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*' - INTEGER_TABLE = ( - (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16), - (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8), - ) - - def process_escape(match): - JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu' - escape = match.group(1) or match.group(2) - - return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES - else R'\u00' if escape == 'x' - else '' if escape == '\n' - else escape) - - def template_substitute(match): - evaluated = js_to_json(match.group(1), vars, strict=strict) - if evaluated[0] == '"': - return json.loads(evaluated) - return evaluated - - def fix_kv(m): - v = m.group(0) - if v in ('true', 'false', 'null'): - return v - elif v in ('undefined', 'void 0'): - return 'null' - elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',': - return '' - - if v[0] in STRING_QUOTES: - v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1] - escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v) - return f'"{escaped}"' - - for regex, base in INTEGER_TABLE: - im = re.match(regex, v) - if im: - i = int(im.group(1), base) - return f'"{i}":' if v.endswith(':') else str(i) - - if v in vars: - try: - if not strict: - json.loads(vars[v]) - except json.JSONDecodeError: - return json.dumps(vars[v]) - else: - return vars[v] - - if not strict: - return f'"{v}"' - - raise ValueError(f'Unknown value: {v}') - - def create_map(mobj): - return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars)))) - - code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code) - if not strict: - code = re.sub(r'new Date\((".+")\)', r'\g<1>', code) - code = re.sub(r'new \w+\((.*?)\)', lambda m: 
json.dumps(m.group(0)), code) - code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code) - code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code) - - return re.sub(rf'''(?sx) - {STRING_RE}| - {COMMENT_RE}|,(?={SKIP_RE}[\]}}])| - void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*| - \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?| - [0-9]+(?={SKIP_RE}:)| - !+ - ''', fix_kv, code) - - -def qualities(quality_ids): - """ Get a numeric quality value out of a list of possible values """ - def q(qid): - try: - return quality_ids.index(qid) - except ValueError: - return -1 - return q - - -POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist') - - -DEFAULT_OUTTMPL = { - 'default': '%(title)s [%(id)s].%(ext)s', - 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s', -} -OUTTMPL_TYPES = { - 'chapter': None, - 'subtitle': None, - 'thumbnail': None, - 'description': 'description', - 'annotation': 'annotations.xml', - 'infojson': 'info.json', - 'link': None, - 'pl_video': None, - 'pl_thumbnail': None, - 'pl_description': 'description', - 'pl_infojson': 'info.json', -} - -# As of [1] format syntax is: -# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type -# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting -STR_FORMAT_RE_TMPL = r'''(?x) - (?<!%)(?P<prefix>(?:%%)*) - % - (?P<has_key>\((?P<key>{0})\))? - (?P<format> - (?P<conversion>[#0\-+ ]+)? - (?P<min_width>\d+)? - (?P<precision>\.\d+)? - (?P<len_mod>[hlL])? # unused in python - {1} # conversion type - ) -''' - - -STR_FORMAT_TYPES = 'diouxXeEfFgGcrs' - - -def limit_length(s, length): - """ Add ellipses to overly long strings """ - if s is None: - return None - ELLIPSES = '...' 
- if len(s) > length: - return s[:length - len(ELLIPSES)] + ELLIPSES - return s - - -def version_tuple(v): - return tuple(int(e) for e in re.split(r'[-.]', v)) - - -def is_outdated_version(version, limit, assume_new=True): - if not version: - return not assume_new - try: - return version_tuple(version) < version_tuple(limit) - except ValueError: - return not assume_new - - -def ytdl_is_updateable(): - """ Returns if yt-dlp can be updated with -U """ - - from .update import is_non_updateable - - return not is_non_updateable() - - -def args_to_str(args): - # Get a short string representation for a subprocess command - return ' '.join(compat_shlex_quote(a) for a in args) - - -def error_to_compat_str(err): - return str(err) - - -def error_to_str(err): - return f'{type(err).__name__}: {err}' - - -def mimetype2ext(mt, default=NO_DEFAULT): - if not isinstance(mt, str): - if default is not NO_DEFAULT: - return default - return None - - MAP = { - # video - '3gpp': '3gp', - 'mp2t': 'ts', - 'mp4': 'mp4', - 'mpeg': 'mpeg', - 'mpegurl': 'm3u8', - 'quicktime': 'mov', - 'webm': 'webm', - 'vp9': 'vp9', - 'x-flv': 'flv', - 'x-m4v': 'm4v', - 'x-matroska': 'mkv', - 'x-mng': 'mng', - 'x-mp4-fragmented': 'mp4', - 'x-ms-asf': 'asf', - 'x-ms-wmv': 'wmv', - 'x-msvideo': 'avi', - - # application (streaming playlists) - 'dash+xml': 'mpd', - 'f4m+xml': 'f4m', - 'hds+xml': 'f4m', - 'vnd.apple.mpegurl': 'm3u8', - 'vnd.ms-sstr+xml': 'ism', - 'x-mpegurl': 'm3u8', - - # audio - 'audio/mp4': 'm4a', - # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. 
- # Using .mp3 as it's the most popular one - 'audio/mpeg': 'mp3', - 'audio/webm': 'webm', - 'audio/x-matroska': 'mka', - 'audio/x-mpegurl': 'm3u', - 'midi': 'mid', - 'ogg': 'ogg', - 'wav': 'wav', - 'wave': 'wav', - 'x-aac': 'aac', - 'x-flac': 'flac', - 'x-m4a': 'm4a', - 'x-realaudio': 'ra', - 'x-wav': 'wav', - - # image - 'avif': 'avif', - 'bmp': 'bmp', - 'gif': 'gif', - 'jpeg': 'jpg', - 'png': 'png', - 'svg+xml': 'svg', - 'tiff': 'tif', - 'vnd.wap.wbmp': 'wbmp', - 'webp': 'webp', - 'x-icon': 'ico', - 'x-jng': 'jng', - 'x-ms-bmp': 'bmp', - - # caption - 'filmstrip+json': 'fs', - 'smptett+xml': 'tt', - 'ttaf+xml': 'dfxp', - 'ttml+xml': 'ttml', - 'x-ms-sami': 'sami', - - # misc - 'gzip': 'gz', - 'json': 'json', - 'xml': 'xml', - 'zip': 'zip', - } - - mimetype = mt.partition(';')[0].strip().lower() - _, _, subtype = mimetype.rpartition('/') - - ext = traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1]) - if ext: - return ext - elif default is not NO_DEFAULT: - return default - return subtype.replace('+', '.') - - -def ext2mimetype(ext_or_url): - if not ext_or_url: - return None - if '.' 
not in ext_or_url: - ext_or_url = f'file.{ext_or_url}' - return mimetypes.guess_type(ext_or_url)[0] - - -def parse_codecs(codecs_str): - # http://tools.ietf.org/html/rfc6381 - if not codecs_str: - return {} - split_codecs = list(filter(None, map( - str.strip, codecs_str.strip().strip(',').split(',')))) - vcodec, acodec, scodec, hdr = None, None, None, None - for full_codec in split_codecs: - parts = re.sub(r'0+(?=\d)', '', full_codec).split('.') - if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', - 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'): - if vcodec: - continue - vcodec = full_codec - if parts[0] in ('dvh1', 'dvhe'): - hdr = 'DV' - elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10': - hdr = 'HDR10' - elif parts[:2] == ['vp9', '2']: - hdr = 'HDR10' - elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4', - 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'): - acodec = acodec or full_codec - elif parts[0] in ('stpp', 'wvtt'): - scodec = scodec or full_codec - else: - write_string(f'WARNING: Unknown codec {full_codec}\n') - if vcodec or acodec or scodec: - return { - 'vcodec': vcodec or 'none', - 'acodec': acodec or 'none', - 'dynamic_range': hdr, - **({'scodec': scodec} if scodec is not None else {}), - } - elif len(split_codecs) == 2: - return { - 'vcodec': split_codecs[0], - 'acodec': split_codecs[1], - } - return {} - - -def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None): - assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts) - - allow_mkv = not preferences or 'mkv' in preferences - - if allow_mkv and max(len(acodecs), len(vcodecs)) > 1: - return 'mkv' # TODO: any other format allows this? 
- - # TODO: All codecs supported by parse_codecs isn't handled here - COMPATIBLE_CODECS = { - 'mp4': { - 'av1', 'hevc', 'avc1', 'mp4a', 'ac-4', # fourcc (m3u8, mpd) - 'h264', 'aacl', 'ec-3', # Set in ISM - }, - 'webm': { - 'av1', 'vp9', 'vp8', 'opus', 'vrbs', - 'vp9x', 'vp8x', # in the webm spec - }, - } - - sanitize_codec = functools.partial( - try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower()) - vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs) - - for ext in preferences or COMPATIBLE_CODECS.keys(): - codec_set = COMPATIBLE_CODECS.get(ext, set()) - if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)): - return ext - - COMPATIBLE_EXTS = ( - {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'}, - {'webm', 'weba'}, - ) - for ext in preferences or vexts: - current_exts = {ext, *vexts, *aexts} - if ext == 'mkv' or current_exts == {ext} or any( - ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS): - return ext - return 'mkv' if allow_mkv else preferences[-1] - - -def urlhandle_detect_ext(url_handle, default=NO_DEFAULT): - getheader = url_handle.headers.get - - cd = getheader('Content-Disposition') - if cd: - m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd) - if m: - e = determine_ext(m.group('filename'), default_ext=None) - if e: - return e - - meta_ext = getheader('x-amz-meta-name') - if meta_ext: - e = meta_ext.rpartition('.')[2] - if e: - return e - - return mimetype2ext(getheader('Content-Type'), default=default) - - -def encode_data_uri(data, mime_type): - return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii')) - - -def age_restricted(content_limit, age_limit): - """ Returns True iff the content should be blocked """ - - if age_limit is None: # No limit set - return False - if content_limit is None: - return False # Content available for everyone - return age_limit < content_limit - - -# List of known byte-order-marks (BOM) -BOMS = [ - 
(b'\xef\xbb\xbf', 'utf-8'), - (b'\x00\x00\xfe\xff', 'utf-32-be'), - (b'\xff\xfe\x00\x00', 'utf-32-le'), - (b'\xff\xfe', 'utf-16-le'), - (b'\xfe\xff', 'utf-16-be'), -] - - -def is_html(first_bytes): - """ Detect whether a file contains HTML by examining its first bytes. """ - - encoding = 'utf-8' - for bom, enc in BOMS: - while first_bytes.startswith(bom): - encoding, first_bytes = enc, first_bytes[len(bom):] - - return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace')) - - -def determine_protocol(info_dict): - protocol = info_dict.get('protocol') - if protocol is not None: - return protocol - - url = sanitize_url(info_dict['url']) - if url.startswith('rtmp'): - return 'rtmp' - elif url.startswith('mms'): - return 'mms' - elif url.startswith('rtsp'): - return 'rtsp' - - ext = determine_ext(url) - if ext == 'm3u8': - return 'm3u8' if info_dict.get('is_live') else 'm3u8_native' - elif ext == 'f4m': - return 'f4m' - - return urllib.parse.urlparse(url).scheme - - -def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False): - """ Render a list of rows, each as a list of values. 
- Text after a \t will be right aligned """ - def width(string): - return len(remove_terminal_sequences(string).replace('\t', '')) - - def get_max_lens(table): - return [max(width(str(v)) for v in col) for col in zip(*table)] - - def filter_using_list(row, filterArray): - return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take] - - max_lens = get_max_lens(data) if hide_empty else [] - header_row = filter_using_list(header_row, max_lens) - data = [filter_using_list(row, max_lens) for row in data] - - table = [header_row] + data - max_lens = get_max_lens(table) - extra_gap += 1 - if delim: - table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data - table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter - for row in table: - for pos, text in enumerate(map(str, row)): - if '\t' in text: - row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap - else: - row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap) - ret = '\n'.join(''.join(row).rstrip() for row in table) - return ret - - -def _match_one(filter_part, dct, incomplete): - # TODO: Generalize code with YoutubeDL._build_format_filter - STRING_OPERATORS = { - '*=': operator.contains, - '^=': lambda attr, value: attr.startswith(value), - '$=': lambda attr, value: attr.endswith(value), - '~=': lambda attr, value: re.search(value, attr), - } - COMPARISON_OPERATORS = { - **STRING_OPERATORS, - '<=': operator.le, # "<=" must be defined above "<" - '<': operator.lt, - '>=': operator.ge, - '>': operator.gt, - '=': operator.eq, - } - - if isinstance(incomplete, bool): - is_incomplete = lambda _: incomplete - else: - is_incomplete = lambda k: k in incomplete - - operator_rex = re.compile(r'''(?x) - (?P<key>[a-z_]+) - \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s* - (?: - (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)| - (?P<strval>.+?) 
- ) - ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys()))) - m = operator_rex.fullmatch(filter_part.strip()) - if m: - m = m.groupdict() - unnegated_op = COMPARISON_OPERATORS[m['op']] - if m['negation']: - op = lambda attr, value: not unnegated_op(attr, value) - else: - op = unnegated_op - comparison_value = m['quotedstrval'] or m['strval'] or m['intval'] - if m['quote']: - comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote']) - actual_value = dct.get(m['key']) - numeric_comparison = None - if isinstance(actual_value, (int, float)): - # If the original field is a string and matching comparisonvalue is - # a number we should respect the origin of the original field - # and process comparison value as a string (see - # https://github.com/ytdl-org/youtube-dl/issues/11082) - try: - numeric_comparison = int(comparison_value) - except ValueError: - numeric_comparison = parse_filesize(comparison_value) - if numeric_comparison is None: - numeric_comparison = parse_filesize(f'{comparison_value}B') - if numeric_comparison is None: - numeric_comparison = parse_duration(comparison_value) - if numeric_comparison is not None and m['op'] in STRING_OPERATORS: - raise ValueError('Operator %s only supports string values!' 
% m['op']) - if actual_value is None: - return is_incomplete(m['key']) or m['none_inclusive'] - return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison) - - UNARY_OPERATORS = { - '': lambda v: (v is True) if isinstance(v, bool) else (v is not None), - '!': lambda v: (v is False) if isinstance(v, bool) else (v is None), - } - operator_rex = re.compile(r'''(?x) - (?P<op>%s)\s*(?P<key>[a-z_]+) - ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys()))) - m = operator_rex.fullmatch(filter_part.strip()) - if m: - op = UNARY_OPERATORS[m.group('op')] - actual_value = dct.get(m.group('key')) - if is_incomplete(m.group('key')) and actual_value is None: - return True - return op(actual_value) - - raise ValueError('Invalid filter part %r' % filter_part) - - -def match_str(filter_str, dct, incomplete=False): - """ Filter a dictionary with a simple string syntax. - @returns Whether the filter passes - @param incomplete Set of keys that is expected to be missing from dct. - Can be True/False to indicate all/none of the keys may be missing. 
- All conditions on incomplete keys pass if the key is missing - """ - return all( - _match_one(filter_part.replace(r'\&', '&'), dct, incomplete) - for filter_part in re.split(r'(?<!\\)&', filter_str)) - - -def match_filter_func(filters, breaking_filters=None): - if not filters and not breaking_filters: - return None - breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None) - filters = set(variadic(filters or [])) - - interactive = '-' in filters - if interactive: - filters.remove('-') - - def _match_func(info_dict, incomplete=False): - ret = breaking_filters(info_dict, incomplete) - if ret is not None: - raise RejectedVideoReached(ret) - - if not filters or any(match_str(f, info_dict, incomplete) for f in filters): - return NO_DEFAULT if interactive and not incomplete else None - else: - video_title = info_dict.get('title') or info_dict.get('id') or 'entry' - filter_str = ') | ('.join(map(str.strip, filters)) - return f'{video_title} does not pass filter ({filter_str}), skipping ..' 
- return _match_func - - -class download_range_func: - def __init__(self, chapters, ranges): - self.chapters, self.ranges = chapters, ranges - - def __call__(self, info_dict, ydl): - if not self.ranges and not self.chapters: - yield {} - - warning = ('There are no chapters matching the regex' if info_dict.get('chapters') - else 'Cannot match chapters since chapter information is unavailable') - for regex in self.chapters or []: - for i, chapter in enumerate(info_dict.get('chapters') or []): - if re.search(regex, chapter['title']): - warning = None - yield {**chapter, 'index': i} - if self.chapters and warning: - ydl.to_screen(f'[info] {info_dict["id"]}: {warning}') - - yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or []) - - def __eq__(self, other): - return (isinstance(other, download_range_func) - and self.chapters == other.chapters and self.ranges == other.ranges) - - def __repr__(self): - return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})' - - -def parse_dfxp_time_expr(time_expr): - if not time_expr: - return - - mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr) - if mobj: - return float(mobj.group('time_offset')) - - mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr) - if mobj: - return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.')) - - -def srt_subtitles_timecode(seconds): - return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000) - - -def ass_subtitles_timecode(seconds): - time = timetuple_from_msec(seconds * 1000) - return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10) - - -def dfxp2srt(dfxp_data): - ''' - @param dfxp_data A bytes-like object containing DFXP data - @returns A unicode object containing converted SRT data - ''' - LEGACY_NAMESPACES = ( - (b'http://www.w3.org/ns/ttml', [ - b'http://www.w3.org/2004/11/ttaf1', - b'http://www.w3.org/2006/04/ttaf1', - b'http://www.w3.org/2006/10/ttaf1', - 
]), - (b'http://www.w3.org/ns/ttml#styling', [ - b'http://www.w3.org/ns/ttml#style', - ]), - ) - - SUPPORTED_STYLING = [ - 'color', - 'fontFamily', - 'fontSize', - 'fontStyle', - 'fontWeight', - 'textDecoration' - ] - - _x = functools.partial(xpath_with_ns, ns_map={ - 'xml': 'http://www.w3.org/XML/1998/namespace', - 'ttml': 'http://www.w3.org/ns/ttml', - 'tts': 'http://www.w3.org/ns/ttml#styling', - }) - - styles = {} - default_style = {} - - class TTMLPElementParser: - _out = '' - _unclosed_elements = [] - _applied_styles = [] - - def start(self, tag, attrib): - if tag in (_x('ttml:br'), 'br'): - self._out += '\n' - else: - unclosed_elements = [] - style = {} - element_style_id = attrib.get('style') - if default_style: - style.update(default_style) - if element_style_id: - style.update(styles.get(element_style_id, {})) - for prop in SUPPORTED_STYLING: - prop_val = attrib.get(_x('tts:' + prop)) - if prop_val: - style[prop] = prop_val - if style: - font = '' - for k, v in sorted(style.items()): - if self._applied_styles and self._applied_styles[-1].get(k) == v: - continue - if k == 'color': - font += ' color="%s"' % v - elif k == 'fontSize': - font += ' size="%s"' % v - elif k == 'fontFamily': - font += ' face="%s"' % v - elif k == 'fontWeight' and v == 'bold': - self._out += '<b>' - unclosed_elements.append('b') - elif k == 'fontStyle' and v == 'italic': - self._out += '<i>' - unclosed_elements.append('i') - elif k == 'textDecoration' and v == 'underline': - self._out += '<u>' - unclosed_elements.append('u') - if font: - self._out += '<font' + font + '>' - unclosed_elements.append('font') - applied_style = {} - if self._applied_styles: - applied_style.update(self._applied_styles[-1]) - applied_style.update(style) - self._applied_styles.append(applied_style) - self._unclosed_elements.append(unclosed_elements) - - def end(self, tag): - if tag not in (_x('ttml:br'), 'br'): - unclosed_elements = self._unclosed_elements.pop() - for element in 
reversed(unclosed_elements): - self._out += '</%s>' % element - if unclosed_elements and self._applied_styles: - self._applied_styles.pop() - - def data(self, data): - self._out += data - - def close(self): - return self._out.strip() - - # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870 - # This will not trigger false positives since only UTF-8 text is being replaced - dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'') - - def parse_node(node): - target = TTMLPElementParser() - parser = xml.etree.ElementTree.XMLParser(target=target) - parser.feed(xml.etree.ElementTree.tostring(node)) - return parser.close() - - for k, v in LEGACY_NAMESPACES: - for ns in v: - dfxp_data = dfxp_data.replace(ns, k) - - dfxp = compat_etree_fromstring(dfxp_data) - out = [] - paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p') - - if not paras: - raise ValueError('Invalid dfxp/TTML subtitle') - - repeat = False - while True: - for style in dfxp.findall(_x('.//ttml:style')): - style_id = style.get('id') or style.get(_x('xml:id')) - if not style_id: - continue - parent_style_id = style.get('style') - if parent_style_id: - if parent_style_id not in styles: - repeat = True - continue - styles[style_id] = styles[parent_style_id].copy() - for prop in SUPPORTED_STYLING: - prop_val = style.get(_x('tts:' + prop)) - if prop_val: - styles.setdefault(style_id, {})[prop] = prop_val - if repeat: - repeat = False - else: - break - - for p in ('body', 'div'): - ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p]) - if ele is None: - continue - style = styles.get(ele.get('style')) - if not style: - continue - default_style.update(style) - - for para, index in zip(paras, itertools.count(1)): - begin_time = parse_dfxp_time_expr(para.attrib.get('begin')) - end_time = parse_dfxp_time_expr(para.attrib.get('end')) - dur = parse_dfxp_time_expr(para.attrib.get('dur')) - if begin_time is None: - 
continue - if not end_time: - if not dur: - continue - end_time = begin_time + dur - out.append('%d\n%s --> %s\n%s\n\n' % ( - index, - srt_subtitles_timecode(begin_time), - srt_subtitles_timecode(end_time), - parse_node(para))) - - return ''.join(out) - - -def cli_option(params, command_option, param, separator=None): - param = params.get(param) - return ([] if param is None - else [command_option, str(param)] if separator is None - else [f'{command_option}{separator}{param}']) - - -def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None): - param = params.get(param) - assert param in (True, False, None) - return cli_option({True: true_value, False: false_value}, command_option, param, separator) - - -def cli_valueless_option(params, command_option, param, expected_value=True): - return [command_option] if params.get(param) == expected_value else [] - - -def cli_configuration_args(argdict, keys, default=[], use_compat=True): - if isinstance(argdict, (list, tuple)): # for backward compatibility - if use_compat: - return argdict - else: - argdict = None - if argdict is None: - return default - assert isinstance(argdict, dict) - - assert isinstance(keys, (list, tuple)) - for key_list in keys: - arg_list = list(filter( - lambda x: x is not None, - [argdict.get(key.lower()) for key in variadic(key_list)])) - if arg_list: - return [arg for args in arg_list for arg in args] - return default - - -def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True): - main_key, exe = main_key.lower(), exe.lower() - root_key = exe if main_key == exe else f'{main_key}+{exe}' - keys = [f'{root_key}{k}' for k in (keys or [''])] - if root_key in keys: - if main_key != exe: - keys.append((main_key, exe)) - keys.append('default') - else: - use_compat = False - return cli_configuration_args(argdict, keys, default, use_compat) - - -class ISO639Utils: - # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt 
- _lang_map = { - 'aa': 'aar', - 'ab': 'abk', - 'ae': 'ave', - 'af': 'afr', - 'ak': 'aka', - 'am': 'amh', - 'an': 'arg', - 'ar': 'ara', - 'as': 'asm', - 'av': 'ava', - 'ay': 'aym', - 'az': 'aze', - 'ba': 'bak', - 'be': 'bel', - 'bg': 'bul', - 'bh': 'bih', - 'bi': 'bis', - 'bm': 'bam', - 'bn': 'ben', - 'bo': 'bod', - 'br': 'bre', - 'bs': 'bos', - 'ca': 'cat', - 'ce': 'che', - 'ch': 'cha', - 'co': 'cos', - 'cr': 'cre', - 'cs': 'ces', - 'cu': 'chu', - 'cv': 'chv', - 'cy': 'cym', - 'da': 'dan', - 'de': 'deu', - 'dv': 'div', - 'dz': 'dzo', - 'ee': 'ewe', - 'el': 'ell', - 'en': 'eng', - 'eo': 'epo', - 'es': 'spa', - 'et': 'est', - 'eu': 'eus', - 'fa': 'fas', - 'ff': 'ful', - 'fi': 'fin', - 'fj': 'fij', - 'fo': 'fao', - 'fr': 'fra', - 'fy': 'fry', - 'ga': 'gle', - 'gd': 'gla', - 'gl': 'glg', - 'gn': 'grn', - 'gu': 'guj', - 'gv': 'glv', - 'ha': 'hau', - 'he': 'heb', - 'iw': 'heb', # Replaced by he in 1989 revision - 'hi': 'hin', - 'ho': 'hmo', - 'hr': 'hrv', - 'ht': 'hat', - 'hu': 'hun', - 'hy': 'hye', - 'hz': 'her', - 'ia': 'ina', - 'id': 'ind', - 'in': 'ind', # Replaced by id in 1989 revision - 'ie': 'ile', - 'ig': 'ibo', - 'ii': 'iii', - 'ik': 'ipk', - 'io': 'ido', - 'is': 'isl', - 'it': 'ita', - 'iu': 'iku', - 'ja': 'jpn', - 'jv': 'jav', - 'ka': 'kat', - 'kg': 'kon', - 'ki': 'kik', - 'kj': 'kua', - 'kk': 'kaz', - 'kl': 'kal', - 'km': 'khm', - 'kn': 'kan', - 'ko': 'kor', - 'kr': 'kau', - 'ks': 'kas', - 'ku': 'kur', - 'kv': 'kom', - 'kw': 'cor', - 'ky': 'kir', - 'la': 'lat', - 'lb': 'ltz', - 'lg': 'lug', - 'li': 'lim', - 'ln': 'lin', - 'lo': 'lao', - 'lt': 'lit', - 'lu': 'lub', - 'lv': 'lav', - 'mg': 'mlg', - 'mh': 'mah', - 'mi': 'mri', - 'mk': 'mkd', - 'ml': 'mal', - 'mn': 'mon', - 'mr': 'mar', - 'ms': 'msa', - 'mt': 'mlt', - 'my': 'mya', - 'na': 'nau', - 'nb': 'nob', - 'nd': 'nde', - 'ne': 'nep', - 'ng': 'ndo', - 'nl': 'nld', - 'nn': 'nno', - 'no': 'nor', - 'nr': 'nbl', - 'nv': 'nav', - 'ny': 'nya', - 'oc': 'oci', - 'oj': 'oji', - 'om': 'orm', - 'or': 'ori', - 'os': 
'oss', - 'pa': 'pan', - 'pi': 'pli', - 'pl': 'pol', - 'ps': 'pus', - 'pt': 'por', - 'qu': 'que', - 'rm': 'roh', - 'rn': 'run', - 'ro': 'ron', - 'ru': 'rus', - 'rw': 'kin', - 'sa': 'san', - 'sc': 'srd', - 'sd': 'snd', - 'se': 'sme', - 'sg': 'sag', - 'si': 'sin', - 'sk': 'slk', - 'sl': 'slv', - 'sm': 'smo', - 'sn': 'sna', - 'so': 'som', - 'sq': 'sqi', - 'sr': 'srp', - 'ss': 'ssw', - 'st': 'sot', - 'su': 'sun', - 'sv': 'swe', - 'sw': 'swa', - 'ta': 'tam', - 'te': 'tel', - 'tg': 'tgk', - 'th': 'tha', - 'ti': 'tir', - 'tk': 'tuk', - 'tl': 'tgl', - 'tn': 'tsn', - 'to': 'ton', - 'tr': 'tur', - 'ts': 'tso', - 'tt': 'tat', - 'tw': 'twi', - 'ty': 'tah', - 'ug': 'uig', - 'uk': 'ukr', - 'ur': 'urd', - 'uz': 'uzb', - 've': 'ven', - 'vi': 'vie', - 'vo': 'vol', - 'wa': 'wln', - 'wo': 'wol', - 'xh': 'xho', - 'yi': 'yid', - 'ji': 'yid', # Replaced by yi in 1989 revision - 'yo': 'yor', - 'za': 'zha', - 'zh': 'zho', - 'zu': 'zul', - } - - @classmethod - def short2long(cls, code): - """Convert language code from ISO 639-1 to ISO 639-2/T""" - return cls._lang_map.get(code[:2]) - - @classmethod - def long2short(cls, code): - """Convert language code from ISO 639-2/T to ISO 639-1""" - for short_name, long_name in cls._lang_map.items(): - if long_name == code: - return short_name - - -class ISO3166Utils: - # From http://data.okfn.org/data/core/country-list - _country_map = { - 'AF': 'Afghanistan', - 'AX': 'Åland Islands', - 'AL': 'Albania', - 'DZ': 'Algeria', - 'AS': 'American Samoa', - 'AD': 'Andorra', - 'AO': 'Angola', - 'AI': 'Anguilla', - 'AQ': 'Antarctica', - 'AG': 'Antigua and Barbuda', - 'AR': 'Argentina', - 'AM': 'Armenia', - 'AW': 'Aruba', - 'AU': 'Australia', - 'AT': 'Austria', - 'AZ': 'Azerbaijan', - 'BS': 'Bahamas', - 'BH': 'Bahrain', - 'BD': 'Bangladesh', - 'BB': 'Barbados', - 'BY': 'Belarus', - 'BE': 'Belgium', - 'BZ': 'Belize', - 'BJ': 'Benin', - 'BM': 'Bermuda', - 'BT': 'Bhutan', - 'BO': 'Bolivia, Plurinational State of', - 'BQ': 'Bonaire, Sint Eustatius and Saba', - 'BA': 
'Bosnia and Herzegovina', - 'BW': 'Botswana', - 'BV': 'Bouvet Island', - 'BR': 'Brazil', - 'IO': 'British Indian Ocean Territory', - 'BN': 'Brunei Darussalam', - 'BG': 'Bulgaria', - 'BF': 'Burkina Faso', - 'BI': 'Burundi', - 'KH': 'Cambodia', - 'CM': 'Cameroon', - 'CA': 'Canada', - 'CV': 'Cape Verde', - 'KY': 'Cayman Islands', - 'CF': 'Central African Republic', - 'TD': 'Chad', - 'CL': 'Chile', - 'CN': 'China', - 'CX': 'Christmas Island', - 'CC': 'Cocos (Keeling) Islands', - 'CO': 'Colombia', - 'KM': 'Comoros', - 'CG': 'Congo', - 'CD': 'Congo, the Democratic Republic of the', - 'CK': 'Cook Islands', - 'CR': 'Costa Rica', - 'CI': 'Côte d\'Ivoire', - 'HR': 'Croatia', - 'CU': 'Cuba', - 'CW': 'Curaçao', - 'CY': 'Cyprus', - 'CZ': 'Czech Republic', - 'DK': 'Denmark', - 'DJ': 'Djibouti', - 'DM': 'Dominica', - 'DO': 'Dominican Republic', - 'EC': 'Ecuador', - 'EG': 'Egypt', - 'SV': 'El Salvador', - 'GQ': 'Equatorial Guinea', - 'ER': 'Eritrea', - 'EE': 'Estonia', - 'ET': 'Ethiopia', - 'FK': 'Falkland Islands (Malvinas)', - 'FO': 'Faroe Islands', - 'FJ': 'Fiji', - 'FI': 'Finland', - 'FR': 'France', - 'GF': 'French Guiana', - 'PF': 'French Polynesia', - 'TF': 'French Southern Territories', - 'GA': 'Gabon', - 'GM': 'Gambia', - 'GE': 'Georgia', - 'DE': 'Germany', - 'GH': 'Ghana', - 'GI': 'Gibraltar', - 'GR': 'Greece', - 'GL': 'Greenland', - 'GD': 'Grenada', - 'GP': 'Guadeloupe', - 'GU': 'Guam', - 'GT': 'Guatemala', - 'GG': 'Guernsey', - 'GN': 'Guinea', - 'GW': 'Guinea-Bissau', - 'GY': 'Guyana', - 'HT': 'Haiti', - 'HM': 'Heard Island and McDonald Islands', - 'VA': 'Holy See (Vatican City State)', - 'HN': 'Honduras', - 'HK': 'Hong Kong', - 'HU': 'Hungary', - 'IS': 'Iceland', - 'IN': 'India', - 'ID': 'Indonesia', - 'IR': 'Iran, Islamic Republic of', - 'IQ': 'Iraq', - 'IE': 'Ireland', - 'IM': 'Isle of Man', - 'IL': 'Israel', - 'IT': 'Italy', - 'JM': 'Jamaica', - 'JP': 'Japan', - 'JE': 'Jersey', - 'JO': 'Jordan', - 'KZ': 'Kazakhstan', - 'KE': 'Kenya', - 'KI': 'Kiribati', - 'KP': 
'Korea, Democratic People\'s Republic of', - 'KR': 'Korea, Republic of', - 'KW': 'Kuwait', - 'KG': 'Kyrgyzstan', - 'LA': 'Lao People\'s Democratic Republic', - 'LV': 'Latvia', - 'LB': 'Lebanon', - 'LS': 'Lesotho', - 'LR': 'Liberia', - 'LY': 'Libya', - 'LI': 'Liechtenstein', - 'LT': 'Lithuania', - 'LU': 'Luxembourg', - 'MO': 'Macao', - 'MK': 'Macedonia, the Former Yugoslav Republic of', - 'MG': 'Madagascar', - 'MW': 'Malawi', - 'MY': 'Malaysia', - 'MV': 'Maldives', - 'ML': 'Mali', - 'MT': 'Malta', - 'MH': 'Marshall Islands', - 'MQ': 'Martinique', - 'MR': 'Mauritania', - 'MU': 'Mauritius', - 'YT': 'Mayotte', - 'MX': 'Mexico', - 'FM': 'Micronesia, Federated States of', - 'MD': 'Moldova, Republic of', - 'MC': 'Monaco', - 'MN': 'Mongolia', - 'ME': 'Montenegro', - 'MS': 'Montserrat', - 'MA': 'Morocco', - 'MZ': 'Mozambique', - 'MM': 'Myanmar', - 'NA': 'Namibia', - 'NR': 'Nauru', - 'NP': 'Nepal', - 'NL': 'Netherlands', - 'NC': 'New Caledonia', - 'NZ': 'New Zealand', - 'NI': 'Nicaragua', - 'NE': 'Niger', - 'NG': 'Nigeria', - 'NU': 'Niue', - 'NF': 'Norfolk Island', - 'MP': 'Northern Mariana Islands', - 'NO': 'Norway', - 'OM': 'Oman', - 'PK': 'Pakistan', - 'PW': 'Palau', - 'PS': 'Palestine, State of', - 'PA': 'Panama', - 'PG': 'Papua New Guinea', - 'PY': 'Paraguay', - 'PE': 'Peru', - 'PH': 'Philippines', - 'PN': 'Pitcairn', - 'PL': 'Poland', - 'PT': 'Portugal', - 'PR': 'Puerto Rico', - 'QA': 'Qatar', - 'RE': 'Réunion', - 'RO': 'Romania', - 'RU': 'Russian Federation', - 'RW': 'Rwanda', - 'BL': 'Saint Barthélemy', - 'SH': 'Saint Helena, Ascension and Tristan da Cunha', - 'KN': 'Saint Kitts and Nevis', - 'LC': 'Saint Lucia', - 'MF': 'Saint Martin (French part)', - 'PM': 'Saint Pierre and Miquelon', - 'VC': 'Saint Vincent and the Grenadines', - 'WS': 'Samoa', - 'SM': 'San Marino', - 'ST': 'Sao Tome and Principe', - 'SA': 'Saudi Arabia', - 'SN': 'Senegal', - 'RS': 'Serbia', - 'SC': 'Seychelles', - 'SL': 'Sierra Leone', - 'SG': 'Singapore', - 'SX': 'Sint Maarten (Dutch part)', - 
'SK': 'Slovakia', - 'SI': 'Slovenia', - 'SB': 'Solomon Islands', - 'SO': 'Somalia', - 'ZA': 'South Africa', - 'GS': 'South Georgia and the South Sandwich Islands', - 'SS': 'South Sudan', - 'ES': 'Spain', - 'LK': 'Sri Lanka', - 'SD': 'Sudan', - 'SR': 'Suriname', - 'SJ': 'Svalbard and Jan Mayen', - 'SZ': 'Swaziland', - 'SE': 'Sweden', - 'CH': 'Switzerland', - 'SY': 'Syrian Arab Republic', - 'TW': 'Taiwan, Province of China', - 'TJ': 'Tajikistan', - 'TZ': 'Tanzania, United Republic of', - 'TH': 'Thailand', - 'TL': 'Timor-Leste', - 'TG': 'Togo', - 'TK': 'Tokelau', - 'TO': 'Tonga', - 'TT': 'Trinidad and Tobago', - 'TN': 'Tunisia', - 'TR': 'Turkey', - 'TM': 'Turkmenistan', - 'TC': 'Turks and Caicos Islands', - 'TV': 'Tuvalu', - 'UG': 'Uganda', - 'UA': 'Ukraine', - 'AE': 'United Arab Emirates', - 'GB': 'United Kingdom', - 'US': 'United States', - 'UM': 'United States Minor Outlying Islands', - 'UY': 'Uruguay', - 'UZ': 'Uzbekistan', - 'VU': 'Vanuatu', - 'VE': 'Venezuela, Bolivarian Republic of', - 'VN': 'Viet Nam', - 'VG': 'Virgin Islands, British', - 'VI': 'Virgin Islands, U.S.', - 'WF': 'Wallis and Futuna', - 'EH': 'Western Sahara', - 'YE': 'Yemen', - 'ZM': 'Zambia', - 'ZW': 'Zimbabwe', - # Not ISO 3166 codes, but used for IP blocks - 'AP': 'Asia/Pacific Region', - 'EU': 'Europe', - } - - @classmethod - def short2full(cls, code): - """Convert an ISO 3166-2 country code to the corresponding full name""" - return cls._country_map.get(code.upper()) - - -class GeoUtils: - # Major IPv4 address blocks per country - _country_ip_map = { - 'AD': '46.172.224.0/19', - 'AE': '94.200.0.0/13', - 'AF': '149.54.0.0/17', - 'AG': '209.59.64.0/18', - 'AI': '204.14.248.0/21', - 'AL': '46.99.0.0/16', - 'AM': '46.70.0.0/15', - 'AO': '105.168.0.0/13', - 'AP': '182.50.184.0/21', - 'AQ': '23.154.160.0/24', - 'AR': '181.0.0.0/12', - 'AS': '202.70.112.0/20', - 'AT': '77.116.0.0/14', - 'AU': '1.128.0.0/11', - 'AW': '181.41.0.0/18', - 'AX': '185.217.4.0/22', - 'AZ': '5.197.0.0/16', - 'BA': 
'31.176.128.0/17', - 'BB': '65.48.128.0/17', - 'BD': '114.130.0.0/16', - 'BE': '57.0.0.0/8', - 'BF': '102.178.0.0/15', - 'BG': '95.42.0.0/15', - 'BH': '37.131.0.0/17', - 'BI': '154.117.192.0/18', - 'BJ': '137.255.0.0/16', - 'BL': '185.212.72.0/23', - 'BM': '196.12.64.0/18', - 'BN': '156.31.0.0/16', - 'BO': '161.56.0.0/16', - 'BQ': '161.0.80.0/20', - 'BR': '191.128.0.0/12', - 'BS': '24.51.64.0/18', - 'BT': '119.2.96.0/19', - 'BW': '168.167.0.0/16', - 'BY': '178.120.0.0/13', - 'BZ': '179.42.192.0/18', - 'CA': '99.224.0.0/11', - 'CD': '41.243.0.0/16', - 'CF': '197.242.176.0/21', - 'CG': '160.113.0.0/16', - 'CH': '85.0.0.0/13', - 'CI': '102.136.0.0/14', - 'CK': '202.65.32.0/19', - 'CL': '152.172.0.0/14', - 'CM': '102.244.0.0/14', - 'CN': '36.128.0.0/10', - 'CO': '181.240.0.0/12', - 'CR': '201.192.0.0/12', - 'CU': '152.206.0.0/15', - 'CV': '165.90.96.0/19', - 'CW': '190.88.128.0/17', - 'CY': '31.153.0.0/16', - 'CZ': '88.100.0.0/14', - 'DE': '53.0.0.0/8', - 'DJ': '197.241.0.0/17', - 'DK': '87.48.0.0/12', - 'DM': '192.243.48.0/20', - 'DO': '152.166.0.0/15', - 'DZ': '41.96.0.0/12', - 'EC': '186.68.0.0/15', - 'EE': '90.190.0.0/15', - 'EG': '156.160.0.0/11', - 'ER': '196.200.96.0/20', - 'ES': '88.0.0.0/11', - 'ET': '196.188.0.0/14', - 'EU': '2.16.0.0/13', - 'FI': '91.152.0.0/13', - 'FJ': '144.120.0.0/16', - 'FK': '80.73.208.0/21', - 'FM': '119.252.112.0/20', - 'FO': '88.85.32.0/19', - 'FR': '90.0.0.0/9', - 'GA': '41.158.0.0/15', - 'GB': '25.0.0.0/8', - 'GD': '74.122.88.0/21', - 'GE': '31.146.0.0/16', - 'GF': '161.22.64.0/18', - 'GG': '62.68.160.0/19', - 'GH': '154.160.0.0/12', - 'GI': '95.164.0.0/16', - 'GL': '88.83.0.0/19', - 'GM': '160.182.0.0/15', - 'GN': '197.149.192.0/18', - 'GP': '104.250.0.0/19', - 'GQ': '105.235.224.0/20', - 'GR': '94.64.0.0/13', - 'GT': '168.234.0.0/16', - 'GU': '168.123.0.0/16', - 'GW': '197.214.80.0/20', - 'GY': '181.41.64.0/18', - 'HK': '113.252.0.0/14', - 'HN': '181.210.0.0/16', - 'HR': '93.136.0.0/13', - 'HT': '148.102.128.0/17', - 'HU': 
'84.0.0.0/14', - 'ID': '39.192.0.0/10', - 'IE': '87.32.0.0/12', - 'IL': '79.176.0.0/13', - 'IM': '5.62.80.0/20', - 'IN': '117.192.0.0/10', - 'IO': '203.83.48.0/21', - 'IQ': '37.236.0.0/14', - 'IR': '2.176.0.0/12', - 'IS': '82.221.0.0/16', - 'IT': '79.0.0.0/10', - 'JE': '87.244.64.0/18', - 'JM': '72.27.0.0/17', - 'JO': '176.29.0.0/16', - 'JP': '133.0.0.0/8', - 'KE': '105.48.0.0/12', - 'KG': '158.181.128.0/17', - 'KH': '36.37.128.0/17', - 'KI': '103.25.140.0/22', - 'KM': '197.255.224.0/20', - 'KN': '198.167.192.0/19', - 'KP': '175.45.176.0/22', - 'KR': '175.192.0.0/10', - 'KW': '37.36.0.0/14', - 'KY': '64.96.0.0/15', - 'KZ': '2.72.0.0/13', - 'LA': '115.84.64.0/18', - 'LB': '178.135.0.0/16', - 'LC': '24.92.144.0/20', - 'LI': '82.117.0.0/19', - 'LK': '112.134.0.0/15', - 'LR': '102.183.0.0/16', - 'LS': '129.232.0.0/17', - 'LT': '78.56.0.0/13', - 'LU': '188.42.0.0/16', - 'LV': '46.109.0.0/16', - 'LY': '41.252.0.0/14', - 'MA': '105.128.0.0/11', - 'MC': '88.209.64.0/18', - 'MD': '37.246.0.0/16', - 'ME': '178.175.0.0/17', - 'MF': '74.112.232.0/21', - 'MG': '154.126.0.0/17', - 'MH': '117.103.88.0/21', - 'MK': '77.28.0.0/15', - 'ML': '154.118.128.0/18', - 'MM': '37.111.0.0/17', - 'MN': '49.0.128.0/17', - 'MO': '60.246.0.0/16', - 'MP': '202.88.64.0/20', - 'MQ': '109.203.224.0/19', - 'MR': '41.188.64.0/18', - 'MS': '208.90.112.0/22', - 'MT': '46.11.0.0/16', - 'MU': '105.16.0.0/12', - 'MV': '27.114.128.0/18', - 'MW': '102.70.0.0/15', - 'MX': '187.192.0.0/11', - 'MY': '175.136.0.0/13', - 'MZ': '197.218.0.0/15', - 'NA': '41.182.0.0/16', - 'NC': '101.101.0.0/18', - 'NE': '197.214.0.0/18', - 'NF': '203.17.240.0/22', - 'NG': '105.112.0.0/12', - 'NI': '186.76.0.0/15', - 'NL': '145.96.0.0/11', - 'NO': '84.208.0.0/13', - 'NP': '36.252.0.0/15', - 'NR': '203.98.224.0/19', - 'NU': '49.156.48.0/22', - 'NZ': '49.224.0.0/14', - 'OM': '5.36.0.0/15', - 'PA': '186.72.0.0/15', - 'PE': '186.160.0.0/14', - 'PF': '123.50.64.0/18', - 'PG': '124.240.192.0/19', - 'PH': '49.144.0.0/13', - 'PK': 
'39.32.0.0/11', - 'PL': '83.0.0.0/11', - 'PM': '70.36.0.0/20', - 'PR': '66.50.0.0/16', - 'PS': '188.161.0.0/16', - 'PT': '85.240.0.0/13', - 'PW': '202.124.224.0/20', - 'PY': '181.120.0.0/14', - 'QA': '37.210.0.0/15', - 'RE': '102.35.0.0/16', - 'RO': '79.112.0.0/13', - 'RS': '93.86.0.0/15', - 'RU': '5.136.0.0/13', - 'RW': '41.186.0.0/16', - 'SA': '188.48.0.0/13', - 'SB': '202.1.160.0/19', - 'SC': '154.192.0.0/11', - 'SD': '102.120.0.0/13', - 'SE': '78.64.0.0/12', - 'SG': '8.128.0.0/10', - 'SI': '188.196.0.0/14', - 'SK': '78.98.0.0/15', - 'SL': '102.143.0.0/17', - 'SM': '89.186.32.0/19', - 'SN': '41.82.0.0/15', - 'SO': '154.115.192.0/18', - 'SR': '186.179.128.0/17', - 'SS': '105.235.208.0/21', - 'ST': '197.159.160.0/19', - 'SV': '168.243.0.0/16', - 'SX': '190.102.0.0/20', - 'SY': '5.0.0.0/16', - 'SZ': '41.84.224.0/19', - 'TC': '65.255.48.0/20', - 'TD': '154.68.128.0/19', - 'TG': '196.168.0.0/14', - 'TH': '171.96.0.0/13', - 'TJ': '85.9.128.0/18', - 'TK': '27.96.24.0/21', - 'TL': '180.189.160.0/20', - 'TM': '95.85.96.0/19', - 'TN': '197.0.0.0/11', - 'TO': '175.176.144.0/21', - 'TR': '78.160.0.0/11', - 'TT': '186.44.0.0/15', - 'TV': '202.2.96.0/19', - 'TW': '120.96.0.0/11', - 'TZ': '156.156.0.0/14', - 'UA': '37.52.0.0/14', - 'UG': '102.80.0.0/13', - 'US': '6.0.0.0/8', - 'UY': '167.56.0.0/13', - 'UZ': '84.54.64.0/18', - 'VA': '212.77.0.0/19', - 'VC': '207.191.240.0/21', - 'VE': '186.88.0.0/13', - 'VG': '66.81.192.0/20', - 'VI': '146.226.0.0/16', - 'VN': '14.160.0.0/11', - 'VU': '202.80.32.0/20', - 'WF': '117.20.32.0/21', - 'WS': '202.4.32.0/19', - 'YE': '134.35.0.0/16', - 'YT': '41.242.116.0/22', - 'ZA': '41.0.0.0/11', - 'ZM': '102.144.0.0/13', - 'ZW': '102.177.192.0/18', - } - - @classmethod - def random_ipv4(cls, code_or_block): - if len(code_or_block) == 2: - block = cls._country_ip_map.get(code_or_block.upper()) - if not block: - return None - else: - block = code_or_block - addr, preflen = block.split('/') - addr_min = struct.unpack('!L', socket.inet_aton(addr))[0] 
- addr_max = addr_min | (0xffffffff >> int(preflen)) - return str(socket.inet_ntoa( - struct.pack('!L', random.randint(addr_min, addr_max)))) - - -class PerRequestProxyHandler(urllib.request.ProxyHandler): - def __init__(self, proxies=None): - # Set default handlers - for type in ('http', 'https'): - setattr(self, '%s_open' % type, - lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open: - meth(r, proxy, type)) - urllib.request.ProxyHandler.__init__(self, proxies) - - def proxy_open(self, req, proxy, type): - req_proxy = req.headers.get('Ytdl-request-proxy') - if req_proxy is not None: - proxy = req_proxy - del req.headers['Ytdl-request-proxy'] - - if proxy == '__noproxy__': - return None # No Proxy - if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'): - req.add_header('Ytdl-socks-proxy', proxy) - # yt-dlp's http/https handlers do wrapping the socket with socks - return None - return urllib.request.ProxyHandler.proxy_open( - self, req, proxy, type) - - -# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is -# released into Public Domain -# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387 - -def long_to_bytes(n, blocksize=0): - """long_to_bytes(n:long, blocksize:int) : string - Convert a long integer to a byte string. - - If optional blocksize is given and greater than zero, pad the front of the - byte string with binary zeros so that the length is a multiple of - blocksize. - """ - # after much testing, this algorithm was deemed to be the fastest - s = b'' - n = int(n) - while n > 0: - s = struct.pack('>I', n & 0xffffffff) + s - n = n >> 32 - # strip off leading zeros - for i in range(len(s)): - if s[i] != b'\000'[0]: - break - else: - # only happens when n == 0 - s = b'\000' - i = 0 - s = s[i:] - # add back some pad bytes. this could be done more efficiently w.r.t. the - # de-padding being done above, but sigh... 
- if blocksize > 0 and len(s) % blocksize: - s = (blocksize - len(s) % blocksize) * b'\000' + s - return s - - -def bytes_to_long(s): - """bytes_to_long(string) : long - Convert a byte string to a long integer. - - This is (essentially) the inverse of long_to_bytes(). - """ - acc = 0 - length = len(s) - if length % 4: - extra = (4 - length % 4) - s = b'\000' * extra + s - length = length + extra - for i in range(0, length, 4): - acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0] - return acc - - -def ohdave_rsa_encrypt(data, exponent, modulus): - ''' - Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/ - - Input: - data: data to encrypt, bytes-like object - exponent, modulus: parameter e and N of RSA algorithm, both integer - Output: hex string of encrypted data - - Limitation: supports one block encryption only - ''' - - payload = int(binascii.hexlify(data[::-1]), 16) - encrypted = pow(payload, exponent, modulus) - return '%x' % encrypted - - -def pkcs1pad(data, length): - """ - Padding input data with PKCS#1 scheme - - @param {int[]} data input data - @param {int} length target length - @returns {int[]} padded data - """ - if len(data) > length - 11: - raise ValueError('Input data too long for PKCS#1 padding') - - pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)] - return [0, 2] + pseudo_random + [0] + data - - -def _base_n_table(n, table): - if not table and not n: - raise ValueError('Either table or n must be specified') - table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n] - - if n and n != len(table): - raise ValueError(f'base {n} exceeds table length {len(table)}') - return table - - -def encode_base_n(num, n=None, table=None): - """Convert given int to a base-n string""" - table = _base_n_table(n, table) - if not num: - return table[0] - - result, base = '', len(table) - while num: - result = table[num % base] + result - num = num // base - return result - - -def 
decode_base_n(string, n=None, table=None): - """Convert given base-n string to int""" - table = {char: index for index, char in enumerate(_base_n_table(n, table))} - result, base = 0, len(table) - for char in string: - result = result * base + table[char] - return result - - -def decode_base(value, digits): - deprecation_warning(f'{__name__}.decode_base is deprecated and may be removed ' - f'in a future version. Use {__name__}.decode_base_n instead') - return decode_base_n(value, table=digits) - - -def decode_packed_codes(code): - mobj = re.search(PACKED_CODES_RE, code) - obfuscated_code, base, count, symbols = mobj.groups() - base = int(base) - count = int(count) - symbols = symbols.split('|') - symbol_table = {} - - while count: - count -= 1 - base_n_count = encode_base_n(count, base) - symbol_table[base_n_count] = symbols[count] or base_n_count - - return re.sub( - r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)], - obfuscated_code) - - -def caesar(s, alphabet, shift): - if shift == 0: - return s - l = len(alphabet) - return ''.join( - alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c - for c in s) - - -def rot47(s): - return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47) - - -def parse_m3u8_attributes(attrib): - info = {} - for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib): - if val.startswith('"'): - val = val[1:-1] - info[key] = val - return info - - -def urshift(val, n): - return val >> n if val >= 0 else (val + 0x100000000) >> n - - -# Based on png2str() written by @gdkchan and improved by @yokrysty -# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706 -def decode_png(png_data): - # Reference: https://www.w3.org/TR/PNG/ - header = png_data[8:] - - if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR': - raise OSError('Not a valid PNG file.') - - int_map = {1: '>B', 2: '>H', 4: '>I'} - 
unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0] - - chunks = [] - - while header: - length = unpack_integer(header[:4]) - header = header[4:] - - chunk_type = header[:4] - header = header[4:] - - chunk_data = header[:length] - header = header[length:] - - header = header[4:] # Skip CRC - - chunks.append({ - 'type': chunk_type, - 'length': length, - 'data': chunk_data - }) - - ihdr = chunks[0]['data'] - - width = unpack_integer(ihdr[:4]) - height = unpack_integer(ihdr[4:8]) - - idat = b'' - - for chunk in chunks: - if chunk['type'] == b'IDAT': - idat += chunk['data'] - - if not idat: - raise OSError('Unable to read PNG data.') - - decompressed_data = bytearray(zlib.decompress(idat)) - - stride = width * 3 - pixels = [] - - def _get_pixel(idx): - x = idx % stride - y = idx // stride - return pixels[y][x] - - for y in range(height): - basePos = y * (1 + stride) - filter_type = decompressed_data[basePos] - - current_row = [] - - pixels.append(current_row) - - for x in range(stride): - color = decompressed_data[1 + basePos + x] - basex = y * stride + x - left = 0 - up = 0 - - if x > 2: - left = _get_pixel(basex - 3) - if y > 0: - up = _get_pixel(basex - stride) - - if filter_type == 1: # Sub - color = (color + left) & 0xff - elif filter_type == 2: # Up - color = (color + up) & 0xff - elif filter_type == 3: # Average - color = (color + ((left + up) >> 1)) & 0xff - elif filter_type == 4: # Paeth - a = left - b = up - c = 0 - - if x > 2 and y > 0: - c = _get_pixel(basex - stride - 3) - - p = a + b - c - - pa = abs(p - a) - pb = abs(p - b) - pc = abs(p - c) - - if pa <= pb and pa <= pc: - color = (color + a) & 0xff - elif pb <= pc: - color = (color + b) & 0xff - else: - color = (color + c) & 0xff - - current_row.append(color) - - return width, height, pixels - - -def write_xattr(path, key, value): - # Windows: Write xattrs to NTFS Alternate Data Streams: - # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29 - if compat_os_name == 'nt': - 
assert ':' not in key - assert os.path.exists(path) - - try: - with open(f'{path}:{key}', 'wb') as f: - f.write(value) - except OSError as e: - raise XAttrMetadataError(e.errno, e.strerror) - return - - # UNIX Method 1. Use xattrs/pyxattrs modules - - setxattr = None - if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr': - # Unicode arguments are not supported in pyxattr until version 0.5.0 - # See https://github.com/ytdl-org/youtube-dl/issues/5498 - if version_tuple(xattr.__version__) >= (0, 5, 0): - setxattr = xattr.set - elif xattr: - setxattr = xattr.setxattr - - if setxattr: - try: - setxattr(path, key, value) - except OSError as e: - raise XAttrMetadataError(e.errno, e.strerror) - return - - # UNIX Method 2. Use setfattr/xattr executables - exe = ('setfattr' if check_executable('setfattr', ['--version']) - else 'xattr' if check_executable('xattr', ['-h']) else None) - if not exe: - raise XAttrUnavailableError( - 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the ' - + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)')) - - value = value.decode() - try: - _, stderr, returncode = Popen.run( - [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path], - text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) - except OSError as e: - raise XAttrMetadataError(e.errno, e.strerror) - if returncode: - raise XAttrMetadataError(returncode, stderr) - - -def random_birthday(year_field, month_field, day_field): - start_date = datetime.date(1950, 1, 1) - end_date = datetime.date(1995, 12, 31) - offset = random.randint(0, (end_date - start_date).days) - random_date = start_date + datetime.timedelta(offset) - return { - year_field: str(random_date.year), - month_field: str(random_date.month), - day_field: str(random_date.day), - } - - -def find_available_port(interface=''): - try: - with 
socket.socket() as sock: - sock.bind((interface, 0)) - return sock.getsockname()[1] - except OSError: - return None - - -# Templates for internet shortcut files, which are plain text files. -DOT_URL_LINK_TEMPLATE = '''\ -[InternetShortcut] -URL=%(url)s -''' - -DOT_WEBLOC_LINK_TEMPLATE = '''\ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd"> -<plist version="1.0"> -<dict> -\t<key>URL</key> -\t<string>%(url)s</string> -</dict> -</plist> -''' - -DOT_DESKTOP_LINK_TEMPLATE = '''\ -[Desktop Entry] -Encoding=UTF-8 -Name=%(filename)s -Type=Link -URL=%(url)s -Icon=text-html -''' - -LINK_TEMPLATES = { - 'url': DOT_URL_LINK_TEMPLATE, - 'desktop': DOT_DESKTOP_LINK_TEMPLATE, - 'webloc': DOT_WEBLOC_LINK_TEMPLATE, -} - - -def iri_to_uri(iri): - """ - Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only). - - The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact. - """ - - iri_parts = urllib.parse.urlparse(iri) - - if '[' in iri_parts.netloc: - raise ValueError('IPv6 URIs are not, yet, supported.') - # Querying `.netloc`, when there's only one bracket, also raises a ValueError. - - # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is. 
- - net_location = '' - if iri_parts.username: - net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~") - if iri_parts.password is not None: - net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~") - net_location += '@' - - net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames. - # The 'idna' encoding produces ASCII text. - if iri_parts.port is not None and iri_parts.port != 80: - net_location += ':' + str(iri_parts.port) - - return urllib.parse.urlunparse( - (iri_parts.scheme, - net_location, - - urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"), - - # Unsure about the `safe` argument, since this is a legacy way of handling parameters. - urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"), - - # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component. - urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"), - - urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~"))) - - # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes. - - -def to_high_limit_path(path): - if sys.platform in ['win32', 'cygwin']: - # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited. 
- return '\\\\?\\' + os.path.abspath(path) - - return path - - -def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY): - val = traverse_obj(obj, *variadic(field)) - if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore): - return default - return template % func(val) - - -def clean_podcast_url(url): - return re.sub(r'''(?x) - (?: - (?: - chtbl\.com/track| - media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/ - play\.podtrac\.com - )/[^/]+| - (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure - flex\.acast\.com| - pd(?: - cn\.co| # https://podcorn.com/analytics-prefix/ - st\.fm # https://podsights.com/docs/ - )/e - )/''', '', url) - - -_HEX_TABLE = '0123456789abcdef' - - -def random_uuidv4(): - return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx') - - -def make_dir(path, to_screen=None): - try: - dn = os.path.dirname(path) - if dn: - os.makedirs(dn, exist_ok=True) - return True - except OSError as err: - if callable(to_screen) is not None: - to_screen('unable to create directory ' + error_to_compat_str(err)) - return False - - -def get_executable_path(): - from .update import _get_variant_and_executable_path - - return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1])) - - -def get_user_config_dirs(package_name): - # .config (e.g. 
~/.config/package_name) - xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config') - yield os.path.join(xdg_config_home, package_name) - - # appdata (%APPDATA%/package_name) - appdata_dir = os.getenv('appdata') - if appdata_dir: - yield os.path.join(appdata_dir, package_name) - - # home (~/.package_name) - yield os.path.join(compat_expanduser('~'), f'.{package_name}') - - -def get_system_config_dirs(package_name): - # /etc/package_name - yield os.path.join('/etc', package_name) - - -def traverse_obj( - obj, *paths, default=NO_DEFAULT, expected_type=None, get_all=True, - casesense=True, is_user_input=False, traverse_string=False): - """ - Safely traverse nested `dict`s and `Iterable`s - - >>> obj = [{}, {"key": "value"}] - >>> traverse_obj(obj, (1, "key")) - "value" - - Each of the provided `paths` is tested and the first producing a valid result will be returned. - The next path will also be tested if the path branched but no results could be found. - Supported values for traversal are `Mapping`, `Iterable` and `re.Match`. - Unhelpful values (`{}`, `None`) are treated as the absence of a value and discarded. - - The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`. - - The keys in the path can be one of: - - `None`: Return the current object. - - `set`: Requires the only item in the set to be a type or function, - like `{type}`/`{func}`. If a `type`, returns only values - of this type. If a function, returns `func(obj)`. - - `str`/`int`: Return `obj[key]`. For `re.Match`, return `obj.group(key)`. - - `slice`: Branch out and return all values in `obj[key]`. - - `Ellipsis`: Branch out and return a list of all values. - - `tuple`/`list`: Branch out and return a list of all matching values. - Read as: `[traverse_obj(obj, branch) for branch in branches]`. - - `function`: Branch out and return values filtered by the function. - Read as: `[value for key, value in obj if function(key, value)]`. 
- For `Iterable`s, `key` is the index of the value. - For `re.Match`es, `key` is the group number (0 = full match) - as well as additionally any group names, if given. - - `dict` Transform the current object and return a matching dict. - Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`. - - `tuple`, `list`, and `dict` all support nested paths and branches. - - @params paths Paths which to traverse by. - @param default Value to return if the paths do not match. - If the last key in the path is a `dict`, it will apply to each value inside - the dict instead, depth first. Try to avoid if using nested `dict` keys. - @param expected_type If a `type`, only accept final values of this type. - If any other callable, try to call the function on each result. - If the last key in the path is a `dict`, it will apply to each value inside - the dict instead, recursively. This does respect branching paths. - @param get_all If `False`, return the first matching result, otherwise all matching ones. - @param casesense If `False`, consider string dictionary keys as case insensitive. - - The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API - - @param is_user_input Whether the keys are generated from user input. - If `True` strings get converted to `int`/`slice` if needed. - @param traverse_string Whether to traverse into objects as strings. - If `True`, any non-compatible object will first be - converted into a string and then traversed into. - The return value of that path will be a string instead, - not respecting any further branching. - - - @returns The result of the object traversal. - If successful, `get_all=True`, and the path branches at least once, - then a list of results is returned instead. - If no `default` is given and the last path branches, a `list` of results - is always returned. If a path ends on a `dict` that result will always be a `dict`. 
- """ - casefold = lambda k: k.casefold() if isinstance(k, str) else k - - if isinstance(expected_type, type): - type_test = lambda val: val if isinstance(val, expected_type) else None - else: - type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,)) - - def apply_key(key, obj, is_last): - branching = False - result = None - - if obj is None and traverse_string: - if key is ... or callable(key) or isinstance(key, slice): - branching = True - result = () - - elif key is None: - result = obj - - elif isinstance(key, set): - assert len(key) == 1, 'Set should only be used to wrap a single item' - item = next(iter(key)) - if isinstance(item, type): - if isinstance(obj, item): - result = obj - else: - result = try_call(item, args=(obj,)) - - elif isinstance(key, (list, tuple)): - branching = True - result = itertools.chain.from_iterable( - apply_path(obj, branch, is_last)[0] for branch in key) - - elif key is ...: - branching = True - if isinstance(obj, collections.abc.Mapping): - result = obj.values() - elif is_iterable_like(obj): - result = obj - elif isinstance(obj, re.Match): - result = obj.groups() - elif traverse_string: - branching = False - result = str(obj) - else: - result = () - - elif callable(key): - branching = True - if isinstance(obj, collections.abc.Mapping): - iter_obj = obj.items() - elif is_iterable_like(obj): - iter_obj = enumerate(obj) - elif isinstance(obj, re.Match): - iter_obj = itertools.chain( - enumerate((obj.group(), *obj.groups())), - obj.groupdict().items()) - elif traverse_string: - branching = False - iter_obj = enumerate(str(obj)) - else: - iter_obj = () - - result = (v for k, v in iter_obj if try_call(key, args=(k, v))) - if not branching: # string traversal - result = ''.join(result) - - elif isinstance(key, dict): - iter_obj = ((k, _traverse_obj(obj, v, False, is_last)) for k, v in key.items()) - result = { - k: v if v is not None else default for k, v in iter_obj - if v is not None or default is not NO_DEFAULT - } or 
None - - elif isinstance(obj, collections.abc.Mapping): - result = (try_call(obj.get, args=(key,)) if casesense or try_call(obj.__contains__, args=(key,)) else - next((v for k, v in obj.items() if casefold(k) == key), None)) - - elif isinstance(obj, re.Match): - if isinstance(key, int) or casesense: - with contextlib.suppress(IndexError): - result = obj.group(key) - - elif isinstance(key, str): - result = next((v for k, v in obj.groupdict().items() if casefold(k) == key), None) - - elif isinstance(key, (int, slice)): - if is_iterable_like(obj, collections.abc.Sequence): - branching = isinstance(key, slice) - with contextlib.suppress(IndexError): - result = obj[key] - elif traverse_string: - with contextlib.suppress(IndexError): - result = str(obj)[key] - - return branching, result if branching else (result,) - - def lazy_last(iterable): - iterator = iter(iterable) - prev = next(iterator, NO_DEFAULT) - if prev is NO_DEFAULT: - return - - for item in iterator: - yield False, prev - prev = item - - yield True, prev - - def apply_path(start_obj, path, test_type): - objs = (start_obj,) - has_branched = False - - key = None - for last, key in lazy_last(variadic(path, (str, bytes, dict, set))): - if is_user_input and isinstance(key, str): - if key == ':': - key = ... 
- elif ':' in key: - key = slice(*map(int_or_none, key.split(':'))) - elif int_or_none(key) is not None: - key = int(key) - - if not casesense and isinstance(key, str): - key = key.casefold() - - if __debug__ and callable(key): - # Verify function signature - inspect.signature(key).bind(None, None) - - new_objs = [] - for obj in objs: - branching, results = apply_key(key, obj, last) - has_branched |= branching - new_objs.append(results) - - objs = itertools.chain.from_iterable(new_objs) - - if test_type and not isinstance(key, (dict, list, tuple)): - objs = map(type_test, objs) - - return objs, has_branched, isinstance(key, dict) - - def _traverse_obj(obj, path, allow_empty, test_type): - results, has_branched, is_dict = apply_path(obj, path, test_type) - results = LazyList(item for item in results if item not in (None, {})) - if get_all and has_branched: - if results: - return results.exhaust() - if allow_empty: - return [] if default is NO_DEFAULT else default - return None - - return results[0] if results else {} if allow_empty and is_dict else None - - for index, path in enumerate(paths, 1): - result = _traverse_obj(obj, path, index == len(paths), True) - if result is not None: - return result - - return None if default is NO_DEFAULT else default - - -def traverse_dict(dictn, keys, casesense=True): - deprecation_warning(f'"{__name__}.traverse_dict" is deprecated and may be removed ' - f'in a future version. 
Use "{__name__}.traverse_obj" instead') - return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True) - - -def get_first(obj, keys, **kwargs): - return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False) - - -def time_seconds(**kwargs): - """ - Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z) - """ - return time.time() + datetime.timedelta(**kwargs).total_seconds() - - -# create a JSON Web Signature (jws) with HS256 algorithm -# the resulting format is in JWS Compact Serialization -# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html -# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html -def jwt_encode_hs256(payload_data, key, headers={}): - header_data = { - 'alg': 'HS256', - 'typ': 'JWT', - } - if headers: - header_data.update(headers) - header_b64 = base64.b64encode(json.dumps(header_data).encode()) - payload_b64 = base64.b64encode(json.dumps(payload_data).encode()) - h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256) - signature_b64 = base64.b64encode(h.digest()) - token = header_b64 + b'.' + payload_b64 + b'.' 
+ signature_b64 - return token - - -# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256 -def jwt_decode_hs256(jwt): - header_b64, payload_b64, signature_b64 = jwt.split('.') - # add trailing ='s that may have been stripped, superfluous ='s are ignored - payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}===')) - return payload_data - - -WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None - - -@functools.cache -def supports_terminal_sequences(stream): - if compat_os_name == 'nt': - if not WINDOWS_VT_MODE: - return False - elif not os.getenv('TERM'): - return False - try: - return stream.isatty() - except BaseException: - return False - - -def windows_enable_vt_mode(): - """Ref: https://bugs.python.org/issue30075 """ - if get_windows_version() < (10, 0, 10586): - return - - import ctypes - import ctypes.wintypes - import msvcrt - - ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004 - - dll = ctypes.WinDLL('kernel32', use_last_error=False) - handle = os.open('CONOUT$', os.O_RDWR) - try: - h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle)) - dw_original_mode = ctypes.wintypes.DWORD() - success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode)) - if not success: - raise Exception('GetConsoleMode failed') - - success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD( - dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING)) - if not success: - raise Exception('SetConsoleMode failed') - finally: - os.close(handle) - - global WINDOWS_VT_MODE - WINDOWS_VT_MODE = True - supports_terminal_sequences.cache_clear() - - -_terminal_sequences_re = re.compile('\033\\[[^m]+m') - - -def remove_terminal_sequences(string): - return _terminal_sequences_re.sub('', string) - - -def number_of_digits(number): - return len('%d' % number) - - -def join_nonempty(*values, delim='-', from_dict=None): - if from_dict is not None: - values = (traverse_obj(from_dict, variadic(v)) for v in values) 
- return delim.join(map(str, filter(None, values))) - - -def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re): - """ - Find the largest format dimensions in terms of video width and, for each thumbnail: - * Modify the URL: Match the width with the provided regex and replace with the former width - * Update dimensions - - This function is useful with video services that scale the provided thumbnails on demand - """ - _keys = ('width', 'height') - max_dimensions = max( - (tuple(format.get(k) or 0 for k in _keys) for format in formats), - default=(0, 0)) - if not max_dimensions[0]: - return thumbnails - return [ - merge_dicts( - {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])}, - dict(zip(_keys, max_dimensions)), thumbnail) - for thumbnail in thumbnails - ] - - -def parse_http_range(range): - """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """ - if not range: - return None, None, None - crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range) - if not crg: - return None, None, None - return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3)) - - -def read_stdin(what): - eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D' - write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n') - return sys.stdin - - -def determine_file_encoding(data): - """ - Detect the text encoding used - @returns (encoding, bytes to skip) - """ - - # BOM marks are given priority over declarations - for bom, enc in BOMS: - if data.startswith(bom): - return enc, len(bom) - - # Strip off all null bytes to match even when UTF-16 or UTF-32 is used. 
- # We ignore the endianness to get a good enough match - data = data.replace(b'\0', b'') - mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data) - return mobj.group(1).decode() if mobj else None, 0 - - -class Config: - own_args = None - parsed_args = None - filename = None - __initialized = False - - def __init__(self, parser, label=None): - self.parser, self.label = parser, label - self._loaded_paths, self.configs = set(), [] - - def init(self, args=None, filename=None): - assert not self.__initialized - self.own_args, self.filename = args, filename - return self.load_configs() - - def load_configs(self): - directory = '' - if self.filename: - location = os.path.realpath(self.filename) - directory = os.path.dirname(location) - if location in self._loaded_paths: - return False - self._loaded_paths.add(location) - - self.__initialized = True - opts, _ = self.parser.parse_known_args(self.own_args) - self.parsed_args = self.own_args - for location in opts.config_locations or []: - if location == '-': - if location in self._loaded_paths: - continue - self._loaded_paths.add(location) - self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin') - continue - location = os.path.join(directory, expand_path(location)) - if os.path.isdir(location): - location = os.path.join(location, 'yt-dlp.conf') - if not os.path.exists(location): - self.parser.error(f'config location {location} does not exist') - self.append_config(self.read_file(location), location) - return True - - def __str__(self): - label = join_nonempty( - self.label, 'config', f'"{self.filename}"' if self.filename else '', - delim=' ') - return join_nonempty( - self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}', - *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs), - delim='\n') - - @staticmethod - def read_file(filename, default=[]): - try: - optionf = open(filename, 'rb') - except OSError: - return default # silently skip 
if file is not present - try: - enc, skip = determine_file_encoding(optionf.read(512)) - optionf.seek(skip, io.SEEK_SET) - except OSError: - enc = None # silently skip read errors - try: - # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56 - contents = optionf.read().decode(enc or preferredencoding()) - res = shlex.split(contents, comments=True) - except Exception as err: - raise ValueError(f'Unable to parse "{filename}": {err}') - finally: - optionf.close() - return res - - @staticmethod - def hide_login_info(opts): - PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'} - eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$') - - def _scrub_eq(o): - m = eqre.match(o) - if m: - return m.group('key') + '=PRIVATE' - else: - return o - - opts = list(map(_scrub_eq, opts)) - for idx, opt in enumerate(opts): - if opt in PRIVATE_OPTS and idx + 1 < len(opts): - opts[idx + 1] = 'PRIVATE' - return opts - - def append_config(self, *args, label=None): - config = type(self)(self.parser, label) - config._loaded_paths = self._loaded_paths - if config.init(*args): - self.configs.append(config) - - @property - def all_args(self): - for config in reversed(self.configs): - yield from config.all_args - yield from self.parsed_args or [] - - def parse_known_args(self, **kwargs): - return self.parser.parse_known_args(self.all_args, **kwargs) - - def parse_args(self): - return self.parser.parse_args(self.all_args) - - -class WebSocketsWrapper: - """Wraps websockets module to use in non-async scopes""" - pool = None - - def __init__(self, url, headers=None, connect=True): - self.loop = asyncio.new_event_loop() - # XXX: "loop" is deprecated - self.conn = websockets.connect( - url, extra_headers=headers, ping_interval=None, - close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf')) - if connect: - self.__enter__() - 
atexit.register(self.__exit__, None, None, None) - - def __enter__(self): - if not self.pool: - self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop) - return self - - def send(self, *args): - self.run_with_loop(self.pool.send(*args), self.loop) - - def recv(self, *args): - return self.run_with_loop(self.pool.recv(*args), self.loop) - - def __exit__(self, type, value, traceback): - try: - return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop) - finally: - self.loop.close() - self._cancel_all_tasks(self.loop) - - # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications - # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class - @staticmethod - def run_with_loop(main, loop): - if not asyncio.iscoroutine(main): - raise ValueError(f'a coroutine was expected, got {main!r}') - - try: - return loop.run_until_complete(main) - finally: - loop.run_until_complete(loop.shutdown_asyncgens()) - if hasattr(loop, 'shutdown_default_executor'): - loop.run_until_complete(loop.shutdown_default_executor()) - - @staticmethod - def _cancel_all_tasks(loop): - to_cancel = asyncio.all_tasks(loop) - - if not to_cancel: - return - - for task in to_cancel: - task.cancel() - - # XXX: "loop" is removed in python 3.10+ - loop.run_until_complete( - asyncio.gather(*to_cancel, loop=loop, return_exceptions=True)) - - for task in to_cancel: - if task.cancelled(): - continue - if task.exception() is not None: - loop.call_exception_handler({ - 'message': 'unhandled exception during asyncio.run() shutdown', - 'exception': task.exception(), - 'task': task, - }) - - -def merge_headers(*dicts): - """Merge dicts of http headers case insensitively, prioritizing the latter ones""" - return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))} - - -def cached_method(f): - """Cache a method""" - signature = inspect.signature(f) - - 
@functools.wraps(f) - def wrapper(self, *args, **kwargs): - bound_args = signature.bind(self, *args, **kwargs) - bound_args.apply_defaults() - key = tuple(bound_args.arguments.values())[1:] - - cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {}) - if key not in cache: - cache[key] = f(self, *args, **kwargs) - return cache[key] - return wrapper - - -class classproperty: - """property access for class methods with optional caching""" - def __new__(cls, func=None, *args, **kwargs): - if not func: - return functools.partial(cls, *args, **kwargs) - return super().__new__(cls) - - def __init__(self, func, *, cache=False): - functools.update_wrapper(self, func) - self.func = func - self._cache = {} if cache else None - - def __get__(self, _, cls): - if self._cache is None: - return self.func(cls) - elif cls not in self._cache: - self._cache[cls] = self.func(cls) - return self._cache[cls] - - -class function_with_repr: - def __init__(self, func, repr_=None): - functools.update_wrapper(self, func) - self.func, self.__repr = func, repr_ - - def __call__(self, *args, **kwargs): - return self.func(*args, **kwargs) - - def __repr__(self): - if self.__repr: - return self.__repr - return f'{self.func.__module__}.{self.func.__qualname__}' - - -class Namespace(types.SimpleNamespace): - """Immutable namespace""" - - def __iter__(self): - return iter(self.__dict__.values()) - - @property - def items_(self): - return self.__dict__.items() - - -MEDIA_EXTENSIONS = Namespace( - common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'), - video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'), - common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'), - audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'), - thumbnails=('jpg', 'png', 'webp'), - storyboards=('mhtml', ), - subtitles=('srt', 'vtt', 'ass', 'lrc'), - manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'), 
-) -MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video -MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio - -KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests) - - -class RetryManager: - """Usage: - for retry in RetryManager(...): - try: - ... - except SomeException as err: - retry.error = err - continue - """ - attempt, _error = 0, None - - def __init__(self, _retries, _error_callback, **kwargs): - self.retries = _retries or 0 - self.error_callback = functools.partial(_error_callback, **kwargs) - - def _should_retry(self): - return self._error is not NO_DEFAULT and self.attempt <= self.retries - - @property - def error(self): - if self._error is NO_DEFAULT: - return None - return self._error - - @error.setter - def error(self, value): - self._error = value - - def __iter__(self): - while self._should_retry(): - self.error = NO_DEFAULT - self.attempt += 1 - yield self - if self.error: - self.error_callback(self.error, self.attempt, self.retries) - - @staticmethod - def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None): - """Utility function for reporting retries""" - if count > retries: - if error: - return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e)) - raise e - - if not count: - return warn(e) - elif isinstance(e, ExtractorError): - e = remove_end(str_or_none(e.cause) or e.orig_msg, '.') - warn(f'{e}. 
Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...') - - delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func - if delay: - info(f'Sleeping {delay:.2f} seconds ...') - time.sleep(delay) - - -def make_archive_id(ie, video_id): - ie_key = ie if isinstance(ie, str) else ie.ie_key() - return f'{ie_key.lower()} {video_id}' - - -def truncate_string(s, left, right=0): - assert left > 3 and right >= 0 - if s is None or len(s) <= left + right: - return s - return f'{s[:left-3]}...{s[-right:] if right else ""}' - - -def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None): - assert 'all' in alias_dict, '"all" alias is required' - requested = list(start or []) - for val in options: - discard = val.startswith('-') - if discard: - val = val[1:] - - if val in alias_dict: - val = alias_dict[val] if not discard else [ - i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]] - # NB: Do not allow regex in aliases for performance - requested = orderedSet_from_options(val, alias_dict, start=requested) - continue - - current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex - else [val] if val in alias_dict['all'] else None) - if current is None: - raise ValueError(val) - - if discard: - for item in current: - while item in requested: - requested.remove(item) - else: - requested.extend(current) - - return orderedSet(requested) - - -class FormatSorter: - regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? 
*$' - - default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality', - 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec', - 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases - ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr', - 'height', 'width', 'proto', 'vext', 'abr', 'aext', - 'fps', 'fs_approx', 'source', 'id') - - settings = { - 'vcodec': {'type': 'ordered', 'regex': True, - 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']}, - 'acodec': {'type': 'ordered', 'regex': True, - 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']}, - 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range', - 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]}, - 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol', - 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']}, - 'vext': {'type': 'ordered', 'field': 'video_ext', - 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'), - 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')}, - 'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext', - 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'), - 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')}, - 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000}, - 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple', - 'field': ('vcodec', 'acodec'), - 'function': lambda it: int(any(v != 'none' for v in it))}, - 'ie_pref': {'priority': True, 'type': 'extractor'}, - 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)}, - 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)}, - 'lang': 
{'convert': 'float', 'field': 'language_preference', 'default': -1}, - 'quality': {'convert': 'float', 'default': -1}, - 'filesize': {'convert': 'bytes'}, - 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'}, - 'id': {'convert': 'string', 'field': 'format_id'}, - 'height': {'convert': 'float_none'}, - 'width': {'convert': 'float_none'}, - 'fps': {'convert': 'float_none'}, - 'channels': {'convert': 'float_none', 'field': 'audio_channels'}, - 'tbr': {'convert': 'float_none'}, - 'vbr': {'convert': 'float_none'}, - 'abr': {'convert': 'float_none'}, - 'asr': {'convert': 'float_none'}, - 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1}, - - 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')}, - 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True}, - 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')}, - 'ext': {'type': 'combined', 'field': ('vext', 'aext')}, - 'res': {'type': 'multiple', 'field': ('height', 'width'), - 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))}, - - # Actual field names - 'format_id': {'type': 'alias', 'field': 'id'}, - 'preference': {'type': 'alias', 'field': 'ie_pref'}, - 'language_preference': {'type': 'alias', 'field': 'lang'}, - 'source_preference': {'type': 'alias', 'field': 'source'}, - 'protocol': {'type': 'alias', 'field': 'proto'}, - 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'}, - 'audio_channels': {'type': 'alias', 'field': 'channels'}, - - # Deprecated - 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True}, - 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True}, - 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True}, - 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True}, - 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True}, - 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True}, - 'audio_bitrate': 
{'type': 'alias', 'field': 'abr', 'deprecated': True}, - 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True}, - 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True}, - 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True}, - 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True}, - 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True}, - 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True}, - 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True}, - 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True}, - 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True}, - 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True}, - 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True}, - 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True}, - 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True}, - } - - def __init__(self, ydl, field_preference): - self.ydl = ydl - self._order = [] - self.evaluate_params(self.ydl.params, field_preference) - if ydl.params.get('verbose'): - self.print_verbose_info(self.ydl.write_debug) - - def _get_field_setting(self, field, key): - if field not in self.settings: - if key in ('forced', 'priority'): - return False - self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is ' - 'deprecated and may be removed in a future version') - self.settings[field] = {} - propObj = self.settings[field] - if key not in propObj: - type = propObj.get('type') - if key == 'field': - default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field - elif key == 'convert': - default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore' - else: - default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None) - propObj[key] = default - 
return propObj[key] - - def _resolve_field_value(self, field, value, convertNone=False): - if value is None: - if not convertNone: - return None - else: - value = value.lower() - conversion = self._get_field_setting(field, 'convert') - if conversion == 'ignore': - return None - if conversion == 'string': - return value - elif conversion == 'float_none': - return float_or_none(value) - elif conversion == 'bytes': - return parse_bytes(value) - elif conversion == 'order': - order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order') - use_regex = self._get_field_setting(field, 'regex') - list_length = len(order_list) - empty_pos = order_list.index('') if '' in order_list else list_length + 1 - if use_regex and value is not None: - for i, regex in enumerate(order_list): - if regex and re.match(regex, value): - return list_length - i - return list_length - empty_pos # not in list - else: # not regex or value = None - return list_length - (order_list.index(value) if value in order_list else empty_pos) - else: - if value.isnumeric(): - return float(value) - else: - self.settings[field]['convert'] = 'string' - return value - - def evaluate_params(self, params, sort_extractor): - self._use_free_order = params.get('prefer_free_formats', False) - self._sort_user = params.get('format_sort', []) - self._sort_extractor = sort_extractor - - def add_item(field, reverse, closest, limit_text): - field = field.lower() - if field in self._order: - return - self._order.append(field) - limit = self._resolve_field_value(field, limit_text) - data = { - 'reverse': reverse, - 'closest': False if limit is None else closest, - 'limit_text': limit_text, - 'limit': limit} - if field in self.settings: - self.settings[field].update(data) - else: - self.settings[field] = data - - sort_list = ( - tuple(field for field in self.default if self._get_field_setting(field, 'forced')) - + (tuple() if params.get('format_sort_force', False) 
- else tuple(field for field in self.default if self._get_field_setting(field, 'priority'))) - + tuple(self._sort_user) + tuple(sort_extractor) + self.default) - - for item in sort_list: - match = re.match(self.regex, item) - if match is None: - raise ExtractorError('Invalid format sort string "%s" given by extractor' % item) - field = match.group('field') - if field is None: - continue - if self._get_field_setting(field, 'type') == 'alias': - alias, field = field, self._get_field_setting(field, 'field') - if self._get_field_setting(alias, 'deprecated'): - self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may ' - f'be removed in a future version. Please use {field} instead') - reverse = match.group('reverse') is not None - closest = match.group('separator') == '~' - limit_text = match.group('limit') - - has_limit = limit_text is not None - has_multiple_fields = self._get_field_setting(field, 'type') == 'combined' - has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit') - - fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,) - limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple() - limit_count = len(limits) - for (i, f) in enumerate(fields): - add_item(f, reverse, closest, - limits[i] if i < limit_count - else limits[0] if has_limit and not has_multiple_limits - else None) - - def print_verbose_info(self, write_debug): - if self._sort_user: - write_debug('Sort order given by user: %s' % ', '.join(self._sort_user)) - if self._sort_extractor: - write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor)) - write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % ( - '+' if self._get_field_setting(field, 'reverse') else '', field, - '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':', - self._get_field_setting(field, 'limit_text'), - self._get_field_setting(field, 
'limit')) - if self._get_field_setting(field, 'limit_text') is not None else '') - for field in self._order if self._get_field_setting(field, 'visible')])) - - def _calculate_field_preference_from_value(self, format, field, type, value): - reverse = self._get_field_setting(field, 'reverse') - closest = self._get_field_setting(field, 'closest') - limit = self._get_field_setting(field, 'limit') - - if type == 'extractor': - maximum = self._get_field_setting(field, 'max') - if value is None or (maximum is not None and value >= maximum): - value = -1 - elif type == 'boolean': - in_list = self._get_field_setting(field, 'in_list') - not_in_list = self._get_field_setting(field, 'not_in_list') - value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1 - elif type == 'ordered': - value = self._resolve_field_value(field, value, True) - - # try to convert to number - val_num = float_or_none(value, default=self._get_field_setting(field, 'default')) - is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None - if is_num: - value = val_num - - return ((-10, 0) if value is None - else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher - else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest - else (0, value, 0) if not reverse and (limit is None or value <= limit) - else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit - else (-1, value, 0)) - - def _calculate_field_preference(self, format, field): - type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple - get_value = lambda f: format.get(self._get_field_setting(f, 'field')) - if type == 'multiple': - type = 'field' # Only 'field' is allowed in multiple for now - actual_fields = self._get_field_setting(field, 'field') - - value = self._get_field_setting(field, 'function')(get_value(f) for f in 
actual_fields) - else: - value = get_value(field) - return self._calculate_field_preference_from_value(format, field, type, value) - - def calculate_preference(self, format): - # Determine missing protocol - if not format.get('protocol'): - format['protocol'] = determine_protocol(format) - - # Determine missing ext - if not format.get('ext') and 'url' in format: - format['ext'] = determine_ext(format['url']) - if format.get('vcodec') == 'none': - format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none' - format['video_ext'] = 'none' - else: - format['video_ext'] = format['ext'] - format['audio_ext'] = 'none' - # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported? - # format['preference'] = -1000 - - if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''): - # HEVC-over-FLV is out-of-spec by FLV's original spec - # ref. https://trac.ffmpeg.org/ticket/6389 - # ref. 
https://github.com/yt-dlp/yt-dlp/pull/5821 - format['preference'] = -100 - - # Determine missing bitrates - if format.get('tbr') is None: - if format.get('vbr') is not None and format.get('abr') is not None: - format['tbr'] = format.get('vbr', 0) + format.get('abr', 0) - else: - if format.get('vcodec') != 'none' and format.get('vbr') is None: - format['vbr'] = format.get('tbr') - format.get('abr', 0) - if format.get('acodec') != 'none' and format.get('abr') is None: - format['abr'] = format.get('tbr') - format.get('vbr', 0) - - return tuple(self._calculate_field_preference(format, field) for field in self._order) - - -# Deprecated -has_certifi = bool(certifi) -has_websockets = bool(websockets) - - -def load_plugins(name, suffix, namespace): - from .plugins import load_plugins - ret = load_plugins(name, suffix) - namespace.update(ret) - return ret +import asyncio +import atexit +import base64 +import binascii +import calendar +import codecs +import collections +import collections.abc +import contextlib +import datetime +import email.header +import email.utils +import errno +import gzip +import hashlib +import hmac +import html.entities +import html.parser +import http.client +import http.cookiejar +import inspect +import io +import itertools +import json +import locale +import math +import mimetypes +import netrc +import operator +import os +import platform +import random +import re +import shlex +import socket +import ssl +import struct +import subprocess +import sys +import tempfile +import time +import traceback +import types +import unicodedata +import urllib.error +import urllib.parse +import urllib.request +import xml.etree.ElementTree +import zlib + +from . 
import traversal + +from ..compat import functools # isort: split +from ..compat import ( + compat_etree_fromstring, + compat_expanduser, + compat_HTMLParseError, + compat_os_name, + compat_shlex_quote, +) +from ..dependencies import brotli, certifi, websockets, xattr +from ..socks import ProxyType, sockssocket + +__name__ = __name__.rsplit('.', 1)[0] # Pretend to be the parent module + +# This is not clearly defined otherwise +compiled_regex_type = type(re.compile('')) + + +def random_user_agent(): + _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36' + _CHROME_VERSIONS = ( + '90.0.4430.212', + '90.0.4430.24', + '90.0.4430.70', + '90.0.4430.72', + '90.0.4430.85', + '90.0.4430.93', + '91.0.4472.101', + '91.0.4472.106', + '91.0.4472.114', + '91.0.4472.124', + '91.0.4472.164', + '91.0.4472.19', + '91.0.4472.77', + '92.0.4515.107', + '92.0.4515.115', + '92.0.4515.131', + '92.0.4515.159', + '92.0.4515.43', + '93.0.4556.0', + '93.0.4577.15', + '93.0.4577.63', + '93.0.4577.82', + '94.0.4606.41', + '94.0.4606.54', + '94.0.4606.61', + '94.0.4606.71', + '94.0.4606.81', + '94.0.4606.85', + '95.0.4638.17', + '95.0.4638.50', + '95.0.4638.54', + '95.0.4638.69', + '95.0.4638.74', + '96.0.4664.18', + '96.0.4664.45', + '96.0.4664.55', + '96.0.4664.93', + '97.0.4692.20', + ) + return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS) + + +SUPPORTED_ENCODINGS = [ + 'gzip', 'deflate' +] +if brotli: + SUPPORTED_ENCODINGS.append('br') + +std_headers = { + 'User-Agent': random_user_agent(), + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'en-us,en;q=0.5', + 'Sec-Fetch-Mode': 'navigate', +} + + +USER_AGENTS = { + 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27', +} + + +class NO_DEFAULT: + pass + + +def IDENTITY(x): + return x + + +ENGLISH_MONTH_NAMES = [ + 'January', 'February', 'March', 
'April', 'May', 'June', + 'July', 'August', 'September', 'October', 'November', 'December'] + +MONTH_NAMES = { + 'en': ENGLISH_MONTH_NAMES, + 'fr': [ + 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', + 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'], + # these follow the genitive grammatical case (dopełniacz) + # some websites might be using nominative, which will require another month list + # https://en.wikibooks.org/wiki/Polish/Noun_cases + 'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca', + 'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'], +} + +# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42 +TIMEZONE_NAMES = { + 'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0, + 'AST': -4, 'ADT': -3, # Atlantic (used in Canada) + 'EST': -5, 'EDT': -4, # Eastern + 'CST': -6, 'CDT': -5, # Central + 'MST': -7, 'MDT': -6, # Mountain + 'PST': -8, 'PDT': -7 # Pacific +} + +# needed for sanitizing filenames in restricted mode +ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ', + itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'], + 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y'))) + +DATE_FORMATS = ( + '%d %B %Y', + '%d %b %Y', + '%B %d %Y', + '%B %dst %Y', + '%B %dnd %Y', + '%B %drd %Y', + '%B %dth %Y', + '%b %d %Y', + '%b %dst %Y', + '%b %dnd %Y', + '%b %drd %Y', + '%b %dth %Y', + '%b %dst %Y %I:%M', + '%b %dnd %Y %I:%M', + '%b %drd %Y %I:%M', + '%b %dth %Y %I:%M', + '%Y %m %d', + '%Y-%m-%d', + '%Y.%m.%d.', + '%Y/%m/%d', + '%Y/%m/%d %H:%M', + '%Y/%m/%d %H:%M:%S', + '%Y%m%d%H%M', + '%Y%m%d%H%M%S', + '%Y%m%d', + '%Y-%m-%d %H:%M', + '%Y-%m-%d %H:%M:%S', + '%Y-%m-%d %H:%M:%S.%f', + '%Y-%m-%d %H:%M:%S:%f', + '%d.%m.%Y %H:%M', + '%d.%m.%Y %H.%M', + '%Y-%m-%dT%H:%M:%SZ', + '%Y-%m-%dT%H:%M:%S.%fZ', + '%Y-%m-%dT%H:%M:%S.%f0Z', + '%Y-%m-%dT%H:%M:%S', + '%Y-%m-%dT%H:%M:%S.%f', + '%Y-%m-%dT%H:%M', + 
'%b %d %Y at %H:%M', + '%b %d %Y at %H:%M:%S', + '%B %d %Y at %H:%M', + '%B %d %Y at %H:%M:%S', + '%H:%M %d-%b-%Y', +) + +DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS) +DATE_FORMATS_DAY_FIRST.extend([ + '%d-%m-%Y', + '%d.%m.%Y', + '%d.%m.%y', + '%d/%m/%Y', + '%d/%m/%y', + '%d/%m/%Y %H:%M:%S', + '%d-%m-%Y %H:%M', + '%H:%M %d/%m/%Y', +]) + +DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS) +DATE_FORMATS_MONTH_FIRST.extend([ + '%m-%d-%Y', + '%m.%d.%Y', + '%m/%d/%Y', + '%m/%d/%y', + '%m/%d/%Y %H:%M:%S', +]) + +PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)" +JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>' + +NUMBER_RE = r'\d+(?:\.\d+)?' + + +@functools.cache +def preferredencoding(): + """Get preferred encoding. + + Returns the best encoding scheme for the system, based on + locale.getpreferredencoding() and some further tweaks. + """ + try: + pref = locale.getpreferredencoding() + 'TEST'.encode(pref) + except Exception: + pref = 'UTF-8' + + return pref + + +def write_json_file(obj, fn): + """ Encode obj as JSON and write it to fn, atomically if possible """ + + tf = tempfile.NamedTemporaryFile( + prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn), + suffix='.tmp', delete=False, mode='w', encoding='utf-8') + + try: + with tf: + json.dump(obj, tf, ensure_ascii=False) + if sys.platform == 'win32': + # Need to remove existing file on Windows, else os.rename raises + # WindowsError or FileExistsError. 
+ with contextlib.suppress(OSError): + os.unlink(fn) + with contextlib.suppress(OSError): + mask = os.umask(0) + os.umask(mask) + os.chmod(tf.name, 0o666 & ~mask) + os.rename(tf.name, fn) + except Exception: + with contextlib.suppress(OSError): + os.remove(tf.name) + raise + + +def find_xpath_attr(node, xpath, key, val=None): + """ Find the xpath xpath[@key=val] """ + assert re.match(r'^[a-zA-Z_-]+$', key) + expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']") + return node.find(expr) + +# On python2.6 the xml.etree.ElementTree.Element methods don't support +# the namespace parameter + + +def xpath_with_ns(path, ns_map): + components = [c.split(':') for c in path.split('/')] + replaced = [] + for c in components: + if len(c) == 1: + replaced.append(c[0]) + else: + ns, tag = c + replaced.append('{%s}%s' % (ns_map[ns], tag)) + return '/'.join(replaced) + + +def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT): + def _find_xpath(xpath): + return node.find(xpath) + + if isinstance(xpath, str): + n = _find_xpath(xpath) + else: + for xp in xpath: + n = _find_xpath(xp) + if n is not None: + break + + if n is None: + if default is not NO_DEFAULT: + return default + elif fatal: + name = xpath if name is None else name + raise ExtractorError('Could not find XML element %s' % name) + else: + return None + return n + + +def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT): + n = xpath_element(node, xpath, name, fatal=fatal, default=default) + if n is None or n == default: + return n + if n.text is None: + if default is not NO_DEFAULT: + return default + elif fatal: + name = xpath if name is None else name + raise ExtractorError('Could not find XML element\'s text %s' % name) + else: + return None + return n.text + + +def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT): + n = find_xpath_attr(node, xpath, key) + if n is None: + if default is not NO_DEFAULT: + return default + elif fatal: + name = 
f'{xpath}[@{key}]' if name is None else name + raise ExtractorError('Could not find XML attribute %s' % name) + else: + return None + return n.attrib[key] + + +def get_element_by_id(id, html, **kwargs): + """Return the content of the tag with the specified ID in the passed HTML document""" + return get_element_by_attribute('id', id, html, **kwargs) + + +def get_element_html_by_id(id, html, **kwargs): + """Return the html of the tag with the specified ID in the passed HTML document""" + return get_element_html_by_attribute('id', id, html, **kwargs) + + +def get_element_by_class(class_name, html): + """Return the content of the first tag with the specified class in the passed HTML document""" + retval = get_elements_by_class(class_name, html) + return retval[0] if retval else None + + +def get_element_html_by_class(class_name, html): + """Return the html of the first tag with the specified class in the passed HTML document""" + retval = get_elements_html_by_class(class_name, html) + return retval[0] if retval else None + + +def get_element_by_attribute(attribute, value, html, **kwargs): + retval = get_elements_by_attribute(attribute, value, html, **kwargs) + return retval[0] if retval else None + + +def get_element_html_by_attribute(attribute, value, html, **kargs): + retval = get_elements_html_by_attribute(attribute, value, html, **kargs) + return retval[0] if retval else None + + +def get_elements_by_class(class_name, html, **kargs): + """Return the content of all tags with the specified class in the passed HTML document as a list""" + return get_elements_by_attribute( + 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name), + html, escape_value=False) + + +def get_elements_html_by_class(class_name, html): + """Return the html of all tags with the specified class in the passed HTML document as a list""" + return get_elements_html_by_attribute( + 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name), + html, 
escape_value=False) + + +def get_elements_by_attribute(*args, **kwargs): + """Return the content of the tag with the specified attribute in the passed HTML document""" + return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)] + + +def get_elements_html_by_attribute(*args, **kwargs): + """Return the html of the tag with the specified attribute in the passed HTML document""" + return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)] + + +def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True): + """ + Return the text (content) and the html (whole) of the tag with the specified + attribute in the passed HTML document + """ + if not value: + return + + quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?' + + value = re.escape(value) if escape_value else value + + partial_element_re = rf'''(?x) + <(?P<tag>{tag}) + (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)? + \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q) + ''' + + for m in re.finditer(partial_element_re, html): + content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():]) + + yield ( + unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)), + whole + ) + + +class HTMLBreakOnClosingTagParser(html.parser.HTMLParser): + """ + HTML parser which raises HTMLBreakOnClosingTagException upon reaching the + closing tag for the first opening tag it has encountered, and can be used + as a context manager + """ + + class HTMLBreakOnClosingTagException(Exception): + pass + + def __init__(self): + self.tagstack = collections.deque() + html.parser.HTMLParser.__init__(self) + + def __enter__(self): + return self + + def __exit__(self, *_): + self.close() + + def close(self): + # handle_endtag does not return upon raising HTMLBreakOnClosingTagException, + # so data remains buffered; we no longer have any interest in it, thus + # 
override this method to discard it + pass + + def handle_starttag(self, tag, _): + self.tagstack.append(tag) + + def handle_endtag(self, tag): + if not self.tagstack: + raise compat_HTMLParseError('no tags in the stack') + while self.tagstack: + inner_tag = self.tagstack.pop() + if inner_tag == tag: + break + else: + raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found') + if not self.tagstack: + raise self.HTMLBreakOnClosingTagException() + + +# XXX: This should be far less strict +def get_element_text_and_html_by_tag(tag, html): + """ + For the first element with the specified tag in the passed HTML document + return its' content (text) and the whole element (html) + """ + def find_or_raise(haystack, needle, exc): + try: + return haystack.index(needle) + except ValueError: + raise exc + closing_tag = f'</{tag}>' + whole_start = find_or_raise( + html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found')) + content_start = find_or_raise( + html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag')) + content_start += whole_start + 1 + with HTMLBreakOnClosingTagParser() as parser: + parser.feed(html[whole_start:content_start]) + if not parser.tagstack or parser.tagstack[0] != tag: + raise compat_HTMLParseError(f'parser did not match opening {tag} tag') + offset = content_start + while offset < len(html): + next_closing_tag_start = find_or_raise( + html[offset:], closing_tag, + compat_HTMLParseError(f'closing {tag} tag not found')) + next_closing_tag_end = next_closing_tag_start + len(closing_tag) + try: + parser.feed(html[offset:offset + next_closing_tag_end]) + offset += next_closing_tag_end + except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException: + return html[content_start:offset + next_closing_tag_start], \ + html[whole_start:offset + next_closing_tag_end] + raise compat_HTMLParseError('unexpected end of html') + + +class HTMLAttributeParser(html.parser.HTMLParser): + """Trivial HTML 
parser to gather the attributes for a single element""" + + def __init__(self): + self.attrs = {} + html.parser.HTMLParser.__init__(self) + + def handle_starttag(self, tag, attrs): + self.attrs = dict(attrs) + raise compat_HTMLParseError('done') + + +class HTMLListAttrsParser(html.parser.HTMLParser): + """HTML parser to gather the attributes for the elements of a list""" + + def __init__(self): + html.parser.HTMLParser.__init__(self) + self.items = [] + self._level = 0 + + def handle_starttag(self, tag, attrs): + if tag == 'li' and self._level == 0: + self.items.append(dict(attrs)) + self._level += 1 + + def handle_endtag(self, tag): + self._level -= 1 + + +def extract_attributes(html_element): + """Given a string for an HTML element such as + <el + a="foo" B="bar" c="&98;az" d=boz + empty= noval entity="&" + sq='"' dq="'" + > + Decode and return a dictionary of attributes. + { + 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz', + 'empty': '', 'noval': None, 'entity': '&', + 'sq': '"', 'dq': '\'' + }. + """ + parser = HTMLAttributeParser() + with contextlib.suppress(compat_HTMLParseError): + parser.feed(html_element) + parser.close() + return parser.attrs + + +def parse_list(webpage): + """Given a string for an series of HTML <li> elements, + return a dictionary of their attributes""" + parser = HTMLListAttrsParser() + parser.feed(webpage) + parser.close() + return parser.items + + +def clean_html(html): + """Clean an HTML snippet into a readable string""" + + if html is None: # Convenience for sanitizing descriptions etc. 
+ return html + + html = re.sub(r'\s+', ' ', html) + html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html) + html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html) + # Strip html tags + html = re.sub('<.*?>', '', html) + # Replace html entities + html = unescapeHTML(html) + return html.strip() + + +class LenientJSONDecoder(json.JSONDecoder): + # TODO: Write tests + def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs): + self.transform_source, self.ignore_extra = transform_source, ignore_extra + self._close_attempts = 2 * close_objects + super().__init__(*args, **kwargs) + + @staticmethod + def _close_object(err): + doc = err.doc[:err.pos] + # We need to add comma first to get the correct error message + if err.msg.startswith('Expecting \',\''): + return doc + ',' + elif not doc.endswith(','): + return + + if err.msg.startswith('Expecting property name'): + return doc[:-1] + '}' + elif err.msg.startswith('Expecting value'): + return doc[:-1] + ']' + + def decode(self, s): + if self.transform_source: + s = self.transform_source(s) + for attempt in range(self._close_attempts + 1): + try: + if self.ignore_extra: + return self.raw_decode(s.lstrip())[0] + return super().decode(s) + except json.JSONDecodeError as e: + if e.pos is None: + raise + elif attempt < self._close_attempts: + s = self._close_object(e) + if s is not None: + continue + raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos) + assert False, 'Too many attempts to decode JSON' + + +def sanitize_open(filename, open_mode): + """Try to open the given filename, and slightly tweak it if this fails. + + Attempts to open the given filename. If this fails, it tries to change + the filename slightly, step by step, until it's either able to open it + or it fails and raises a final exception, like the standard open() + function. + + It returns the tuple (stream, definitive_file_name). 
+ """ + if filename == '-': + if sys.platform == 'win32': + import msvcrt + + # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout + with contextlib.suppress(io.UnsupportedOperation): + msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY) + return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename) + + for attempt in range(2): + try: + try: + if sys.platform == 'win32': + # FIXME: An exclusive lock also locks the file from being read. + # Since windows locks are mandatory, don't lock the file on windows (for now). + # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124 + raise LockingUnsupportedError() + stream = locked_file(filename, open_mode, block=False).__enter__() + except OSError: + stream = open(filename, open_mode) + return stream, filename + except OSError as err: + if attempt or err.errno in (errno.EACCES,): + raise + old_filename, filename = filename, sanitize_path(filename) + if old_filename == filename: + raise + + +def timeconvert(timestr): + """Convert RFC 2822 defined time string into system timestamp""" + timestamp = None + timetuple = email.utils.parsedate_tz(timestr) + if timetuple is not None: + timestamp = email.utils.mktime_tz(timetuple) + return timestamp + + +def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT): + """Sanitizes a string so it could be used as part of a filename. + @param restricted Use a stricter subset of allowed characters + @param is_id Whether this is an ID that should be kept unchanged if possible. + If unset, yt-dlp's new sanitization rules are in effect + """ + if s == '': + return '' + + def replace_insane(char): + if restricted and char in ACCENT_CHARS: + return ACCENT_CHARS[char] + elif not restricted and char == '\n': + return '\0 ' + elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\': + # Replace with their full-width unicode counterparts + return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0)) + elif char == '?' 
or ord(char) < 32 or ord(char) == 127: + return '' + elif char == '"': + return '' if restricted else '\'' + elif char == ':': + return '\0_\0-' if restricted else '\0 \0-' + elif char in '\\/|*<>': + return '\0_' + if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127): + return '\0_' + return char + + # Replace look-alike Unicode glyphs + if restricted and (is_id is NO_DEFAULT or not is_id): + s = unicodedata.normalize('NFKC', s) + s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) # Handle timestamps + result = ''.join(map(replace_insane, s)) + if is_id is NO_DEFAULT: + result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result) # Remove repeated substitute chars + STRIP_RE = r'(?:\0.|[ _-])*' + result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result) # Remove substitute chars from start/end + result = result.replace('\0', '') or '_' + + if not is_id: + while '__' in result: + result = result.replace('__', '_') + result = result.strip('_') + # Common case of "Foreign band name - English song title" + if restricted and result.startswith('-_'): + result = result[2:] + if result.startswith('-'): + result = '_' + result[len('-'):] + result = result.lstrip('.') + if not result: + result = '_' + return result + + +def sanitize_path(s, force=False): + """Sanitizes and normalizes path on Windows""" + if sys.platform == 'win32': + force = False + drive_or_unc, _ = os.path.splitdrive(s) + elif force: + drive_or_unc = '' + else: + return s + + norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep) + if drive_or_unc: + norm_path.pop(0) + sanitized_path = [ + path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part) + for path_part in norm_path] + if drive_or_unc: + sanitized_path.insert(0, drive_or_unc + os.path.sep) + elif force and s and s[0] == os.path.sep: + sanitized_path.insert(0, os.path.sep) + return os.path.join(*sanitized_path) + + +def 
sanitize_url(url, *, scheme='http'): + # Prepend protocol-less URLs with `http:` scheme in order to mitigate + # the number of unwanted failures due to missing protocol + if url is None: + return + elif url.startswith('//'): + return f'{scheme}:{url}' + # Fix some common typos seen so far + COMMON_TYPOS = ( + # https://github.com/ytdl-org/youtube-dl/issues/15649 + (r'^httpss://', r'https://'), + # https://bx1.be/lives/direct-tv/ + (r'^rmtp([es]?)://', r'rtmp\1://'), + ) + for mistake, fixup in COMMON_TYPOS: + if re.match(mistake, url): + return re.sub(mistake, fixup, url) + return url + + +def extract_basic_auth(url): + parts = urllib.parse.urlsplit(url) + if parts.username is None: + return url, None + url = urllib.parse.urlunsplit(parts._replace(netloc=( + parts.hostname if parts.port is None + else '%s:%d' % (parts.hostname, parts.port)))) + auth_payload = base64.b64encode( + ('%s:%s' % (parts.username, parts.password or '')).encode()) + return url, f'Basic {auth_payload.decode()}' + + +def sanitized_Request(url, *args, **kwargs): + url, auth_header = extract_basic_auth(escape_url(sanitize_url(url))) + if auth_header is not None: + headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {}) + headers['Authorization'] = auth_header + return urllib.request.Request(url, *args, **kwargs) + + +def expand_path(s): + """Expand shell variables and ~""" + return os.path.expandvars(compat_expanduser(s)) + + +def orderedSet(iterable, *, lazy=False): + """Remove all duplicates from the input iterable""" + def _iter(): + seen = [] # Do not use set since the items can be unhashable + for x in iterable: + if x not in seen: + seen.append(x) + yield x + + return _iter() if lazy else list(_iter()) + + +def _htmlentity_transform(entity_with_semicolon): + """Transforms an HTML entity to a character.""" + entity = entity_with_semicolon[:-1] + + # Known non-numeric HTML entity + if entity in html.entities.name2codepoint: + return 
chr(html.entities.name2codepoint[entity]) + + # TODO: HTML5 allows entities without a semicolon. + # E.g. 'Éric' should be decoded as 'Éric'. + if entity_with_semicolon in html.entities.html5: + return html.entities.html5[entity_with_semicolon] + + mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity) + if mobj is not None: + numstr = mobj.group(1) + if numstr.startswith('x'): + base = 16 + numstr = '0%s' % numstr + else: + base = 10 + # See https://github.com/ytdl-org/youtube-dl/issues/7518 + with contextlib.suppress(ValueError): + return chr(int(numstr, base)) + + # Unknown entity in name, return its literal representation + return '&%s;' % entity + + +def unescapeHTML(s): + if s is None: + return None + assert isinstance(s, str) + + return re.sub( + r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s) + + +def escapeHTML(text): + return ( + text + .replace('&', '&') + .replace('<', '<') + .replace('>', '>') + .replace('"', '"') + .replace("'", ''') + ) + + +class netrc_from_content(netrc.netrc): + def __init__(self, content): + self.hosts, self.macros = {}, {} + with io.StringIO(content) as stream: + self._parse('-', stream, False) + + +class Popen(subprocess.Popen): + if sys.platform == 'win32': + _startupinfo = subprocess.STARTUPINFO() + _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW + else: + _startupinfo = None + + @staticmethod + def _fix_pyinstaller_ld_path(env): + """Restore LD_LIBRARY_PATH when using PyInstaller + Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations + https://github.com/yt-dlp/yt-dlp/issues/4573 + """ + if not hasattr(sys, '_MEIPASS'): + return + + def _fix(key): + orig = env.get(f'{key}_ORIG') + if orig is None: + env.pop(key, None) + else: + env[key] = orig + + _fix('LD_LIBRARY_PATH') # Linux + _fix('DYLD_LIBRARY_PATH') # macOS + + def __init__(self, *args, env=None, text=False, **kwargs): + if env is None: + env = os.environ.copy() + 
self._fix_pyinstaller_ld_path(env) + + self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines') + if text is True: + kwargs['universal_newlines'] = True # For 3.6 compatibility + kwargs.setdefault('encoding', 'utf-8') + kwargs.setdefault('errors', 'replace') + super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo) + + def communicate_or_kill(self, *args, **kwargs): + try: + return self.communicate(*args, **kwargs) + except BaseException: # Including KeyboardInterrupt + self.kill(timeout=None) + raise + + def kill(self, *, timeout=0): + super().kill() + if timeout != 0: + self.wait(timeout=timeout) + + @classmethod + def run(cls, *args, timeout=None, **kwargs): + with cls(*args, **kwargs) as proc: + default = '' if proc.__text_mode else b'' + stdout, stderr = proc.communicate_or_kill(timeout=timeout) + return stdout or default, stderr or default, proc.returncode + + +def encodeArgument(s): + # Legacy code that uses byte strings + # Uncomment the following line after fixing all post processors + # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s)) + return s if isinstance(s, str) else s.decode('ascii') + + +_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds')) + + +def timetuple_from_msec(msec): + secs, msec = divmod(msec, 1000) + mins, secs = divmod(secs, 60) + hrs, mins = divmod(mins, 60) + return _timetuple(hrs, mins, secs, msec) + + +def formatSeconds(secs, delim=':', msec=False): + time = timetuple_from_msec(secs * 1000) + if time.hours: + ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds) + elif time.minutes: + ret = '%d%s%02d' % (time.minutes, delim, time.seconds) + else: + ret = '%d' % time.seconds + return '%s.%03d' % (ret, time.milliseconds) if msec else ret + + +def _ssl_load_windows_store_certs(ssl_context, storename): + # Code adapted from _load_windows_store_certs in 
https://github.com/python/cpython/blob/main/Lib/ssl.py + try: + certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename) + if encoding == 'x509_asn' and ( + trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)] + except PermissionError: + return + for cert in certs: + with contextlib.suppress(ssl.SSLError): + ssl_context.load_verify_locations(cadata=cert) + + +def make_HTTPS_handler(params, **kwargs): + opts_check_certificate = not params.get('nocheckcertificate') + context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + context.check_hostname = opts_check_certificate + if params.get('legacyserverconnect'): + context.options |= 4 # SSL_OP_LEGACY_SERVER_CONNECT + # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998 + context.set_ciphers('DEFAULT') + elif ( + sys.version_info < (3, 10) + and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1) + and not ssl.OPENSSL_VERSION.startswith('LibreSSL') + ): + # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1]. + # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting + # in some situations [2][3]. + # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely + # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe. + # LibreSSL is excluded until further investigation due to cipher support issues [5][6]. + # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536 + # 2. https://github.com/yt-dlp/yt-dlp/issues/4627 + # 3. https://github.com/yt-dlp/yt-dlp/pull/5294 + # 4. https://peps.python.org/pep-0644/ + # 5. https://peps.python.org/pep-0644/#libressl-support + # 6. 
https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368 + context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM') + context.minimum_version = ssl.TLSVersion.TLSv1_2 + + context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE + if opts_check_certificate: + if certifi and 'no-certifi' not in params.get('compat_opts', []): + context.load_verify_locations(cafile=certifi.where()) + else: + try: + context.load_default_certs() + # Work around the issue in load_default_certs when there are bad certificates. See: + # https://github.com/yt-dlp/yt-dlp/issues/1060, + # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312 + except ssl.SSLError: + # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151 + if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'): + for storename in ('CA', 'ROOT'): + _ssl_load_windows_store_certs(context, storename) + context.set_default_verify_paths() + + client_certfile = params.get('client_certificate') + if client_certfile: + try: + context.load_cert_chain( + client_certfile, keyfile=params.get('client_certificate_key'), + password=params.get('client_certificate_password')) + except ssl.SSLError: + raise YoutubeDLError('Unable to load client certificate') + + # Some servers may reject requests if ALPN extension is not sent. See: + # https://github.com/python/cpython/issues/85140 + # https://github.com/yt-dlp/yt-dlp/issues/3878 + with contextlib.suppress(NotImplementedError): + context.set_alpn_protocols(['http/1.1']) + + return YoutubeDLHTTPSHandler(params, context=context, **kwargs) + + +def bug_reports_message(before=';'): + from ..update import REPOSITORY + + msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , ' + 'filling out the appropriate issue template. 
Confirm you are on the latest version using yt-dlp -U') + + before = before.rstrip() + if not before or before.endswith(('.', '!', '?')): + msg = msg[0].title() + msg[1:] + + return (before + ' ' if before else '') + msg + + +class YoutubeDLError(Exception): + """Base exception for YoutubeDL errors.""" + msg = None + + def __init__(self, msg=None): + if msg is not None: + self.msg = msg + elif self.msg is None: + self.msg = type(self).__name__ + super().__init__(self.msg) + + +network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error] +if hasattr(ssl, 'CertificateError'): + network_exceptions.append(ssl.CertificateError) +network_exceptions = tuple(network_exceptions) + + +class ExtractorError(YoutubeDLError): + """Error during info extraction.""" + + def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None): + """ tb, if given, is the original traceback (so that it can be printed out). + If expected is set, this is a normal error message and most likely not a bug in yt-dlp. 
+ """ + if sys.exc_info()[0] in network_exceptions: + expected = True + + self.orig_msg = str(msg) + self.traceback = tb + self.expected = expected + self.cause = cause + self.video_id = video_id + self.ie = ie + self.exc_info = sys.exc_info() # preserve original exception + if isinstance(self.exc_info[1], ExtractorError): + self.exc_info = self.exc_info[1].exc_info + super().__init__(self.__msg) + + @property + def __msg(self): + return ''.join(( + format_field(self.ie, None, '[%s] '), + format_field(self.video_id, None, '%s: '), + self.orig_msg, + format_field(self.cause, None, ' (caused by %r)'), + '' if self.expected else bug_reports_message())) + + def format_traceback(self): + return join_nonempty( + self.traceback and ''.join(traceback.format_tb(self.traceback)), + self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]), + delim='\n') or None + + def __setattr__(self, name, value): + super().__setattr__(name, value) + if getattr(self, 'msg', None) and name not in ('msg', 'args'): + self.msg = self.__msg or type(self).__name__ + self.args = (self.msg, ) # Cannot be property + + +class UnsupportedError(ExtractorError): + def __init__(self, url): + super().__init__( + 'Unsupported URL: %s' % url, expected=True) + self.url = url + + +class RegexNotFoundError(ExtractorError): + """Error when a regex didn't match""" + pass + + +class GeoRestrictedError(ExtractorError): + """Geographic restriction Error exception. + + This exception may be thrown when a video is not available from your + geographic location due to geographic restrictions imposed by a website. 
+ """ + + def __init__(self, msg, countries=None, **kwargs): + kwargs['expected'] = True + super().__init__(msg, **kwargs) + self.countries = countries + + +class UserNotLive(ExtractorError): + """Error when a channel/user is not live""" + + def __init__(self, msg=None, **kwargs): + kwargs['expected'] = True + super().__init__(msg or 'The channel is not currently live', **kwargs) + + +class DownloadError(YoutubeDLError): + """Download Error exception. + + This exception may be thrown by FileDownloader objects if they are not + configured to continue on errors. They will contain the appropriate + error message. + """ + + def __init__(self, msg, exc_info=None): + """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """ + super().__init__(msg) + self.exc_info = exc_info + + +class EntryNotInPlaylist(YoutubeDLError): + """Entry not in playlist exception. + + This exception will be thrown by YoutubeDL when a requested entry + is not found in the playlist info_dict + """ + msg = 'Entry not found in info' + + +class SameFileError(YoutubeDLError): + """Same File exception. + + This exception will be thrown by FileDownloader objects if they detect + multiple files would have to be downloaded to the same file on disk. + """ + msg = 'Fixed output name but more than one file to download' + + def __init__(self, filename=None): + if filename is not None: + self.msg += f': {filename}' + super().__init__(self.msg) + + +class PostProcessingError(YoutubeDLError): + """Post Processing exception. + + This exception may be raised by PostProcessor's .run() method to + indicate an error in the postprocessing task. 
+ """ + + +class DownloadCancelled(YoutubeDLError): + """ Exception raised when the download queue should be interrupted """ + msg = 'The download was cancelled' + + +class ExistingVideoReached(DownloadCancelled): + """ --break-on-existing triggered """ + msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing' + + +class RejectedVideoReached(DownloadCancelled): + """ --break-match-filter triggered """ + msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter' + + +class MaxDownloadsReached(DownloadCancelled): + """ --max-downloads limit has been reached. """ + msg = 'Maximum number of downloads reached, stopping due to --max-downloads' + + +class ReExtractInfo(YoutubeDLError): + """ Video info needs to be re-extracted. """ + + def __init__(self, msg, expected=False): + super().__init__(msg) + self.expected = expected + + +class ThrottledDownload(ReExtractInfo): + """ Download speed below --throttled-rate. """ + msg = 'The download speed is below throttle limit' + + def __init__(self): + super().__init__(self.msg, expected=False) + + +class UnavailableVideoError(YoutubeDLError): + """Unavailable Format exception. + + This exception will be thrown when a video is requested + in a format that is not available for that video. + """ + msg = 'Unable to download video' + + def __init__(self, err=None): + if err is not None: + self.msg += f': {err}' + super().__init__(self.msg) + + +class ContentTooShortError(YoutubeDLError): + """Content Too Short exception. + + This exception may be raised by FileDownloader objects when a file they + download is too small for what the server announced first, indicating + the connection was probably interrupted. 
+ """ + + def __init__(self, downloaded, expected): + super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes') + # Both in bytes + self.downloaded = downloaded + self.expected = expected + + +class XAttrMetadataError(YoutubeDLError): + def __init__(self, code=None, msg='Unknown error'): + super().__init__(msg) + self.code = code + self.msg = msg + + # Parsing code and msg + if (self.code in (errno.ENOSPC, errno.EDQUOT) + or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg): + self.reason = 'NO_SPACE' + elif self.code == errno.E2BIG or 'Argument list too long' in self.msg: + self.reason = 'VALUE_TOO_LONG' + else: + self.reason = 'NOT_SUPPORTED' + + +class XAttrUnavailableError(YoutubeDLError): + pass + + +def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): + hc = http_class(*args, **kwargs) + source_address = ydl_handler._params.get('source_address') + + if source_address is not None: + # This is to workaround _create_connection() from socket where it will try all + # address data from getaddrinfo() including IPv6. This filters the result from + # getaddrinfo() based on the source_address value. + # This is based on the cpython socket.create_connection() function. + # https://github.com/python/cpython/blob/master/Lib/socket.py#L691 + def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None): + host, port = address + err = None + addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM) + af = socket.AF_INET if '.' 
in source_address[0] else socket.AF_INET6 + ip_addrs = [addr for addr in addrs if addr[0] == af] + if addrs and not ip_addrs: + ip_version = 'v4' if af == socket.AF_INET else 'v6' + raise OSError( + "No remote IP%s addresses available for connect, can't use '%s' as source address" + % (ip_version, source_address[0])) + for res in ip_addrs: + af, socktype, proto, canonname, sa = res + sock = None + try: + sock = socket.socket(af, socktype, proto) + if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT: + sock.settimeout(timeout) + sock.bind(source_address) + sock.connect(sa) + err = None # Explicitly break reference cycle + return sock + except OSError as _: + err = _ + if sock is not None: + sock.close() + if err is not None: + raise err + else: + raise OSError('getaddrinfo returns an empty list') + if hasattr(hc, '_create_connection'): + hc._create_connection = _create_connection + hc.source_address = (source_address, 0) + + return hc + + +class YoutubeDLHandler(urllib.request.HTTPHandler): + """Handler for HTTP requests and responses. + + This class, when installed with an OpenerDirector, automatically adds + the standard headers to every HTTP request and handles gzipped, deflated and + brotli responses from web servers. + + Part of this code was copied from: + + http://techknack.net/python-urllib2-handlers/ + + Andrew Rowls, the author of that code, agreed to release it to the + public domain. 
+ """ + + def __init__(self, params, *args, **kwargs): + urllib.request.HTTPHandler.__init__(self, *args, **kwargs) + self._params = params + + def http_open(self, req): + conn_class = http.client.HTTPConnection + + socks_proxy = req.headers.get('Ytdl-socks-proxy') + if socks_proxy: + conn_class = make_socks_conn_class(conn_class, socks_proxy) + del req.headers['Ytdl-socks-proxy'] + + return self.do_open(functools.partial( + _create_http_connection, self, conn_class, False), + req) + + @staticmethod + def deflate(data): + if not data: + return data + try: + return zlib.decompress(data, -zlib.MAX_WBITS) + except zlib.error: + return zlib.decompress(data) + + @staticmethod + def brotli(data): + if not data: + return data + return brotli.decompress(data) + + @staticmethod + def gz(data): + gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb') + try: + return gz.read() + except OSError as original_oserror: + # There may be junk add the end of the file + # See http://stackoverflow.com/q/4928560/35070 for details + for i in range(1, 1024): + try: + gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb') + return gz.read() + except OSError: + continue + else: + raise original_oserror + + def http_request(self, req): + # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not + # always respected by websites, some tend to give out URLs with non percent-encoded + # non-ASCII characters (see telemb.py, ard.py [#3412]) + # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991) + # To work around aforementioned issue we will replace request's original URL with + # percent-encoded one + # Since redirects are also affected (e.g. 
http://www.southpark.de/alle-episoden/s18e09) + # the code of this workaround has been moved here from YoutubeDL.urlopen() + url = req.get_full_url() + url_escaped = escape_url(url) + + # Substitute URL if any change after escaping + if url != url_escaped: + req = update_Request(req, url=url_escaped) + + for h, v in self._params.get('http_headers', std_headers).items(): + # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275 + # The dict keys are capitalized because of this bug by urllib + if h.capitalize() not in req.headers: + req.add_header(h, v) + + if 'Youtubedl-no-compression' in req.headers: # deprecated + req.headers.pop('Youtubedl-no-compression', None) + req.add_header('Accept-encoding', 'identity') + + if 'Accept-encoding' not in req.headers: + req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS)) + + return super().do_request_(req) + + def http_response(self, req, resp): + old_resp = resp + + # Content-Encoding header lists the encodings in order that they were applied [1]. + # To decompress, we simply do the reverse. + # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding + decoded_response = None + for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))): + if encoding == 'gzip': + decoded_response = self.gz(decoded_response or resp.read()) + elif encoding == 'deflate': + decoded_response = self.deflate(decoded_response or resp.read()) + elif encoding == 'br' and brotli: + decoded_response = self.brotli(decoded_response or resp.read()) + + if decoded_response is not None: + resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code) + resp.msg = old_resp.msg + # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see + # https://github.com/ytdl-org/youtube-dl/issues/6457). 
+ if 300 <= resp.code < 400: + location = resp.headers.get('Location') + if location: + # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3 + location = location.encode('iso-8859-1').decode() + location_escaped = escape_url(location) + if location != location_escaped: + del resp.headers['Location'] + resp.headers['Location'] = location_escaped + return resp + + https_request = http_request + https_response = http_response + + +def make_socks_conn_class(base_class, socks_proxy): + assert issubclass(base_class, ( + http.client.HTTPConnection, http.client.HTTPSConnection)) + + url_components = urllib.parse.urlparse(socks_proxy) + if url_components.scheme.lower() == 'socks5': + socks_type = ProxyType.SOCKS5 + elif url_components.scheme.lower() in ('socks', 'socks4'): + socks_type = ProxyType.SOCKS4 + elif url_components.scheme.lower() == 'socks4a': + socks_type = ProxyType.SOCKS4A + + def unquote_if_non_empty(s): + if not s: + return s + return urllib.parse.unquote_plus(s) + + proxy_args = ( + socks_type, + url_components.hostname, url_components.port or 1080, + True, # Remote DNS + unquote_if_non_empty(url_components.username), + unquote_if_non_empty(url_components.password), + ) + + class SocksConnection(base_class): + def connect(self): + self.sock = sockssocket() + self.sock.setproxy(*proxy_args) + if isinstance(self.timeout, (int, float)): + self.sock.settimeout(self.timeout) + self.sock.connect((self.host, self.port)) + + if isinstance(self, http.client.HTTPSConnection): + if hasattr(self, '_context'): # Python > 2.6 + self.sock = self._context.wrap_socket( + self.sock, server_hostname=self.host) + else: + self.sock = ssl.wrap_socket(self.sock) + + return SocksConnection + + +class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler): + def __init__(self, params, https_conn_class=None, *args, **kwargs): + urllib.request.HTTPSHandler.__init__(self, *args, **kwargs) + self._https_conn_class = https_conn_class or http.client.HTTPSConnection + 
self._params = params + + def https_open(self, req): + kwargs = {} + conn_class = self._https_conn_class + + if hasattr(self, '_context'): # python > 2.6 + kwargs['context'] = self._context + if hasattr(self, '_check_hostname'): # python 3.x + kwargs['check_hostname'] = self._check_hostname + + socks_proxy = req.headers.get('Ytdl-socks-proxy') + if socks_proxy: + conn_class = make_socks_conn_class(conn_class, socks_proxy) + del req.headers['Ytdl-socks-proxy'] + + try: + return self.do_open( + functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs) + except urllib.error.URLError as e: + if (isinstance(e.reason, ssl.SSLError) + and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'): + raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect') + raise + + +def is_path_like(f): + return isinstance(f, (str, bytes, os.PathLike)) + + +class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor): + def __init__(self, cookiejar=None): + urllib.request.HTTPCookieProcessor.__init__(self, cookiejar) + + def http_response(self, request, response): + return urllib.request.HTTPCookieProcessor.http_response(self, request, response) + + https_request = urllib.request.HTTPCookieProcessor.http_request + https_response = http_response + + +class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler): + """YoutubeDL redirect handler + + The code is based on HTTPRedirectHandler implementation from CPython [1]. + + This redirect handler fixes and improves the logic to better align with RFC7261 + and what browsers tend to do [2][3] + + 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py + 2. https://datatracker.ietf.org/doc/html/rfc7231 + 3. 
https://github.com/python/cpython/issues/91306 + """ + + http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302 + + def redirect_request(self, req, fp, code, msg, headers, newurl): + if code not in (301, 302, 303, 307, 308): + raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp) + + new_method = req.get_method() + new_data = req.data + remove_headers = [] + # A 303 must either use GET or HEAD for subsequent request + # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4 + if code == 303 and req.get_method() != 'HEAD': + new_method = 'GET' + # 301 and 302 redirects are commonly turned into a GET from a POST + # for subsequent requests by browsers, so we'll do the same. + # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2 + # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3 + elif code in (301, 302) and req.get_method() == 'POST': + new_method = 'GET' + + # only remove payload if method changed (e.g. POST to GET) + if new_method != req.get_method(): + new_data = None + remove_headers.extend(['Content-Length', 'Content-Type']) + + new_headers = {k: v for k, v in req.headers.items() if k.lower() not in remove_headers} + + return urllib.request.Request( + newurl, headers=new_headers, origin_req_host=req.origin_req_host, + unverifiable=True, method=new_method, data=new_data) + + +def extract_timezone(date_str): + m = re.search( + r'''(?x) + ^.{8,}? # >=8 char non-TZ prefix, if present + (?P<tz>Z| # just the UTC Z, or + (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or + (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits + [ ]? 
# optional space + (?P<sign>\+|-) # +/- + (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm + $) + ''', date_str) + if not m: + m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str) + timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip()) + if timezone is not None: + date_str = date_str[:-len(m.group('tz'))] + timezone = datetime.timedelta(hours=timezone or 0) + else: + date_str = date_str[:-len(m.group('tz'))] + if not m.group('sign'): + timezone = datetime.timedelta() + else: + sign = 1 if m.group('sign') == '+' else -1 + timezone = datetime.timedelta( + hours=sign * int(m.group('hours')), + minutes=sign * int(m.group('minutes'))) + return timezone, date_str + + +def parse_iso8601(date_str, delimiter='T', timezone=None): + """ Return a UNIX timestamp from the given date """ + + if date_str is None: + return None + + date_str = re.sub(r'\.[0-9]+', '', date_str) + + if timezone is None: + timezone, date_str = extract_timezone(date_str) + + with contextlib.suppress(ValueError): + date_format = f'%Y-%m-%d{delimiter}%H:%M:%S' + dt = datetime.datetime.strptime(date_str, date_format) - timezone + return calendar.timegm(dt.timetuple()) + + +def date_formats(day_first=True): + return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST + + +def unified_strdate(date_str, day_first=True): + """Return a string with the date in the format YYYYMMDD""" + + if date_str is None: + return None + upload_date = None + # Replace commas + date_str = date_str.replace(',', ' ') + # Remove AM/PM + timezone + date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) + _, date_str = extract_timezone(date_str) + + for expression in date_formats(day_first): + with contextlib.suppress(ValueError): + upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') + if upload_date is None: + timetuple = email.utils.parsedate_tz(date_str) + if timetuple: + with contextlib.suppress(ValueError): + upload_date = 
datetime.datetime(*timetuple[:6]).strftime('%Y%m%d') + if upload_date is not None: + return str(upload_date) + + +def unified_timestamp(date_str, day_first=True): + if not isinstance(date_str, str): + return None + + date_str = re.sub(r'\s+', ' ', re.sub( + r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str)) + + pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0 + timezone, date_str = extract_timezone(date_str) + + # Remove AM/PM + timezone + date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) + + # Remove unrecognized timezones from ISO 8601 alike timestamps + m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str) + if m: + date_str = date_str[:-len(m.group('tz'))] + + # Python only supports microseconds, so remove nanoseconds + m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str) + if m: + date_str = m.group(1) + + for expression in date_formats(day_first): + with contextlib.suppress(ValueError): + dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta) + return calendar.timegm(dt.timetuple()) + + timetuple = email.utils.parsedate_tz(date_str) + if timetuple: + return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds() + + +def determine_ext(url, default_ext='unknown_video'): + if url is None or '.' not in url: + return default_ext + guess = url.partition('?')[0].rpartition('.')[2] + if re.match(r'^[A-Za-z0-9]+$', guess): + return guess + # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download + elif guess.rstrip('/') in KNOWN_EXTENSIONS: + return guess.rstrip('/') + else: + return default_ext + + +def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None): + return replace_extension(filename, sub_lang + '.' 
+ sub_format, expected_real_ext) + + +def datetime_from_str(date_str, precision='auto', format='%Y%m%d'): + R""" + Return a datetime object from a string. + Supported format: + (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)? + + @param format strftime format of DATE + @param precision Round the datetime object: auto|microsecond|second|minute|hour|day + auto: round to the unit provided in date_str (if applicable). + """ + auto_precision = False + if precision == 'auto': + auto_precision = True + precision = 'microsecond' + today = datetime_round(datetime.datetime.utcnow(), precision) + if date_str in ('now', 'today'): + return today + if date_str == 'yesterday': + return today - datetime.timedelta(days=1) + match = re.match( + r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?', + date_str) + if match is not None: + start_time = datetime_from_str(match.group('start'), precision, format) + time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1) + unit = match.group('unit') + if unit == 'month' or unit == 'year': + new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time) + unit = 'day' + else: + if unit == 'week': + unit = 'day' + time *= 7 + delta = datetime.timedelta(**{unit + 's': time}) + new_date = start_time + delta + if auto_precision: + return datetime_round(new_date, unit) + return new_date + + return datetime_round(datetime.datetime.strptime(date_str, format), precision) + + +def date_from_str(date_str, format='%Y%m%d', strict=False): + R""" + Return a date object from a string using datetime_from_str + + @param strict Restrict allowed patterns to "YYYYMMDD" and + (now|today|yesterday)(-\d+(day|week|month|year)s?)? 
+ """ + if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str): + raise ValueError(f'Invalid date format "{date_str}"') + return datetime_from_str(date_str, precision='microsecond', format=format).date() + + +def datetime_add_months(dt, months): + """Increment/Decrement a datetime object by months.""" + month = dt.month + months - 1 + year = dt.year + month // 12 + month = month % 12 + 1 + day = min(dt.day, calendar.monthrange(year, month)[1]) + return dt.replace(year, month, day) + + +def datetime_round(dt, precision='day'): + """ + Round a datetime object's time to a specific precision + """ + if precision == 'microsecond': + return dt + + unit_seconds = { + 'day': 86400, + 'hour': 3600, + 'minute': 60, + 'second': 1, + } + roundto = lambda x, n: ((x + n / 2) // n) * n + timestamp = calendar.timegm(dt.timetuple()) + return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision])) + + +def hyphenate_date(date_str): + """ + Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format""" + match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str) + if match is not None: + return '-'.join(match.groups()) + else: + return date_str + + +class DateRange: + """Represents a time interval between two dates""" + + def __init__(self, start=None, end=None): + """start and end must be strings in the format accepted by date""" + if start is not None: + self.start = date_from_str(start, strict=True) + else: + self.start = datetime.datetime.min.date() + if end is not None: + self.end = date_from_str(end, strict=True) + else: + self.end = datetime.datetime.max.date() + if self.start > self.end: + raise ValueError('Date range: "%s" , the start date must be before the end date' % self) + + @classmethod + def day(cls, day): + """Returns a range that only contains the given day""" + return cls(day, day) + + def __contains__(self, date): + """Check if the date is in the range""" + if not isinstance(date, datetime.date): 
+ date = date_from_str(date) + return self.start <= date <= self.end + + def __repr__(self): + return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})' + + def __eq__(self, other): + return (isinstance(other, DateRange) + and self.start == other.start and self.end == other.end) + + +@functools.cache +def system_identifier(): + python_implementation = platform.python_implementation() + if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'): + python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3] + libc_ver = [] + with contextlib.suppress(OSError): # We may not have access to the executable + libc_ver = platform.libc_ver() + + return 'Python %s (%s %s %s) - %s (%s%s)' % ( + platform.python_version(), + python_implementation, + platform.machine(), + platform.architecture()[0], + platform.platform(), + ssl.OPENSSL_VERSION, + format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'), + ) + + +@functools.cache +def get_windows_version(): + ''' Get Windows version. returns () if it's not running on Windows ''' + if compat_os_name == 'nt': + return version_tuple(platform.win32_ver()[1]) + else: + return () + + +def write_string(s, out=None, encoding=None): + assert isinstance(s, str) + out = out or sys.stderr + # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217) + if not out: + return + + if compat_os_name == 'nt' and supports_terminal_sequences(out): + s = re.sub(r'([\r\n]+)', r' \1', s) + + enc, buffer = None, out + if 'b' in getattr(out, 'mode', ''): + enc = encoding or preferredencoding() + elif hasattr(out, 'buffer'): + buffer = out.buffer + enc = encoding or getattr(out, 'encoding', None) or preferredencoding() + + buffer.write(s.encode(enc, 'ignore') if enc else s) + out.flush() + + +def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs): + from .. 
import _IN_CLI + if _IN_CLI: + if msg in deprecation_warning._cache: + return + deprecation_warning._cache.add(msg) + if printer: + return printer(f'{msg}{bug_reports_message()}', **kwargs) + return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs) + else: + import warnings + warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3) + + +deprecation_warning._cache = set() + + +def bytes_to_intlist(bs): + if not bs: + return [] + if isinstance(bs[0], int): # Python 3 + return list(bs) + else: + return [ord(c) for c in bs] + + +def intlist_to_bytes(xs): + if not xs: + return b'' + return struct.pack('%dB' % len(xs), *xs) + + +class LockingUnsupportedError(OSError): + msg = 'File locking is not supported' + + def __init__(self): + super().__init__(self.msg) + + +# Cross-platform file locking +if sys.platform == 'win32': + import ctypes + import ctypes.wintypes + import msvcrt + + class OVERLAPPED(ctypes.Structure): + _fields_ = [ + ('Internal', ctypes.wintypes.LPVOID), + ('InternalHigh', ctypes.wintypes.LPVOID), + ('Offset', ctypes.wintypes.DWORD), + ('OffsetHigh', ctypes.wintypes.DWORD), + ('hEvent', ctypes.wintypes.HANDLE), + ] + + kernel32 = ctypes.WinDLL('kernel32') + LockFileEx = kernel32.LockFileEx + LockFileEx.argtypes = [ + ctypes.wintypes.HANDLE, # hFile + ctypes.wintypes.DWORD, # dwFlags + ctypes.wintypes.DWORD, # dwReserved + ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow + ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh + ctypes.POINTER(OVERLAPPED) # Overlapped + ] + LockFileEx.restype = ctypes.wintypes.BOOL + UnlockFileEx = kernel32.UnlockFileEx + UnlockFileEx.argtypes = [ + ctypes.wintypes.HANDLE, # hFile + ctypes.wintypes.DWORD, # dwReserved + ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow + ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh + ctypes.POINTER(OVERLAPPED) # Overlapped + ] + UnlockFileEx.restype = ctypes.wintypes.BOOL + whole_low = 0xffffffff + whole_high = 0x7fffffff + + def _lock_file(f, exclusive, block): + 
overlapped = OVERLAPPED() + overlapped.Offset = 0 + overlapped.OffsetHigh = 0 + overlapped.hEvent = 0 + f._lock_file_overlapped_p = ctypes.pointer(overlapped) + + if not LockFileEx(msvcrt.get_osfhandle(f.fileno()), + (0x2 if exclusive else 0x0) | (0x0 if block else 0x1), + 0, whole_low, whole_high, f._lock_file_overlapped_p): + # NB: No argument form of "ctypes.FormatError" does not work on PyPy + raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}') + + def _unlock_file(f): + assert f._lock_file_overlapped_p + handle = msvcrt.get_osfhandle(f.fileno()) + if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p): + raise OSError('Unlocking file failed: %r' % ctypes.FormatError()) + +else: + try: + import fcntl + + def _lock_file(f, exclusive, block): + flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH + if not block: + flags |= fcntl.LOCK_NB + try: + fcntl.flock(f, flags) + except BlockingIOError: + raise + except OSError: # AOSP does not have flock() + fcntl.lockf(f, flags) + + def _unlock_file(f): + with contextlib.suppress(OSError): + return fcntl.flock(f, fcntl.LOCK_UN) + with contextlib.suppress(OSError): + return fcntl.lockf(f, fcntl.LOCK_UN) # AOSP does not have flock() + return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB) # virtiofs needs LOCK_NB on unlocking + + except ImportError: + + def _lock_file(f, exclusive, block): + raise LockingUnsupportedError() + + def _unlock_file(f): + raise LockingUnsupportedError() + + +class locked_file: + locked = False + + def __init__(self, filename, mode, block=True, encoding=None): + if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}: + raise NotImplementedError(mode) + self.mode, self.block = mode, block + + writable = any(f in mode for f in 'wax+') + readable = any(f in mode for f in 'r+') + flags = functools.reduce(operator.ior, ( + getattr(os, 'O_CLOEXEC', 0), # UNIX only + getattr(os, 'O_BINARY', 0), # Windows only + getattr(os, 'O_NOINHERIT', 0), # 
Windows only + os.O_CREAT if writable else 0, # O_TRUNC only after locking + os.O_APPEND if 'a' in mode else 0, + os.O_EXCL if 'x' in mode else 0, + os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY, + )) + + self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding) + + def __enter__(self): + exclusive = 'r' not in self.mode + try: + _lock_file(self.f, exclusive, self.block) + self.locked = True + except OSError: + self.f.close() + raise + if 'w' in self.mode: + try: + self.f.truncate() + except OSError as e: + if e.errno not in ( + errno.ESPIPE, # Illegal seek - expected for FIFO + errno.EINVAL, # Invalid argument - expected for /dev/null + ): + raise + return self + + def unlock(self): + if not self.locked: + return + try: + _unlock_file(self.f) + finally: + self.locked = False + + def __exit__(self, *_): + try: + self.unlock() + finally: + self.f.close() + + open = __enter__ + close = __exit__ + + def __getattr__(self, attr): + return getattr(self.f, attr) + + def __iter__(self): + return iter(self.f) + + +@functools.cache +def get_filesystem_encoding(): + encoding = sys.getfilesystemencoding() + return encoding if encoding is not None else 'utf-8' + + +def shell_quote(args): + quoted_args = [] + encoding = get_filesystem_encoding() + for a in args: + if isinstance(a, bytes): + # We may get a filename encoded with 'encodeFilename' + a = a.decode(encoding) + quoted_args.append(compat_shlex_quote(a)) + return ' '.join(quoted_args) + + +def smuggle_url(url, data): + """ Pass additional data in a URL for internal use. 
""" + + url, idata = unsmuggle_url(url, {}) + data.update(idata) + sdata = urllib.parse.urlencode( + {'__youtubedl_smuggle': json.dumps(data)}) + return url + '#' + sdata + + +def unsmuggle_url(smug_url, default=None): + if '#__youtubedl_smuggle' not in smug_url: + return smug_url, default + url, _, sdata = smug_url.rpartition('#') + jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0] + data = json.loads(jsond) + return url, data + + +def format_decimal_suffix(num, fmt='%d%s', *, factor=1000): + """ Formats numbers with decimal sufixes like K, M, etc """ + num, factor = float_or_none(num), float(factor) + if num is None or num < 0: + return None + POSSIBLE_SUFFIXES = 'kMGTPEZY' + exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES)) + suffix = ['', *POSSIBLE_SUFFIXES][exponent] + if factor == 1024: + suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i') + converted = num / (factor ** exponent) + return fmt % (converted, suffix) + + +def format_bytes(bytes): + return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A' + + +def lookup_unit_table(unit_table, s, strict=False): + num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]') + units_re = '|'.join(re.escape(u) for u in unit_table) + m = (re.fullmatch if strict else re.match)( + rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s) + if not m: + return None + + num = float(m.group('num').replace(',', '.')) + mult = unit_table[m.group('unit')] + return round(num * mult) + + +def parse_bytes(s): + """Parse a string indicating a byte quantity into an integer""" + return lookup_unit_table( + {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])}, + s.upper(), strict=True) + + +def parse_filesize(s): + if s is None: + return None + + # The lower-case forms are of course incorrect and unofficial, + # but we support those too + _UNIT_TABLE = { + 'B': 1, + 'b': 1, + 'bytes': 1, + 'KiB': 1024, + 'KB': 1000, + 'kB': 1024, + 'Kb': 1000, + 'kb': 1000, + 
'kilobytes': 1000, + 'kibibytes': 1024, + 'MiB': 1024 ** 2, + 'MB': 1000 ** 2, + 'mB': 1024 ** 2, + 'Mb': 1000 ** 2, + 'mb': 1000 ** 2, + 'megabytes': 1000 ** 2, + 'mebibytes': 1024 ** 2, + 'GiB': 1024 ** 3, + 'GB': 1000 ** 3, + 'gB': 1024 ** 3, + 'Gb': 1000 ** 3, + 'gb': 1000 ** 3, + 'gigabytes': 1000 ** 3, + 'gibibytes': 1024 ** 3, + 'TiB': 1024 ** 4, + 'TB': 1000 ** 4, + 'tB': 1024 ** 4, + 'Tb': 1000 ** 4, + 'tb': 1000 ** 4, + 'terabytes': 1000 ** 4, + 'tebibytes': 1024 ** 4, + 'PiB': 1024 ** 5, + 'PB': 1000 ** 5, + 'pB': 1024 ** 5, + 'Pb': 1000 ** 5, + 'pb': 1000 ** 5, + 'petabytes': 1000 ** 5, + 'pebibytes': 1024 ** 5, + 'EiB': 1024 ** 6, + 'EB': 1000 ** 6, + 'eB': 1024 ** 6, + 'Eb': 1000 ** 6, + 'eb': 1000 ** 6, + 'exabytes': 1000 ** 6, + 'exbibytes': 1024 ** 6, + 'ZiB': 1024 ** 7, + 'ZB': 1000 ** 7, + 'zB': 1024 ** 7, + 'Zb': 1000 ** 7, + 'zb': 1000 ** 7, + 'zettabytes': 1000 ** 7, + 'zebibytes': 1024 ** 7, + 'YiB': 1024 ** 8, + 'YB': 1000 ** 8, + 'yB': 1024 ** 8, + 'Yb': 1000 ** 8, + 'yb': 1000 ** 8, + 'yottabytes': 1000 ** 8, + 'yobibytes': 1024 ** 8, + } + + return lookup_unit_table(_UNIT_TABLE, s) + + +def parse_count(s): + if s is None: + return None + + s = re.sub(r'^[^\d]+\s', '', s).strip() + + if re.match(r'^[\d,.]+$', s): + return str_to_int(s) + + _UNIT_TABLE = { + 'k': 1000, + 'K': 1000, + 'm': 1000 ** 2, + 'M': 1000 ** 2, + 'kk': 1000 ** 2, + 'KK': 1000 ** 2, + 'b': 1000 ** 3, + 'B': 1000 ** 3, + } + + ret = lookup_unit_table(_UNIT_TABLE, s) + if ret is not None: + return ret + + mobj = re.match(r'([\d,.]+)(?:$|\s)', s) + if mobj: + return str_to_int(mobj.group(1)) + + +def parse_resolution(s, *, lenient=False): + if s is None: + return {} + + if lenient: + mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s) + else: + mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s) + if mobj: + return { + 'width': int(mobj.group('w')), + 'height': int(mobj.group('h')), + } + + mobj = 
re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s) + if mobj: + return {'height': int(mobj.group(1))} + + mobj = re.search(r'\b([48])[kK]\b', s) + if mobj: + return {'height': int(mobj.group(1)) * 540} + + return {} + + +def parse_bitrate(s): + if not isinstance(s, str): + return + mobj = re.search(r'\b(\d+)\s*kbps', s) + if mobj: + return int(mobj.group(1)) + + +def month_by_name(name, lang='en'): + """ Return the number of a month by (locale-independently) English name """ + + month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en']) + + try: + return month_names.index(name) + 1 + except ValueError: + return None + + +def month_by_abbreviation(abbrev): + """ Return the number of a month by (locale-independently) English + abbreviations """ + + try: + return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1 + except ValueError: + return None + + +def fix_xml_ampersands(xml_str): + """Replace all the '&' by '&' in XML""" + return re.sub( + r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)', + '&', + xml_str) + + +def setproctitle(title): + assert isinstance(title, str) + + # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541 + try: + import ctypes + except ImportError: + return + + try: + libc = ctypes.cdll.LoadLibrary('libc.so.6') + except OSError: + return + except TypeError: + # LoadLibrary in Windows Python 2.7.13 only expects + # a bytestring, but since unicode_literals turns + # every string into a unicode string, it fails. 
+ return + title_bytes = title.encode() + buf = ctypes.create_string_buffer(len(title_bytes)) + buf.value = title_bytes + try: + libc.prctl(15, buf, 0, 0, 0) + except AttributeError: + return # Strange libc, just skip this + + +def remove_start(s, start): + return s[len(start):] if s is not None and s.startswith(start) else s + + +def remove_end(s, end): + return s[:-len(end)] if s is not None and s.endswith(end) else s + + +def remove_quotes(s): + if s is None or len(s) < 2: + return s + for quote in ('"', "'", ): + if s[0] == quote and s[-1] == quote: + return s[1:-1] + return s + + +def get_domain(url): + """ + This implementation is inconsistent, but is kept for compatibility. + Use this only for "webpage_url_domain" + """ + return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None + + +def url_basename(url): + path = urllib.parse.urlparse(url).path + return path.strip('/').split('/')[-1] + + +def base_url(url): + return re.match(r'https?://[^?#]+/', url).group() + + +def urljoin(base, path): + if isinstance(path, bytes): + path = path.decode() + if not isinstance(path, str) or not path: + return None + if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path): + return path + if isinstance(base, bytes): + base = base.decode() + if not isinstance(base, str) or not re.match( + r'^(?:https?:)?//', base): + return None + return urllib.parse.urljoin(base, path) + + +class HEADRequest(urllib.request.Request): + def get_method(self): + return 'HEAD' + + +class PUTRequest(urllib.request.Request): + def get_method(self): + return 'PUT' + + +def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): + if get_attr and v is not None: + v = getattr(v, get_attr, None) + try: + return int(v) * invscale // scale + except (ValueError, TypeError, OverflowError): + return default + + +def str_or_none(v, default=None): + return default if v is None else str(v) + + +def str_to_int(int_str): + """ A more relaxed version of int_or_none """ + if 
isinstance(int_str, int): + return int_str + elif isinstance(int_str, str): + int_str = re.sub(r'[,\.\+]', '', int_str) + return int_or_none(int_str) + + +def float_or_none(v, scale=1, invscale=1, default=None): + if v is None: + return default + try: + return float(v) * invscale / scale + except (ValueError, TypeError): + return default + + +def bool_or_none(v, default=None): + return v if isinstance(v, bool) else default + + +def strip_or_none(v, default=None): + return v.strip() if isinstance(v, str) else default + + +def url_or_none(url): + if not url or not isinstance(url, str): + return None + url = url.strip() + return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None + + +def request_to_url(req): + if isinstance(req, urllib.request.Request): + return req.get_full_url() + else: + return req + + +def strftime_or_none(timestamp, date_format='%Y%m%d', default=None): + datetime_object = None + try: + if isinstance(timestamp, (int, float)): # unix timestamp + # Using naive datetime here can break timestamp() in Windows + # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414 + # Also, datetime.datetime.fromtimestamp breaks for negative timestamps + # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642 + datetime_object = (datetime.datetime.fromtimestamp(0, datetime.timezone.utc) + + datetime.timedelta(seconds=timestamp)) + elif isinstance(timestamp, str): # assume YYYYMMDD + datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d') + date_format = re.sub( # Support %s on windows + r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format) + return datetime_object.strftime(date_format) + except (ValueError, TypeError, AttributeError): + return default + + +def parse_duration(s): + if not isinstance(s, str): + return None + s = s.strip() + if not s: + return None + + days, hours, mins, secs, ms = [None] * 5 + m = 
re.match(r'''(?x) + (?P<before_secs> + (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)? + (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+)) + (?P<ms>[.:][0-9]+)?Z?$ + ''', s) + if m: + days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms') + else: + m = re.match( + r'''(?ix)(?:P? + (?: + [0-9]+\s*y(?:ears?)?,?\s* + )? + (?: + [0-9]+\s*m(?:onths?)?,?\s* + )? + (?: + [0-9]+\s*w(?:eeks?)?,?\s* + )? + (?: + (?P<days>[0-9]+)\s*d(?:ays?)?,?\s* + )? + T)? + (?: + (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s* + )? + (?: + (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s* + )? + (?: + (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s* + )?Z?$''', s) + if m: + days, hours, mins, secs, ms = m.groups() + else: + m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s) + if m: + hours, mins = m.groups() + else: + return None + + if ms: + ms = ms.replace(':', '.') + return sum(float(part or 0) * mult for part, mult in ( + (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1))) + + +def prepend_extension(filename, ext, expected_real_ext=None): + name, real_ext = os.path.splitext(filename) + return ( + f'{name}.{ext}{real_ext}' + if not expected_real_ext or real_ext[1:] == expected_real_ext + else f'{filename}.{ext}') + + +def replace_extension(filename, ext, expected_real_ext=None): + name, real_ext = os.path.splitext(filename) + return '{}.{}'.format( + name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename, + ext) + + +def check_executable(exe, args=[]): + """ Checks if the given binary is installed somewhere in PATH, and returns its name. + args can be a list of arguments for a short output (like -version) """ + try: + Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + except OSError: + return False + return exe + + +def _get_exe_version_output(exe, args): + try: + # STDIN should be redirected too. 
On UNIX-like systems, ffmpeg triggers + # SIGTTOU if yt-dlp is run in the background. + # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656 + stdout, _, ret = Popen.run([encodeArgument(exe)] + args, text=True, + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + if ret: + return None + except OSError: + return False + return stdout + + +def detect_exe_version(output, version_re=None, unrecognized='present'): + assert isinstance(output, str) + if version_re is None: + version_re = r'version\s+([-0-9._a-zA-Z]+)' + m = re.search(version_re, output) + if m: + return m.group(1) + else: + return unrecognized + + +def get_exe_version(exe, args=['--version'], + version_re=None, unrecognized=('present', 'broken')): + """ Returns the version of the specified executable, + or False if the executable is not present """ + unrecognized = variadic(unrecognized) + assert len(unrecognized) in (1, 2) + out = _get_exe_version_output(exe, args) + if out is None: + return unrecognized[-1] + return out and detect_exe_version(out, version_re, unrecognized[0]) + + +def frange(start=0, stop=None, step=1): + """Float range""" + if stop is None: + start, stop = 0, start + sign = [-1, 1][step > 0] if step else 0 + while sign * start < sign * stop: + yield start + start += step + + +class LazyList(collections.abc.Sequence): + """Lazy immutable list from an iterable + Note that slices of a LazyList are lists and not LazyList""" + + class IndexError(IndexError): + pass + + def __init__(self, iterable, *, reverse=False, _cache=None): + self._iterable = iter(iterable) + self._cache = [] if _cache is None else _cache + self._reversed = reverse + + def __iter__(self): + if self._reversed: + # We need to consume the entire iterable to iterate in reverse + yield from self.exhaust() + return + yield from self._cache + for item in self._iterable: + self._cache.append(item) + yield item + + def _exhaust(self): + self._cache.extend(self._iterable) + 
self._iterable = [] # Discard the emptied iterable to make it pickle-able + return self._cache + + def exhaust(self): + """Evaluate the entire iterable""" + return self._exhaust()[::-1 if self._reversed else 1] + + @staticmethod + def _reverse_index(x): + return None if x is None else ~x + + def __getitem__(self, idx): + if isinstance(idx, slice): + if self._reversed: + idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1)) + start, stop, step = idx.start, idx.stop, idx.step or 1 + elif isinstance(idx, int): + if self._reversed: + idx = self._reverse_index(idx) + start, stop, step = idx, idx, 0 + else: + raise TypeError('indices must be integers or slices') + if ((start or 0) < 0 or (stop or 0) < 0 + or (start is None and step < 0) + or (stop is None and step > 0)): + # We need to consume the entire iterable to be able to slice from the end + # Obviously, never use this with infinite iterables + self._exhaust() + try: + return self._cache[idx] + except IndexError as e: + raise self.IndexError(e) from e + n = max(start or 0, stop or 0) - len(self._cache) + 1 + if n > 0: + self._cache.extend(itertools.islice(self._iterable, n)) + try: + return self._cache[idx] + except IndexError as e: + raise self.IndexError(e) from e + + def __bool__(self): + try: + self[-1] if self._reversed else self[0] + except self.IndexError: + return False + return True + + def __len__(self): + self._exhaust() + return len(self._cache) + + def __reversed__(self): + return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache) + + def __copy__(self): + return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache) + + def __repr__(self): + # repr and str should mimic a list. 
So we exhaust the iterable + return repr(self.exhaust()) + + def __str__(self): + return repr(self.exhaust()) + + +class PagedList: + + class IndexError(IndexError): + pass + + def __len__(self): + # This is only useful for tests + return len(self.getslice()) + + def __init__(self, pagefunc, pagesize, use_cache=True): + self._pagefunc = pagefunc + self._pagesize = pagesize + self._pagecount = float('inf') + self._use_cache = use_cache + self._cache = {} + + def getpage(self, pagenum): + page_results = self._cache.get(pagenum) + if page_results is None: + page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum)) + if self._use_cache: + self._cache[pagenum] = page_results + return page_results + + def getslice(self, start=0, end=None): + return list(self._getslice(start, end)) + + def _getslice(self, start, end): + raise NotImplementedError('This method must be implemented by subclasses') + + def __getitem__(self, idx): + assert self._use_cache, 'Indexing PagedList requires cache' + if not isinstance(idx, int) or idx < 0: + raise TypeError('indices must be non-negative integers') + entries = self.getslice(idx, idx + 1) + if not entries: + raise self.IndexError() + return entries[0] + + +class OnDemandPagedList(PagedList): + """Download pages until a page with less than maximum results""" + + def _getslice(self, start, end): + for pagenum in itertools.count(start // self._pagesize): + firstid = pagenum * self._pagesize + nextfirstid = pagenum * self._pagesize + self._pagesize + if start >= nextfirstid: + continue + + startv = ( + start % self._pagesize + if firstid <= start < nextfirstid + else 0) + endv = ( + ((end - 1) % self._pagesize) + 1 + if (end is not None and firstid <= end <= nextfirstid) + else None) + + try: + page_results = self.getpage(pagenum) + except Exception: + self._pagecount = pagenum - 1 + raise + if startv != 0 or endv is not None: + page_results = page_results[startv:endv] + yield from page_results + + # A little 
optimization - if current page is not "full", ie. does + # not contain page_size videos then we can assume that this page + # is the last one - there are no more ids on further pages - + # i.e. no need to query again. + if len(page_results) + startv < self._pagesize: + break + + # If we got the whole page, but the next page is not interesting, + # break out early as well + if end == nextfirstid: + break + + +class InAdvancePagedList(PagedList): + """PagedList with total number of pages known in advance""" + + def __init__(self, pagefunc, pagecount, pagesize): + PagedList.__init__(self, pagefunc, pagesize, True) + self._pagecount = pagecount + + def _getslice(self, start, end): + start_page = start // self._pagesize + end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1) + skip_elems = start - start_page * self._pagesize + only_more = None if end is None else end - start + for pagenum in range(start_page, end_page): + page_results = self.getpage(pagenum) + if skip_elems: + page_results = page_results[skip_elems:] + skip_elems = None + if only_more is not None: + if len(page_results) < only_more: + only_more -= len(page_results) + else: + yield from page_results[:only_more] + break + yield from page_results + + +class PlaylistEntries: + MissingEntry = object() + is_exhausted = False + + def __init__(self, ydl, info_dict): + self.ydl = ydl + + # _entries must be assigned now since infodict can change during iteration + entries = info_dict.get('entries') + if entries is None: + raise EntryNotInPlaylist('There are no entries') + elif isinstance(entries, list): + self.is_exhausted = True + + requested_entries = info_dict.get('requested_entries') + self.is_incomplete = requested_entries is not None + if self.is_incomplete: + assert self.is_exhausted + self._entries = [self.MissingEntry] * max(requested_entries or [0]) + for i, entry in zip(requested_entries, entries): + self._entries[i - 1] = entry + elif isinstance(entries, (list, 
PagedList, LazyList)): + self._entries = entries + else: + self._entries = LazyList(entries) + + PLAYLIST_ITEMS_RE = re.compile(r'''(?x) + (?P<start>[+-]?\d+)? + (?P<range>[:-] + (?P<end>[+-]?\d+|inf(?:inite)?)? + (?::(?P<step>[+-]?\d+))? + )?''') + + @classmethod + def parse_playlist_items(cls, string): + for segment in string.split(','): + if not segment: + raise ValueError('There is two or more consecutive commas') + mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment) + if not mobj: + raise ValueError(f'{segment!r} is not a valid specification') + start, end, step, has_range = mobj.group('start', 'end', 'step', 'range') + if int_or_none(step) == 0: + raise ValueError(f'Step in {segment!r} cannot be zero') + yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start) + + def get_requested_items(self): + playlist_items = self.ydl.params.get('playlist_items') + playlist_start = self.ydl.params.get('playliststart', 1) + playlist_end = self.ydl.params.get('playlistend') + # For backwards compatibility, interpret -1 as whole list + if playlist_end in (-1, None): + playlist_end = '' + if not playlist_items: + playlist_items = f'{playlist_start}:{playlist_end}' + elif playlist_start != 1 or playlist_end: + self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True) + + for index in self.parse_playlist_items(playlist_items): + for i, entry in self[index]: + yield i, entry + if not entry: + continue + try: + # The item may have just been added to archive. 
Don't break due to it + if not self.ydl.params.get('lazy_playlist'): + # TODO: Add auto-generated fields + self.ydl._match_entry(entry, incomplete=True, silent=True) + except (ExistingVideoReached, RejectedVideoReached): + return + + def get_full_count(self): + if self.is_exhausted and not self.is_incomplete: + return len(self) + elif isinstance(self._entries, InAdvancePagedList): + if self._entries._pagesize == 1: + return self._entries._pagecount + + @functools.cached_property + def _getter(self): + if isinstance(self._entries, list): + def get_entry(i): + try: + entry = self._entries[i] + except IndexError: + entry = self.MissingEntry + if not self.is_incomplete: + raise self.IndexError() + if entry is self.MissingEntry: + raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found') + return entry + else: + def get_entry(i): + try: + return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i) + except (LazyList.IndexError, PagedList.IndexError): + raise self.IndexError() + return get_entry + + def __getitem__(self, idx): + if isinstance(idx, int): + idx = slice(idx, idx) + + # NB: PlaylistEntries[1:10] => (0, 1, ... 
9) + step = 1 if idx.step is None else idx.step + if idx.start is None: + start = 0 if step > 0 else len(self) - 1 + else: + start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start + + # NB: Do not call len(self) when idx == [:] + if idx.stop is None: + stop = 0 if step < 0 else float('inf') + else: + stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop + stop += [-1, 1][step > 0] + + for i in frange(start, stop, step): + if i < 0: + continue + try: + entry = self._getter(i) + except self.IndexError: + self.is_exhausted = True + if step > 0: + break + continue + yield i + 1, entry + + def __len__(self): + return len(tuple(self[:])) + + class IndexError(IndexError): + pass + + +def uppercase_escape(s): + unicode_escape = codecs.getdecoder('unicode_escape') + return re.sub( + r'\\U[0-9a-fA-F]{8}', + lambda m: unicode_escape(m.group(0))[0], + s) + + +def lowercase_escape(s): + unicode_escape = codecs.getdecoder('unicode_escape') + return re.sub( + r'\\u[0-9a-fA-F]{4}', + lambda m: unicode_escape(m.group(0))[0], + s) + + +def escape_rfc3986(s): + """Escape non-ASCII characters as suggested by RFC 3986""" + return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]") + + +def escape_url(url): + """Escape URL as suggested by RFC 3986""" + url_parsed = urllib.parse.urlparse(url) + return url_parsed._replace( + netloc=url_parsed.netloc.encode('idna').decode('ascii'), + path=escape_rfc3986(url_parsed.path), + params=escape_rfc3986(url_parsed.params), + query=escape_rfc3986(url_parsed.query), + fragment=escape_rfc3986(url_parsed.fragment) + ).geturl() + + +def parse_qs(url, **kwargs): + return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs) + + +def read_batch_urls(batch_fd): + def fixup(url): + if not isinstance(url, str): + url = url.decode('utf-8', 'replace') + BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff') + for bom in BOM_UTF8: + if url.startswith(bom): + url = url[len(bom):] + url = url.lstrip() + if not url or url.startswith(('#', ';', ']')): + 
return False + # "#" cannot be stripped out since it is part of the URI + # However, it can be safely stripped out if following a whitespace + return re.split(r'\s#', url, 1)[0].rstrip() + + with contextlib.closing(batch_fd) as fd: + return [url for url in map(fixup, fd) if url] + + +def urlencode_postdata(*args, **kargs): + return urllib.parse.urlencode(*args, **kargs).encode('ascii') + + +def update_url(url, *, query_update=None, **kwargs): + """Replace URL components specified by kwargs + @param url str or parse url tuple + @param query_update update query + @returns str + """ + if isinstance(url, str): + if not kwargs and not query_update: + return url + else: + url = urllib.parse.urlparse(url) + if query_update: + assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time' + kwargs['query'] = urllib.parse.urlencode({ + **urllib.parse.parse_qs(url.query), + **query_update + }, True) + return urllib.parse.urlunparse(url._replace(**kwargs)) + + +def update_url_query(url, query): + return update_url(url, query_update=query) + + +def update_Request(req, url=None, data=None, headers=None, query=None): + req_headers = req.headers.copy() + req_headers.update(headers or {}) + req_data = data or req.data + req_url = update_url_query(url or req.get_full_url(), query) + req_get_method = req.get_method() + if req_get_method == 'HEAD': + req_type = HEADRequest + elif req_get_method == 'PUT': + req_type = PUTRequest + else: + req_type = urllib.request.Request + new_req = req_type( + req_url, data=req_data, headers=req_headers, + origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) + if hasattr(req, 'timeout'): + new_req.timeout = req.timeout + return new_req + + +def _multipart_encode_impl(data, boundary): + content_type = 'multipart/form-data; boundary=%s' % boundary + + out = b'' + for k, v in data.items(): + out += b'--' + boundary.encode('ascii') + b'\r\n' + if isinstance(k, str): + k = k.encode() + if isinstance(v, str): + 
v = v.encode() + # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578 + # suggests sending UTF-8 directly. Firefox sends UTF-8, too + content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n' + if boundary.encode('ascii') in content: + raise ValueError('Boundary overlaps with data') + out += content + + out += b'--' + boundary.encode('ascii') + b'--\r\n' + + return out, content_type + + +def multipart_encode(data, boundary=None): + ''' + Encode a dict to RFC 7578-compliant form-data + + data: + A dict where keys and values can be either Unicode or bytes-like + objects. + boundary: + If specified a Unicode object, it's used as the boundary. Otherwise + a random boundary is generated. + + Reference: https://tools.ietf.org/html/rfc7578 + ''' + has_specified_boundary = boundary is not None + + while True: + if boundary is None: + boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff)) + + try: + out, content_type = _multipart_encode_impl(data, boundary) + break + except ValueError: + if has_specified_boundary: + raise + boundary = None + + return out, content_type + + +def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT): + if blocked_types is NO_DEFAULT: + blocked_types = (str, bytes, collections.abc.Mapping) + return isinstance(x, allowed_types) and not isinstance(x, blocked_types) + + +def variadic(x, allowed_types=NO_DEFAULT): + if not isinstance(allowed_types, (tuple, type)): + deprecation_warning('allowed_types should be a tuple or a type') + allowed_types = tuple(allowed_types) + return x if is_iterable_like(x, blocked_types=allowed_types) else (x, ) + + +def try_call(*funcs, expected_type=None, args=[], kwargs={}): + for f in funcs: + try: + val = f(*args, **kwargs) + except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError): + pass + else: + if expected_type is None or isinstance(val, expected_type): + return val + + +def 
# Age limits associated with the US film ratings
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# Age limits associated with the TV parental guideline labels
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}


def parse_age_limit(s):
    """Convert an age rating into a numeric age limit

    Accepts an int between 0 and 21, a string such as "18" or "18+", a key of
    US_RATINGS, or a TV parental guideline label ("TV-14", "TV14", "TV_14").
    String ratings are matched case-insensitively.
    @returns The age limit as int, or None if the input is not recognised
    """
    # bool is a subclass of int, so an exact type check is needed to keep
    # True/False from being treated as ages
    if type(s) is int:  # noqa: E721
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, str):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))

    rating = s.upper()
    if rating in US_RATINGS:
        return US_RATINGS[rating]

    # Strip the "TV-" prefix from the known labels to build the alternation
    tv_suffixes = '|'.join(label[3:] for label in TV_PARENTAL_GUIDELINES)
    tv_rating = re.match(r'^TV[_-]?(%s)$' % tv_suffixes, rating)
    if not tv_rating:
        return None
    return TV_PARENTAL_GUIDELINES['TV-' + tv_rating.group(1)]
+ \s*?(?://[^\n]*)*$''', + r'\g<callback_data>', code) + + +def js_to_json(code, vars={}, *, strict=False): + # vars is a dict of var, val pairs to substitute + STRING_QUOTES = '\'"`' + STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES) + COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n' + SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*' + INTEGER_TABLE = ( + (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16), + (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8), + ) + + def process_escape(match): + JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu' + escape = match.group(1) or match.group(2) + + return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES + else R'\u00' if escape == 'x' + else '' if escape == '\n' + else escape) + + def template_substitute(match): + evaluated = js_to_json(match.group(1), vars, strict=strict) + if evaluated[0] == '"': + return json.loads(evaluated) + return evaluated + + def fix_kv(m): + v = m.group(0) + if v in ('true', 'false', 'null'): + return v + elif v in ('undefined', 'void 0'): + return 'null' + elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',': + return '' + + if v[0] in STRING_QUOTES: + v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1] + escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v) + return f'"{escaped}"' + + for regex, base in INTEGER_TABLE: + im = re.match(regex, v) + if im: + i = int(im.group(1), base) + return f'"{i}":' if v.endswith(':') else str(i) + + if v in vars: + try: + if not strict: + json.loads(vars[v]) + except json.JSONDecodeError: + return json.dumps(vars[v]) + else: + return vars[v] + + if not strict: + return f'"{v}"' + + raise ValueError(f'Unknown value: {v}') + + def create_map(mobj): + return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars)))) + + code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code) + if not strict: + code = re.sub(r'new Date\((".+")\)', r'\g<1>', code) + code = re.sub(r'new \w+\((.*?)\)', lambda m: 
def qualities(quality_ids):
    """ Build a function mapping a quality id to its position in quality_ids

    The returned callable gives -1 for ids that are not in the list
    """
    def rank_of(quality_id):
        try:
            position = quality_ids.index(quality_id)
        except ValueError:
            # Unknown ids rank below every known one
            position = -1
        return position
    return rank_of
def version_tuple(v):
    """Split a version string on "." and "-" into a tuple of ints"""
    return tuple(map(int, re.split(r'[-.]', v)))


def is_outdated_version(version, limit, assume_new=True):
    """Tell whether `version` is older than `limit`

    When the version is missing, or either string cannot be parsed, the
    answer falls back to the opposite of `assume_new`.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        current, minimum = version_tuple(version), version_tuple(limit)
    except ValueError:
        # A non-numeric component; nothing sensible to compare
        return fallback
    return current < minimum
+ # Using .mp3 as it's the most popular one + 'audio/mpeg': 'mp3', + 'audio/webm': 'webm', + 'audio/x-matroska': 'mka', + 'audio/x-mpegurl': 'm3u', + 'midi': 'mid', + 'ogg': 'ogg', + 'wav': 'wav', + 'wave': 'wav', + 'x-aac': 'aac', + 'x-flac': 'flac', + 'x-m4a': 'm4a', + 'x-realaudio': 'ra', + 'x-wav': 'wav', + + # image + 'avif': 'avif', + 'bmp': 'bmp', + 'gif': 'gif', + 'jpeg': 'jpg', + 'png': 'png', + 'svg+xml': 'svg', + 'tiff': 'tif', + 'vnd.wap.wbmp': 'wbmp', + 'webp': 'webp', + 'x-icon': 'ico', + 'x-jng': 'jng', + 'x-ms-bmp': 'bmp', + + # caption + 'filmstrip+json': 'fs', + 'smptett+xml': 'tt', + 'ttaf+xml': 'dfxp', + 'ttml+xml': 'ttml', + 'x-ms-sami': 'sami', + + # misc + 'gzip': 'gz', + 'json': 'json', + 'xml': 'xml', + 'zip': 'zip', + } + + mimetype = mt.partition(';')[0].strip().lower() + _, _, subtype = mimetype.rpartition('/') + + ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1]) + if ext: + return ext + elif default is not NO_DEFAULT: + return default + return subtype.replace('+', '.') + + +def ext2mimetype(ext_or_url): + if not ext_or_url: + return None + if '.' 
def parse_codecs(codecs_str):
    """Split a comma-separated codecs string into per-stream codec fields

    @returns A dict with 'vcodec', 'acodec' and 'dynamic_range' (plus 'scodec'
             when a subtitle codec is present). If no codec is recognised but
             exactly two entries were given, they are taken as video and audio
             in that order. Otherwise an empty dict.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}

    VIDEO_CODECS = ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                    'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe')
    AUDIO_CODECS = ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
                    'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl')
    SUBTITLE_CODECS = ('stpp', 'wvtt')

    stripped_entries = (entry.strip() for entry in codecs_str.strip().strip(',').split(','))
    split_codecs = [entry for entry in stripped_entries if entry]

    vcodec = acodec = scodec = hdr = None
    for full_codec in split_codecs:
        # Zeros directly preceding a digit are dropped before splitting, so
        # e.g. "av01" is recognised as "av1"
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        family = parts[0]
        if family in VIDEO_CODECS:
            if vcodec:
                # Only the first video codec is kept
                continue
            vcodec = full_codec
            if family in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif family == 'av1' and traversal.traverse_obj(parts, 3) == '10':
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif family in AUDIO_CODECS:
            acodec = acodec or full_codec
        elif family in SUBTITLE_CODECS:
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')

    if vcodec or acodec or scodec:
        result = {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
        }
        if scodec is not None:
            result['scodec'] = scodec
        return result
    if len(split_codecs) == 2:
        video, audio = split_codecs
        return {
            'vcodec': video,
            'acodec': audio,
        }
    return {}
def encode_data_uri(data, mime_type):
    """Wrap bytes into a base64 "data:" URI carrying the given MIME type"""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)


def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # No limit set, or content available for everyone: never blocked
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
def determine_protocol(info_dict):
    """Work out the protocol of a format/info dict

    An explicit 'protocol' entry wins. Otherwise the protocol is derived from
    the sanitized 'url': first by its rtmp/mms/rtsp prefix, then by its file
    extension (m3u8/f4m), and finally by the URL scheme itself.
    """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = sanitize_url(info_dict['url'])
    for scheme_prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(scheme_prefix):
            return scheme_prefix

    ext = determine_ext(url)
    if ext == 'f4m':
        return 'f4m'
    if ext == 'm3u8':
        return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'

    return urllib.parse.urlparse(url).scheme
+ Text after a \t will be right aligned """ + def width(string): + return len(remove_terminal_sequences(string).replace('\t', '')) + + def get_max_lens(table): + return [max(width(str(v)) for v in col) for col in zip(*table)] + + def filter_using_list(row, filterArray): + return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take] + + max_lens = get_max_lens(data) if hide_empty else [] + header_row = filter_using_list(header_row, max_lens) + data = [filter_using_list(row, max_lens) for row in data] + + table = [header_row] + data + max_lens = get_max_lens(table) + extra_gap += 1 + if delim: + table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data + table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter + for row in table: + for pos, text in enumerate(map(str, row)): + if '\t' in text: + row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap + else: + row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap) + ret = '\n'.join(''.join(row).rstrip() for row in table) + return ret + + +def _match_one(filter_part, dct, incomplete): + # TODO: Generalize code with YoutubeDL._build_format_filter + STRING_OPERATORS = { + '*=': operator.contains, + '^=': lambda attr, value: attr.startswith(value), + '$=': lambda attr, value: attr.endswith(value), + '~=': lambda attr, value: re.search(value, attr), + } + COMPARISON_OPERATORS = { + **STRING_OPERATORS, + '<=': operator.le, # "<=" must be defined above "<" + '<': operator.lt, + '>=': operator.ge, + '>': operator.gt, + '=': operator.eq, + } + + if isinstance(incomplete, bool): + is_incomplete = lambda _: incomplete + else: + is_incomplete = lambda k: k in incomplete + + operator_rex = re.compile(r'''(?x) + (?P<key>[a-z_]+) + \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s* + (?: + (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)| + (?P<strval>.+?) 
+ ) + ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys()))) + m = operator_rex.fullmatch(filter_part.strip()) + if m: + m = m.groupdict() + unnegated_op = COMPARISON_OPERATORS[m['op']] + if m['negation']: + op = lambda attr, value: not unnegated_op(attr, value) + else: + op = unnegated_op + comparison_value = m['quotedstrval'] or m['strval'] or m['intval'] + if m['quote']: + comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote']) + actual_value = dct.get(m['key']) + numeric_comparison = None + if isinstance(actual_value, (int, float)): + # If the original field is a string and matching comparisonvalue is + # a number we should respect the origin of the original field + # and process comparison value as a string (see + # https://github.com/ytdl-org/youtube-dl/issues/11082) + try: + numeric_comparison = int(comparison_value) + except ValueError: + numeric_comparison = parse_filesize(comparison_value) + if numeric_comparison is None: + numeric_comparison = parse_filesize(f'{comparison_value}B') + if numeric_comparison is None: + numeric_comparison = parse_duration(comparison_value) + if numeric_comparison is not None and m['op'] in STRING_OPERATORS: + raise ValueError('Operator %s only supports string values!' 
% m['op']) + if actual_value is None: + return is_incomplete(m['key']) or m['none_inclusive'] + return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison) + + UNARY_OPERATORS = { + '': lambda v: (v is True) if isinstance(v, bool) else (v is not None), + '!': lambda v: (v is False) if isinstance(v, bool) else (v is None), + } + operator_rex = re.compile(r'''(?x) + (?P<op>%s)\s*(?P<key>[a-z_]+) + ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys()))) + m = operator_rex.fullmatch(filter_part.strip()) + if m: + op = UNARY_OPERATORS[m.group('op')] + actual_value = dct.get(m.group('key')) + if is_incomplete(m.group('key')) and actual_value is None: + return True + return op(actual_value) + + raise ValueError('Invalid filter part %r' % filter_part) + + +def match_str(filter_str, dct, incomplete=False): + """ Filter a dictionary with a simple string syntax. + @returns Whether the filter passes + @param incomplete Set of keys that is expected to be missing from dct. + Can be True/False to indicate all/none of the keys may be missing. 
+ All conditions on incomplete keys pass if the key is missing + """ + return all( + _match_one(filter_part.replace(r'\&', '&'), dct, incomplete) + for filter_part in re.split(r'(?<!\\)&', filter_str)) + + +def match_filter_func(filters, breaking_filters=None): + if not filters and not breaking_filters: + return None + breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None) + filters = set(variadic(filters or [])) + + interactive = '-' in filters + if interactive: + filters.remove('-') + + def _match_func(info_dict, incomplete=False): + ret = breaking_filters(info_dict, incomplete) + if ret is not None: + raise RejectedVideoReached(ret) + + if not filters or any(match_str(f, info_dict, incomplete) for f in filters): + return NO_DEFAULT if interactive and not incomplete else None + else: + video_title = info_dict.get('title') or info_dict.get('id') or 'entry' + filter_str = ') | ('.join(map(str.strip, filters)) + return f'{video_title} does not pass filter ({filter_str}), skipping ..' 
class download_range_func:
    """Callable yielding the sections of a video that should be downloaded

    Sections come from chapters whose title matches one of the `chapters`
    regexes, followed by the explicit (start, end) pairs in `ranges`.
    """

    def __init__(self, chapters, ranges):
        self.chapters = chapters
        self.ranges = ranges

    def __call__(self, info_dict, ydl):
        if not (self.ranges or self.chapters):
            # Nothing was requested: a single empty section
            yield {}

        matched_any = False
        for pattern in self.chapters or []:
            for index, chapter in enumerate(info_dict.get('chapters') or []):
                if re.search(pattern, chapter['title']):
                    matched_any = True
                    yield {**chapter, 'index': index}

        if self.chapters and not matched_any:
            if info_dict.get('chapters'):
                warning = 'There are no chapters matching the regex'
            else:
                warning = 'Cannot match chapters since chapter information is unavailable'
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {'start_time': start, 'end_time': end}

    def __eq__(self, other):
        if not isinstance(other, download_range_func):
            return False
        return self.chapters == other.chapters and self.ranges == other.ranges

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
]), + (b'http://www.w3.org/ns/ttml#styling', [ + b'http://www.w3.org/ns/ttml#style', + ]), + ) + + SUPPORTED_STYLING = [ + 'color', + 'fontFamily', + 'fontSize', + 'fontStyle', + 'fontWeight', + 'textDecoration' + ] + + _x = functools.partial(xpath_with_ns, ns_map={ + 'xml': 'http://www.w3.org/XML/1998/namespace', + 'ttml': 'http://www.w3.org/ns/ttml', + 'tts': 'http://www.w3.org/ns/ttml#styling', + }) + + styles = {} + default_style = {} + + class TTMLPElementParser: + _out = '' + _unclosed_elements = [] + _applied_styles = [] + + def start(self, tag, attrib): + if tag in (_x('ttml:br'), 'br'): + self._out += '\n' + else: + unclosed_elements = [] + style = {} + element_style_id = attrib.get('style') + if default_style: + style.update(default_style) + if element_style_id: + style.update(styles.get(element_style_id, {})) + for prop in SUPPORTED_STYLING: + prop_val = attrib.get(_x('tts:' + prop)) + if prop_val: + style[prop] = prop_val + if style: + font = '' + for k, v in sorted(style.items()): + if self._applied_styles and self._applied_styles[-1].get(k) == v: + continue + if k == 'color': + font += ' color="%s"' % v + elif k == 'fontSize': + font += ' size="%s"' % v + elif k == 'fontFamily': + font += ' face="%s"' % v + elif k == 'fontWeight' and v == 'bold': + self._out += '<b>' + unclosed_elements.append('b') + elif k == 'fontStyle' and v == 'italic': + self._out += '<i>' + unclosed_elements.append('i') + elif k == 'textDecoration' and v == 'underline': + self._out += '<u>' + unclosed_elements.append('u') + if font: + self._out += '<font' + font + '>' + unclosed_elements.append('font') + applied_style = {} + if self._applied_styles: + applied_style.update(self._applied_styles[-1]) + applied_style.update(style) + self._applied_styles.append(applied_style) + self._unclosed_elements.append(unclosed_elements) + + def end(self, tag): + if tag not in (_x('ttml:br'), 'br'): + unclosed_elements = self._unclosed_elements.pop() + for element in 
reversed(unclosed_elements): + self._out += '</%s>' % element + if unclosed_elements and self._applied_styles: + self._applied_styles.pop() + + def data(self, data): + self._out += data + + def close(self): + return self._out.strip() + + # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870 + # This will not trigger false positives since only UTF-8 text is being replaced + dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'') + + def parse_node(node): + target = TTMLPElementParser() + parser = xml.etree.ElementTree.XMLParser(target=target) + parser.feed(xml.etree.ElementTree.tostring(node)) + return parser.close() + + for k, v in LEGACY_NAMESPACES: + for ns in v: + dfxp_data = dfxp_data.replace(ns, k) + + dfxp = compat_etree_fromstring(dfxp_data) + out = [] + paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p') + + if not paras: + raise ValueError('Invalid dfxp/TTML subtitle') + + repeat = False + while True: + for style in dfxp.findall(_x('.//ttml:style')): + style_id = style.get('id') or style.get(_x('xml:id')) + if not style_id: + continue + parent_style_id = style.get('style') + if parent_style_id: + if parent_style_id not in styles: + repeat = True + continue + styles[style_id] = styles[parent_style_id].copy() + for prop in SUPPORTED_STYLING: + prop_val = style.get(_x('tts:' + prop)) + if prop_val: + styles.setdefault(style_id, {})[prop] = prop_val + if repeat: + repeat = False + else: + break + + for p in ('body', 'div'): + ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p]) + if ele is None: + continue + style = styles.get(ele.get('style')) + if not style: + continue + default_style.update(style) + + for para, index in zip(paras, itertools.count(1)): + begin_time = parse_dfxp_time_expr(para.attrib.get('begin')) + end_time = parse_dfxp_time_expr(para.attrib.get('end')) + dur = parse_dfxp_time_expr(para.attrib.get('dur')) + if begin_time is None: + 
def cli_option(params, command_option, param, separator=None):
    """Build the argument list for an option taking a value

    @returns [] if params has no value for `param`; [option, value] if no
             separator is given; otherwise the single joined argument
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']


def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the argument list for an option taking a boolean spelled as text"""
    flag = params.get(param)
    assert flag in (True, False, None)
    # Reuse cli_option by looking the flag up in a bool -> text mapping
    text_by_flag = {True: true_value, False: false_value}
    return cli_option(text_by_flag, command_option, flag, separator)


def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] if params[param] equals expected_value, else []"""
    if params.get(param) == expected_value:
        return [command_option]
    return []
+ _lang_map = { + 'aa': 'aar', + 'ab': 'abk', + 'ae': 'ave', + 'af': 'afr', + 'ak': 'aka', + 'am': 'amh', + 'an': 'arg', + 'ar': 'ara', + 'as': 'asm', + 'av': 'ava', + 'ay': 'aym', + 'az': 'aze', + 'ba': 'bak', + 'be': 'bel', + 'bg': 'bul', + 'bh': 'bih', + 'bi': 'bis', + 'bm': 'bam', + 'bn': 'ben', + 'bo': 'bod', + 'br': 'bre', + 'bs': 'bos', + 'ca': 'cat', + 'ce': 'che', + 'ch': 'cha', + 'co': 'cos', + 'cr': 'cre', + 'cs': 'ces', + 'cu': 'chu', + 'cv': 'chv', + 'cy': 'cym', + 'da': 'dan', + 'de': 'deu', + 'dv': 'div', + 'dz': 'dzo', + 'ee': 'ewe', + 'el': 'ell', + 'en': 'eng', + 'eo': 'epo', + 'es': 'spa', + 'et': 'est', + 'eu': 'eus', + 'fa': 'fas', + 'ff': 'ful', + 'fi': 'fin', + 'fj': 'fij', + 'fo': 'fao', + 'fr': 'fra', + 'fy': 'fry', + 'ga': 'gle', + 'gd': 'gla', + 'gl': 'glg', + 'gn': 'grn', + 'gu': 'guj', + 'gv': 'glv', + 'ha': 'hau', + 'he': 'heb', + 'iw': 'heb', # Replaced by he in 1989 revision + 'hi': 'hin', + 'ho': 'hmo', + 'hr': 'hrv', + 'ht': 'hat', + 'hu': 'hun', + 'hy': 'hye', + 'hz': 'her', + 'ia': 'ina', + 'id': 'ind', + 'in': 'ind', # Replaced by id in 1989 revision + 'ie': 'ile', + 'ig': 'ibo', + 'ii': 'iii', + 'ik': 'ipk', + 'io': 'ido', + 'is': 'isl', + 'it': 'ita', + 'iu': 'iku', + 'ja': 'jpn', + 'jv': 'jav', + 'ka': 'kat', + 'kg': 'kon', + 'ki': 'kik', + 'kj': 'kua', + 'kk': 'kaz', + 'kl': 'kal', + 'km': 'khm', + 'kn': 'kan', + 'ko': 'kor', + 'kr': 'kau', + 'ks': 'kas', + 'ku': 'kur', + 'kv': 'kom', + 'kw': 'cor', + 'ky': 'kir', + 'la': 'lat', + 'lb': 'ltz', + 'lg': 'lug', + 'li': 'lim', + 'ln': 'lin', + 'lo': 'lao', + 'lt': 'lit', + 'lu': 'lub', + 'lv': 'lav', + 'mg': 'mlg', + 'mh': 'mah', + 'mi': 'mri', + 'mk': 'mkd', + 'ml': 'mal', + 'mn': 'mon', + 'mr': 'mar', + 'ms': 'msa', + 'mt': 'mlt', + 'my': 'mya', + 'na': 'nau', + 'nb': 'nob', + 'nd': 'nde', + 'ne': 'nep', + 'ng': 'ndo', + 'nl': 'nld', + 'nn': 'nno', + 'no': 'nor', + 'nr': 'nbl', + 'nv': 'nav', + 'ny': 'nya', + 'oc': 'oci', + 'oj': 'oji', + 'om': 'orm', + 'or': 'ori', + 'os': 
'oss', + 'pa': 'pan', + 'pe': 'per', + 'pi': 'pli', + 'pl': 'pol', + 'ps': 'pus', + 'pt': 'por', + 'qu': 'que', + 'rm': 'roh', + 'rn': 'run', + 'ro': 'ron', + 'ru': 'rus', + 'rw': 'kin', + 'sa': 'san', + 'sc': 'srd', + 'sd': 'snd', + 'se': 'sme', + 'sg': 'sag', + 'si': 'sin', + 'sk': 'slk', + 'sl': 'slv', + 'sm': 'smo', + 'sn': 'sna', + 'so': 'som', + 'sq': 'sqi', + 'sr': 'srp', + 'ss': 'ssw', + 'st': 'sot', + 'su': 'sun', + 'sv': 'swe', + 'sw': 'swa', + 'ta': 'tam', + 'te': 'tel', + 'tg': 'tgk', + 'th': 'tha', + 'ti': 'tir', + 'tk': 'tuk', + 'tl': 'tgl', + 'tn': 'tsn', + 'to': 'ton', + 'tr': 'tur', + 'ts': 'tso', + 'tt': 'tat', + 'tw': 'twi', + 'ty': 'tah', + 'ug': 'uig', + 'uk': 'ukr', + 'ur': 'urd', + 'uz': 'uzb', + 've': 'ven', + 'vi': 'vie', + 'vo': 'vol', + 'wa': 'wln', + 'wo': 'wol', + 'xh': 'xho', + 'yi': 'yid', + 'ji': 'yid', # Replaced by yi in 1989 revision + 'yo': 'yor', + 'za': 'zha', + 'zh': 'zho', + 'zu': 'zul', + } + + @classmethod + def short2long(cls, code): + """Convert language code from ISO 639-1 to ISO 639-2/T""" + return cls._lang_map.get(code[:2]) + + @classmethod + def long2short(cls, code): + """Convert language code from ISO 639-2/T to ISO 639-1""" + for short_name, long_name in cls._lang_map.items(): + if long_name == code: + return short_name + + +class ISO3166Utils: + # From http://data.okfn.org/data/core/country-list + _country_map = { + 'AF': 'Afghanistan', + 'AX': 'Åland Islands', + 'AL': 'Albania', + 'DZ': 'Algeria', + 'AS': 'American Samoa', + 'AD': 'Andorra', + 'AO': 'Angola', + 'AI': 'Anguilla', + 'AQ': 'Antarctica', + 'AG': 'Antigua and Barbuda', + 'AR': 'Argentina', + 'AM': 'Armenia', + 'AW': 'Aruba', + 'AU': 'Australia', + 'AT': 'Austria', + 'AZ': 'Azerbaijan', + 'BS': 'Bahamas', + 'BH': 'Bahrain', + 'BD': 'Bangladesh', + 'BB': 'Barbados', + 'BY': 'Belarus', + 'BE': 'Belgium', + 'BZ': 'Belize', + 'BJ': 'Benin', + 'BM': 'Bermuda', + 'BT': 'Bhutan', + 'BO': 'Bolivia, Plurinational State of', + 'BQ': 'Bonaire, Sint Eustatius and 
Saba', + 'BA': 'Bosnia and Herzegovina', + 'BW': 'Botswana', + 'BV': 'Bouvet Island', + 'BR': 'Brazil', + 'IO': 'British Indian Ocean Territory', + 'BN': 'Brunei Darussalam', + 'BG': 'Bulgaria', + 'BF': 'Burkina Faso', + 'BI': 'Burundi', + 'KH': 'Cambodia', + 'CM': 'Cameroon', + 'CA': 'Canada', + 'CV': 'Cape Verde', + 'KY': 'Cayman Islands', + 'CF': 'Central African Republic', + 'TD': 'Chad', + 'CL': 'Chile', + 'CN': 'China', + 'CX': 'Christmas Island', + 'CC': 'Cocos (Keeling) Islands', + 'CO': 'Colombia', + 'KM': 'Comoros', + 'CG': 'Congo', + 'CD': 'Congo, the Democratic Republic of the', + 'CK': 'Cook Islands', + 'CR': 'Costa Rica', + 'CI': 'Côte d\'Ivoire', + 'HR': 'Croatia', + 'CU': 'Cuba', + 'CW': 'Curaçao', + 'CY': 'Cyprus', + 'CZ': 'Czech Republic', + 'DK': 'Denmark', + 'DJ': 'Djibouti', + 'DM': 'Dominica', + 'DO': 'Dominican Republic', + 'EC': 'Ecuador', + 'EG': 'Egypt', + 'SV': 'El Salvador', + 'GQ': 'Equatorial Guinea', + 'ER': 'Eritrea', + 'EE': 'Estonia', + 'ET': 'Ethiopia', + 'FK': 'Falkland Islands (Malvinas)', + 'FO': 'Faroe Islands', + 'FJ': 'Fiji', + 'FI': 'Finland', + 'FR': 'France', + 'GF': 'French Guiana', + 'PF': 'French Polynesia', + 'TF': 'French Southern Territories', + 'GA': 'Gabon', + 'GM': 'Gambia', + 'GE': 'Georgia', + 'DE': 'Germany', + 'GH': 'Ghana', + 'GI': 'Gibraltar', + 'GR': 'Greece', + 'GL': 'Greenland', + 'GD': 'Grenada', + 'GP': 'Guadeloupe', + 'GU': 'Guam', + 'GT': 'Guatemala', + 'GG': 'Guernsey', + 'GN': 'Guinea', + 'GW': 'Guinea-Bissau', + 'GY': 'Guyana', + 'HT': 'Haiti', + 'HM': 'Heard Island and McDonald Islands', + 'VA': 'Holy See (Vatican City State)', + 'HN': 'Honduras', + 'HK': 'Hong Kong', + 'HU': 'Hungary', + 'IS': 'Iceland', + 'IN': 'India', + 'ID': 'Indonesia', + 'IR': 'Iran, Islamic Republic of', + 'IQ': 'Iraq', + 'IE': 'Ireland', + 'IM': 'Isle of Man', + 'IL': 'Israel', + 'IT': 'Italy', + 'JM': 'Jamaica', + 'JP': 'Japan', + 'JE': 'Jersey', + 'JO': 'Jordan', + 'KZ': 'Kazakhstan', + 'KE': 'Kenya', + 'KI': 
'Kiribati', + 'KP': 'Korea, Democratic People\'s Republic of', + 'KR': 'Korea, Republic of', + 'KW': 'Kuwait', + 'KG': 'Kyrgyzstan', + 'LA': 'Lao People\'s Democratic Republic', + 'LV': 'Latvia', + 'LB': 'Lebanon', + 'LS': 'Lesotho', + 'LR': 'Liberia', + 'LY': 'Libya', + 'LI': 'Liechtenstein', + 'LT': 'Lithuania', + 'LU': 'Luxembourg', + 'MO': 'Macao', + 'MK': 'Macedonia, the Former Yugoslav Republic of', + 'MG': 'Madagascar', + 'MW': 'Malawi', + 'MY': 'Malaysia', + 'MV': 'Maldives', + 'ML': 'Mali', + 'MT': 'Malta', + 'MH': 'Marshall Islands', + 'MQ': 'Martinique', + 'MR': 'Mauritania', + 'MU': 'Mauritius', + 'YT': 'Mayotte', + 'MX': 'Mexico', + 'FM': 'Micronesia, Federated States of', + 'MD': 'Moldova, Republic of', + 'MC': 'Monaco', + 'MN': 'Mongolia', + 'ME': 'Montenegro', + 'MS': 'Montserrat', + 'MA': 'Morocco', + 'MZ': 'Mozambique', + 'MM': 'Myanmar', + 'NA': 'Namibia', + 'NR': 'Nauru', + 'NP': 'Nepal', + 'NL': 'Netherlands', + 'NC': 'New Caledonia', + 'NZ': 'New Zealand', + 'NI': 'Nicaragua', + 'NE': 'Niger', + 'NG': 'Nigeria', + 'NU': 'Niue', + 'NF': 'Norfolk Island', + 'MP': 'Northern Mariana Islands', + 'NO': 'Norway', + 'OM': 'Oman', + 'PK': 'Pakistan', + 'PW': 'Palau', + 'PS': 'Palestine, State of', + 'PA': 'Panama', + 'PG': 'Papua New Guinea', + 'PY': 'Paraguay', + 'PE': 'Peru', + 'PH': 'Philippines', + 'PN': 'Pitcairn', + 'PL': 'Poland', + 'PT': 'Portugal', + 'PR': 'Puerto Rico', + 'QA': 'Qatar', + 'RE': 'Réunion', + 'RO': 'Romania', + 'RU': 'Russian Federation', + 'RW': 'Rwanda', + 'BL': 'Saint Barthélemy', + 'SH': 'Saint Helena, Ascension and Tristan da Cunha', + 'KN': 'Saint Kitts and Nevis', + 'LC': 'Saint Lucia', + 'MF': 'Saint Martin (French part)', + 'PM': 'Saint Pierre and Miquelon', + 'VC': 'Saint Vincent and the Grenadines', + 'WS': 'Samoa', + 'SM': 'San Marino', + 'ST': 'Sao Tome and Principe', + 'SA': 'Saudi Arabia', + 'SN': 'Senegal', + 'RS': 'Serbia', + 'SC': 'Seychelles', + 'SL': 'Sierra Leone', + 'SG': 'Singapore', + 'SX': 'Sint Maarten 
(Dutch part)', + 'SK': 'Slovakia', + 'SI': 'Slovenia', + 'SB': 'Solomon Islands', + 'SO': 'Somalia', + 'ZA': 'South Africa', + 'GS': 'South Georgia and the South Sandwich Islands', + 'SS': 'South Sudan', + 'ES': 'Spain', + 'LK': 'Sri Lanka', + 'SD': 'Sudan', + 'SR': 'Suriname', + 'SJ': 'Svalbard and Jan Mayen', + 'SZ': 'Swaziland', + 'SE': 'Sweden', + 'CH': 'Switzerland', + 'SY': 'Syrian Arab Republic', + 'TW': 'Taiwan, Province of China', + 'TJ': 'Tajikistan', + 'TZ': 'Tanzania, United Republic of', + 'TH': 'Thailand', + 'TL': 'Timor-Leste', + 'TG': 'Togo', + 'TK': 'Tokelau', + 'TO': 'Tonga', + 'TT': 'Trinidad and Tobago', + 'TN': 'Tunisia', + 'TR': 'Turkey', + 'TM': 'Turkmenistan', + 'TC': 'Turks and Caicos Islands', + 'TV': 'Tuvalu', + 'UG': 'Uganda', + 'UA': 'Ukraine', + 'AE': 'United Arab Emirates', + 'GB': 'United Kingdom', + 'US': 'United States', + 'UM': 'United States Minor Outlying Islands', + 'UY': 'Uruguay', + 'UZ': 'Uzbekistan', + 'VU': 'Vanuatu', + 'VE': 'Venezuela, Bolivarian Republic of', + 'VN': 'Viet Nam', + 'VG': 'Virgin Islands, British', + 'VI': 'Virgin Islands, U.S.', + 'WF': 'Wallis and Futuna', + 'EH': 'Western Sahara', + 'YE': 'Yemen', + 'ZM': 'Zambia', + 'ZW': 'Zimbabwe', + # Not ISO 3166 codes, but used for IP blocks + 'AP': 'Asia/Pacific Region', + 'EU': 'Europe', + } + + @classmethod + def short2full(cls, code): + """Convert an ISO 3166-2 country code to the corresponding full name""" + return cls._country_map.get(code.upper()) + + +class GeoUtils: + # Major IPv4 address blocks per country + _country_ip_map = { + 'AD': '46.172.224.0/19', + 'AE': '94.200.0.0/13', + 'AF': '149.54.0.0/17', + 'AG': '209.59.64.0/18', + 'AI': '204.14.248.0/21', + 'AL': '46.99.0.0/16', + 'AM': '46.70.0.0/15', + 'AO': '105.168.0.0/13', + 'AP': '182.50.184.0/21', + 'AQ': '23.154.160.0/24', + 'AR': '181.0.0.0/12', + 'AS': '202.70.112.0/20', + 'AT': '77.116.0.0/14', + 'AU': '1.128.0.0/11', + 'AW': '181.41.0.0/18', + 'AX': '185.217.4.0/22', + 'AZ': '5.197.0.0/16', 
+ 'BA': '31.176.128.0/17', + 'BB': '65.48.128.0/17', + 'BD': '114.130.0.0/16', + 'BE': '57.0.0.0/8', + 'BF': '102.178.0.0/15', + 'BG': '95.42.0.0/15', + 'BH': '37.131.0.0/17', + 'BI': '154.117.192.0/18', + 'BJ': '137.255.0.0/16', + 'BL': '185.212.72.0/23', + 'BM': '196.12.64.0/18', + 'BN': '156.31.0.0/16', + 'BO': '161.56.0.0/16', + 'BQ': '161.0.80.0/20', + 'BR': '191.128.0.0/12', + 'BS': '24.51.64.0/18', + 'BT': '119.2.96.0/19', + 'BW': '168.167.0.0/16', + 'BY': '178.120.0.0/13', + 'BZ': '179.42.192.0/18', + 'CA': '99.224.0.0/11', + 'CD': '41.243.0.0/16', + 'CF': '197.242.176.0/21', + 'CG': '160.113.0.0/16', + 'CH': '85.0.0.0/13', + 'CI': '102.136.0.0/14', + 'CK': '202.65.32.0/19', + 'CL': '152.172.0.0/14', + 'CM': '102.244.0.0/14', + 'CN': '36.128.0.0/10', + 'CO': '181.240.0.0/12', + 'CR': '201.192.0.0/12', + 'CU': '152.206.0.0/15', + 'CV': '165.90.96.0/19', + 'CW': '190.88.128.0/17', + 'CY': '31.153.0.0/16', + 'CZ': '88.100.0.0/14', + 'DE': '53.0.0.0/8', + 'DJ': '197.241.0.0/17', + 'DK': '87.48.0.0/12', + 'DM': '192.243.48.0/20', + 'DO': '152.166.0.0/15', + 'DZ': '41.96.0.0/12', + 'EC': '186.68.0.0/15', + 'EE': '90.190.0.0/15', + 'EG': '156.160.0.0/11', + 'ER': '196.200.96.0/20', + 'ES': '88.0.0.0/11', + 'ET': '196.188.0.0/14', + 'EU': '2.16.0.0/13', + 'FI': '91.152.0.0/13', + 'FJ': '144.120.0.0/16', + 'FK': '80.73.208.0/21', + 'FM': '119.252.112.0/20', + 'FO': '88.85.32.0/19', + 'FR': '90.0.0.0/9', + 'GA': '41.158.0.0/15', + 'GB': '25.0.0.0/8', + 'GD': '74.122.88.0/21', + 'GE': '31.146.0.0/16', + 'GF': '161.22.64.0/18', + 'GG': '62.68.160.0/19', + 'GH': '154.160.0.0/12', + 'GI': '95.164.0.0/16', + 'GL': '88.83.0.0/19', + 'GM': '160.182.0.0/15', + 'GN': '197.149.192.0/18', + 'GP': '104.250.0.0/19', + 'GQ': '105.235.224.0/20', + 'GR': '94.64.0.0/13', + 'GT': '168.234.0.0/16', + 'GU': '168.123.0.0/16', + 'GW': '197.214.80.0/20', + 'GY': '181.41.64.0/18', + 'HK': '113.252.0.0/14', + 'HN': '181.210.0.0/16', + 'HR': '93.136.0.0/13', + 'HT': '148.102.128.0/17', + 
'HU': '84.0.0.0/14', + 'ID': '39.192.0.0/10', + 'IE': '87.32.0.0/12', + 'IL': '79.176.0.0/13', + 'IM': '5.62.80.0/20', + 'IN': '117.192.0.0/10', + 'IO': '203.83.48.0/21', + 'IQ': '37.236.0.0/14', + 'IR': '2.176.0.0/12', + 'IS': '82.221.0.0/16', + 'IT': '79.0.0.0/10', + 'JE': '87.244.64.0/18', + 'JM': '72.27.0.0/17', + 'JO': '176.29.0.0/16', + 'JP': '133.0.0.0/8', + 'KE': '105.48.0.0/12', + 'KG': '158.181.128.0/17', + 'KH': '36.37.128.0/17', + 'KI': '103.25.140.0/22', + 'KM': '197.255.224.0/20', + 'KN': '198.167.192.0/19', + 'KP': '175.45.176.0/22', + 'KR': '175.192.0.0/10', + 'KW': '37.36.0.0/14', + 'KY': '64.96.0.0/15', + 'KZ': '2.72.0.0/13', + 'LA': '115.84.64.0/18', + 'LB': '178.135.0.0/16', + 'LC': '24.92.144.0/20', + 'LI': '82.117.0.0/19', + 'LK': '112.134.0.0/15', + 'LR': '102.183.0.0/16', + 'LS': '129.232.0.0/17', + 'LT': '78.56.0.0/13', + 'LU': '188.42.0.0/16', + 'LV': '46.109.0.0/16', + 'LY': '41.252.0.0/14', + 'MA': '105.128.0.0/11', + 'MC': '88.209.64.0/18', + 'MD': '37.246.0.0/16', + 'ME': '178.175.0.0/17', + 'MF': '74.112.232.0/21', + 'MG': '154.126.0.0/17', + 'MH': '117.103.88.0/21', + 'MK': '77.28.0.0/15', + 'ML': '154.118.128.0/18', + 'MM': '37.111.0.0/17', + 'MN': '49.0.128.0/17', + 'MO': '60.246.0.0/16', + 'MP': '202.88.64.0/20', + 'MQ': '109.203.224.0/19', + 'MR': '41.188.64.0/18', + 'MS': '208.90.112.0/22', + 'MT': '46.11.0.0/16', + 'MU': '105.16.0.0/12', + 'MV': '27.114.128.0/18', + 'MW': '102.70.0.0/15', + 'MX': '187.192.0.0/11', + 'MY': '175.136.0.0/13', + 'MZ': '197.218.0.0/15', + 'NA': '41.182.0.0/16', + 'NC': '101.101.0.0/18', + 'NE': '197.214.0.0/18', + 'NF': '203.17.240.0/22', + 'NG': '105.112.0.0/12', + 'NI': '186.76.0.0/15', + 'NL': '145.96.0.0/11', + 'NO': '84.208.0.0/13', + 'NP': '36.252.0.0/15', + 'NR': '203.98.224.0/19', + 'NU': '49.156.48.0/22', + 'NZ': '49.224.0.0/14', + 'OM': '5.36.0.0/15', + 'PA': '186.72.0.0/15', + 'PE': '186.160.0.0/14', + 'PF': '123.50.64.0/18', + 'PG': '124.240.192.0/19', + 'PH': '49.144.0.0/13', + 'PK': 
'39.32.0.0/11', + 'PL': '83.0.0.0/11', + 'PM': '70.36.0.0/20', + 'PR': '66.50.0.0/16', + 'PS': '188.161.0.0/16', + 'PT': '85.240.0.0/13', + 'PW': '202.124.224.0/20', + 'PY': '181.120.0.0/14', + 'QA': '37.210.0.0/15', + 'RE': '102.35.0.0/16', + 'RO': '79.112.0.0/13', + 'RS': '93.86.0.0/15', + 'RU': '5.136.0.0/13', + 'RW': '41.186.0.0/16', + 'SA': '188.48.0.0/13', + 'SB': '202.1.160.0/19', + 'SC': '154.192.0.0/11', + 'SD': '102.120.0.0/13', + 'SE': '78.64.0.0/12', + 'SG': '8.128.0.0/10', + 'SI': '188.196.0.0/14', + 'SK': '78.98.0.0/15', + 'SL': '102.143.0.0/17', + 'SM': '89.186.32.0/19', + 'SN': '41.82.0.0/15', + 'SO': '154.115.192.0/18', + 'SR': '186.179.128.0/17', + 'SS': '105.235.208.0/21', + 'ST': '197.159.160.0/19', + 'SV': '168.243.0.0/16', + 'SX': '190.102.0.0/20', + 'SY': '5.0.0.0/16', + 'SZ': '41.84.224.0/19', + 'TC': '65.255.48.0/20', + 'TD': '154.68.128.0/19', + 'TG': '196.168.0.0/14', + 'TH': '171.96.0.0/13', + 'TJ': '85.9.128.0/18', + 'TK': '27.96.24.0/21', + 'TL': '180.189.160.0/20', + 'TM': '95.85.96.0/19', + 'TN': '197.0.0.0/11', + 'TO': '175.176.144.0/21', + 'TR': '78.160.0.0/11', + 'TT': '186.44.0.0/15', + 'TV': '202.2.96.0/19', + 'TW': '120.96.0.0/11', + 'TZ': '156.156.0.0/14', + 'UA': '37.52.0.0/14', + 'UG': '102.80.0.0/13', + 'US': '6.0.0.0/8', + 'UY': '167.56.0.0/13', + 'UZ': '84.54.64.0/18', + 'VA': '212.77.0.0/19', + 'VC': '207.191.240.0/21', + 'VE': '186.88.0.0/13', + 'VG': '66.81.192.0/20', + 'VI': '146.226.0.0/16', + 'VN': '14.160.0.0/11', + 'VU': '202.80.32.0/20', + 'WF': '117.20.32.0/21', + 'WS': '202.4.32.0/19', + 'YE': '134.35.0.0/16', + 'YT': '41.242.116.0/22', + 'ZA': '41.0.0.0/11', + 'ZM': '102.144.0.0/13', + 'ZW': '102.177.192.0/18', + } + + @classmethod + def random_ipv4(cls, code_or_block): + if len(code_or_block) == 2: + block = cls._country_ip_map.get(code_or_block.upper()) + if not block: + return None + else: + block = code_or_block + addr, preflen = block.split('/') + addr_min = struct.unpack('!L', socket.inet_aton(addr))[0] 
+ addr_max = addr_min | (0xffffffff >> int(preflen)) + return str(socket.inet_ntoa( + struct.pack('!L', random.randint(addr_min, addr_max)))) + + +class PerRequestProxyHandler(urllib.request.ProxyHandler): + def __init__(self, proxies=None): + # Set default handlers + for type in ('http', 'https'): + setattr(self, '%s_open' % type, + lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open: + meth(r, proxy, type)) + urllib.request.ProxyHandler.__init__(self, proxies) + + def proxy_open(self, req, proxy, type): + req_proxy = req.headers.get('Ytdl-request-proxy') + if req_proxy is not None: + proxy = req_proxy + del req.headers['Ytdl-request-proxy'] + + if proxy == '__noproxy__': + return None # No Proxy + if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'): + req.add_header('Ytdl-socks-proxy', proxy) + # yt-dlp's http/https handlers do wrapping the socket with socks + return None + return urllib.request.ProxyHandler.proxy_open( + self, req, proxy, type) + + +# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is +# released into Public Domain +# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387 + +def long_to_bytes(n, blocksize=0): + """long_to_bytes(n:long, blocksize:int) : string + Convert a long integer to a byte string. + + If optional blocksize is given and greater than zero, pad the front of the + byte string with binary zeros so that the length is a multiple of + blocksize. + """ + # after much testing, this algorithm was deemed to be the fastest + s = b'' + n = int(n) + while n > 0: + s = struct.pack('>I', n & 0xffffffff) + s + n = n >> 32 + # strip off leading zeros + for i in range(len(s)): + if s[i] != b'\000'[0]: + break + else: + # only happens when n == 0 + s = b'\000' + i = 0 + s = s[i:] + # add back some pad bytes. this could be done more efficiently w.r.t. the + # de-padding being done above, but sigh... 
+ if blocksize > 0 and len(s) % blocksize: + s = (blocksize - len(s) % blocksize) * b'\000' + s + return s + + +def bytes_to_long(s): + """bytes_to_long(string) : long + Convert a byte string to a long integer. + + This is (essentially) the inverse of long_to_bytes(). + """ + acc = 0 + length = len(s) + if length % 4: + extra = (4 - length % 4) + s = b'\000' * extra + s + length = length + extra + for i in range(0, length, 4): + acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0] + return acc + + +def ohdave_rsa_encrypt(data, exponent, modulus): + ''' + Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/ + + Input: + data: data to encrypt, bytes-like object + exponent, modulus: parameter e and N of RSA algorithm, both integer + Output: hex string of encrypted data + + Limitation: supports one block encryption only + ''' + + payload = int(binascii.hexlify(data[::-1]), 16) + encrypted = pow(payload, exponent, modulus) + return '%x' % encrypted + + +def pkcs1pad(data, length): + """ + Padding input data with PKCS#1 scheme + + @param {int[]} data input data + @param {int} length target length + @returns {int[]} padded data + """ + if len(data) > length - 11: + raise ValueError('Input data too long for PKCS#1 padding') + + pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)] + return [0, 2] + pseudo_random + [0] + data + + +def _base_n_table(n, table): + if not table and not n: + raise ValueError('Either table or n must be specified') + table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n] + + if n and n != len(table): + raise ValueError(f'base {n} exceeds table length {len(table)}') + return table + + +def encode_base_n(num, n=None, table=None): + """Convert given int to a base-n string""" + table = _base_n_table(n, table) + if not num: + return table[0] + + result, base = '', len(table) + while num: + result = table[num % base] + result + num = num // base + return result + + +def 
decode_base_n(string, n=None, table=None): + """Convert given base-n string to int""" + table = {char: index for index, char in enumerate(_base_n_table(n, table))} + result, base = 0, len(table) + for char in string: + result = result * base + table[char] + return result + + +def decode_packed_codes(code): + mobj = re.search(PACKED_CODES_RE, code) + obfuscated_code, base, count, symbols = mobj.groups() + base = int(base) + count = int(count) + symbols = symbols.split('|') + symbol_table = {} + + while count: + count -= 1 + base_n_count = encode_base_n(count, base) + symbol_table[base_n_count] = symbols[count] or base_n_count + + return re.sub( + r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)], + obfuscated_code) + + +def caesar(s, alphabet, shift): + if shift == 0: + return s + l = len(alphabet) + return ''.join( + alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c + for c in s) + + +def rot47(s): + return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47) + + +def parse_m3u8_attributes(attrib): + info = {} + for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib): + if val.startswith('"'): + val = val[1:-1] + info[key] = val + return info + + +def urshift(val, n): + return val >> n if val >= 0 else (val + 0x100000000) >> n + + +def write_xattr(path, key, value): + # Windows: Write xattrs to NTFS Alternate Data Streams: + # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29 + if compat_os_name == 'nt': + assert ':' not in key + assert os.path.exists(path) + + try: + with open(f'{path}:{key}', 'wb') as f: + f.write(value) + except OSError as e: + raise XAttrMetadataError(e.errno, e.strerror) + return + + # UNIX Method 1. 
Use xattrs/pyxattrs modules + + setxattr = None + if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr': + # Unicode arguments are not supported in pyxattr until version 0.5.0 + # See https://github.com/ytdl-org/youtube-dl/issues/5498 + if version_tuple(xattr.__version__) >= (0, 5, 0): + setxattr = xattr.set + elif xattr: + setxattr = xattr.setxattr + + if setxattr: + try: + setxattr(path, key, value) + except OSError as e: + raise XAttrMetadataError(e.errno, e.strerror) + return + + # UNIX Method 2. Use setfattr/xattr executables + exe = ('setfattr' if check_executable('setfattr', ['--version']) + else 'xattr' if check_executable('xattr', ['-h']) else None) + if not exe: + raise XAttrUnavailableError( + 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the ' + + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)')) + + value = value.decode() + try: + _, stderr, returncode = Popen.run( + [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path], + text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) + except OSError as e: + raise XAttrMetadataError(e.errno, e.strerror) + if returncode: + raise XAttrMetadataError(returncode, stderr) + + +def random_birthday(year_field, month_field, day_field): + start_date = datetime.date(1950, 1, 1) + end_date = datetime.date(1995, 12, 31) + offset = random.randint(0, (end_date - start_date).days) + random_date = start_date + datetime.timedelta(offset) + return { + year_field: str(random_date.year), + month_field: str(random_date.month), + day_field: str(random_date.day), + } + + +def find_available_port(interface=''): + try: + with socket.socket() as sock: + sock.bind((interface, 0)) + return sock.getsockname()[1] + except OSError: + return None + + +# Templates for internet shortcut files, which are plain text files. 
+DOT_URL_LINK_TEMPLATE = '''\ +[InternetShortcut] +URL=%(url)s +''' + +DOT_WEBLOC_LINK_TEMPLATE = '''\ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd"> +<plist version="1.0"> +<dict> +\t<key>URL</key> +\t<string>%(url)s</string> +</dict> +</plist> +''' + +DOT_DESKTOP_LINK_TEMPLATE = '''\ +[Desktop Entry] +Encoding=UTF-8 +Name=%(filename)s +Type=Link +URL=%(url)s +Icon=text-html +''' + +LINK_TEMPLATES = { + 'url': DOT_URL_LINK_TEMPLATE, + 'desktop': DOT_DESKTOP_LINK_TEMPLATE, + 'webloc': DOT_WEBLOC_LINK_TEMPLATE, +} + + +def iri_to_uri(iri): + """ + Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only). + + The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact. + """ + + iri_parts = urllib.parse.urlparse(iri) + + if '[' in iri_parts.netloc: + raise ValueError('IPv6 URIs are not, yet, supported.') + # Querying `.netloc`, when there's only one bracket, also raises a ValueError. + + # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is. + + net_location = '' + if iri_parts.username: + net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~") + if iri_parts.password is not None: + net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~") + net_location += '@' + + net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames. + # The 'idna' encoding produces ASCII text. 
+ if iri_parts.port is not None and iri_parts.port != 80: + net_location += ':' + str(iri_parts.port) + + return urllib.parse.urlunparse( + (iri_parts.scheme, + net_location, + + urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"), + + # Unsure about the `safe` argument, since this is a legacy way of handling parameters. + urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"), + + # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component. + urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"), + + urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~"))) + + # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes. + + +def to_high_limit_path(path): + if sys.platform in ['win32', 'cygwin']: + # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited. + return '\\\\?\\' + os.path.abspath(path) + + return path + + +def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY): + val = traversal.traverse_obj(obj, *variadic(field)) + if not val if ignore is NO_DEFAULT else val in variadic(ignore): + return default + return template % func(val) + + +def clean_podcast_url(url): + return re.sub(r'''(?x) + (?: + (?: + chtbl\.com/track| + media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/ + play\.podtrac\.com + )/[^/]+| + (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure + flex\.acast\.com| + pd(?: + cn\.co| # https://podcorn.com/analytics-prefix/ + st\.fm # https://podsights.com/docs/ + )/e + )/''', '', url) + + +_HEX_TABLE = '0123456789abcdef' + + +def random_uuidv4(): + return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx') + + +def make_dir(path, 
to_screen=None): + try: + dn = os.path.dirname(path) + if dn: + os.makedirs(dn, exist_ok=True) + return True + except OSError as err: + if callable(to_screen) is not None: + to_screen(f'unable to create directory {err}') + return False + + +def get_executable_path(): + from ..update import _get_variant_and_executable_path + + return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1])) + + +def get_user_config_dirs(package_name): + # .config (e.g. ~/.config/package_name) + xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config') + yield os.path.join(xdg_config_home, package_name) + + # appdata (%APPDATA%/package_name) + appdata_dir = os.getenv('appdata') + if appdata_dir: + yield os.path.join(appdata_dir, package_name) + + # home (~/.package_name) + yield os.path.join(compat_expanduser('~'), f'.{package_name}') + + +def get_system_config_dirs(package_name): + # /etc/package_name + yield os.path.join('/etc', package_name) + + +def time_seconds(**kwargs): + """ + Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z) + """ + return time.time() + datetime.timedelta(**kwargs).total_seconds() + + +# create a JSON Web Signature (jws) with HS256 algorithm +# the resulting format is in JWS Compact Serialization +# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html +# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html +def jwt_encode_hs256(payload_data, key, headers={}): + header_data = { + 'alg': 'HS256', + 'typ': 'JWT', + } + if headers: + header_data.update(headers) + header_b64 = base64.b64encode(json.dumps(header_data).encode()) + payload_b64 = base64.b64encode(json.dumps(payload_data).encode()) + h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256) + signature_b64 = base64.b64encode(h.digest()) + token = header_b64 + b'.' + payload_b64 + b'.' 
+ signature_b64 + return token + + +# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256 +def jwt_decode_hs256(jwt): + header_b64, payload_b64, signature_b64 = jwt.split('.') + # add trailing ='s that may have been stripped, superfluous ='s are ignored + payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}===')) + return payload_data + + +WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None + + +@functools.cache +def supports_terminal_sequences(stream): + if compat_os_name == 'nt': + if not WINDOWS_VT_MODE: + return False + elif not os.getenv('TERM'): + return False + try: + return stream.isatty() + except BaseException: + return False + + +def windows_enable_vt_mode(): + """Ref: https://bugs.python.org/issue30075 """ + if get_windows_version() < (10, 0, 10586): + return + + import ctypes + import ctypes.wintypes + import msvcrt + + ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004 + + dll = ctypes.WinDLL('kernel32', use_last_error=False) + handle = os.open('CONOUT$', os.O_RDWR) + try: + h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle)) + dw_original_mode = ctypes.wintypes.DWORD() + success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode)) + if not success: + raise Exception('GetConsoleMode failed') + + success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD( + dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING)) + if not success: + raise Exception('SetConsoleMode failed') + finally: + os.close(handle) + + global WINDOWS_VT_MODE + WINDOWS_VT_MODE = True + supports_terminal_sequences.cache_clear() + + +_terminal_sequences_re = re.compile('\033\\[[^m]+m') + + +def remove_terminal_sequences(string): + return _terminal_sequences_re.sub('', string) + + +def number_of_digits(number): + return len('%d' % number) + + +def join_nonempty(*values, delim='-', from_dict=None): + if from_dict is not None: + values = (traversal.traverse_obj(from_dict, variadic(v)) for v 
in values) + return delim.join(map(str, filter(None, values))) + + +def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re): + """ + Find the largest format dimensions in terms of video width and, for each thumbnail: + * Modify the URL: Match the width with the provided regex and replace with the former width + * Update dimensions + + This function is useful with video services that scale the provided thumbnails on demand + """ + _keys = ('width', 'height') + max_dimensions = max( + (tuple(format.get(k) or 0 for k in _keys) for format in formats), + default=(0, 0)) + if not max_dimensions[0]: + return thumbnails + return [ + merge_dicts( + {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])}, + dict(zip(_keys, max_dimensions)), thumbnail) + for thumbnail in thumbnails + ] + + +def parse_http_range(range): + """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """ + if not range: + return None, None, None + crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range) + if not crg: + return None, None, None + return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3)) + + +def read_stdin(what): + eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D' + write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n') + return sys.stdin + + +def determine_file_encoding(data): + """ + Detect the text encoding used + @returns (encoding, bytes to skip) + """ + + # BOM marks are given priority over declarations + for bom, enc in BOMS: + if data.startswith(bom): + return enc, len(bom) + + # Strip off all null bytes to match even when UTF-16 or UTF-32 is used. 
+ # We ignore the endianness to get a good enough match + data = data.replace(b'\0', b'') + mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data) + return mobj.group(1).decode() if mobj else None, 0 + + +class Config: + own_args = None + parsed_args = None + filename = None + __initialized = False + + def __init__(self, parser, label=None): + self.parser, self.label = parser, label + self._loaded_paths, self.configs = set(), [] + + def init(self, args=None, filename=None): + assert not self.__initialized + self.own_args, self.filename = args, filename + return self.load_configs() + + def load_configs(self): + directory = '' + if self.filename: + location = os.path.realpath(self.filename) + directory = os.path.dirname(location) + if location in self._loaded_paths: + return False + self._loaded_paths.add(location) + + self.__initialized = True + opts, _ = self.parser.parse_known_args(self.own_args) + self.parsed_args = self.own_args + for location in opts.config_locations or []: + if location == '-': + if location in self._loaded_paths: + continue + self._loaded_paths.add(location) + self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin') + continue + location = os.path.join(directory, expand_path(location)) + if os.path.isdir(location): + location = os.path.join(location, 'yt-dlp.conf') + if not os.path.exists(location): + self.parser.error(f'config location {location} does not exist') + self.append_config(self.read_file(location), location) + return True + + def __str__(self): + label = join_nonempty( + self.label, 'config', f'"{self.filename}"' if self.filename else '', + delim=' ') + return join_nonempty( + self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}', + *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs), + delim='\n') + + @staticmethod + def read_file(filename, default=[]): + try: + optionf = open(filename, 'rb') + except OSError: + return default # silently skip 
if file is not present + try: + enc, skip = determine_file_encoding(optionf.read(512)) + optionf.seek(skip, io.SEEK_SET) + except OSError: + enc = None # silently skip read errors + try: + # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56 + contents = optionf.read().decode(enc or preferredencoding()) + res = shlex.split(contents, comments=True) + except Exception as err: + raise ValueError(f'Unable to parse "{filename}": {err}') + finally: + optionf.close() + return res + + @staticmethod + def hide_login_info(opts): + PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'} + eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$') + + def _scrub_eq(o): + m = eqre.match(o) + if m: + return m.group('key') + '=PRIVATE' + else: + return o + + opts = list(map(_scrub_eq, opts)) + for idx, opt in enumerate(opts): + if opt in PRIVATE_OPTS and idx + 1 < len(opts): + opts[idx + 1] = 'PRIVATE' + return opts + + def append_config(self, *args, label=None): + config = type(self)(self.parser, label) + config._loaded_paths = self._loaded_paths + if config.init(*args): + self.configs.append(config) + + @property + def all_args(self): + for config in reversed(self.configs): + yield from config.all_args + yield from self.parsed_args or [] + + def parse_known_args(self, **kwargs): + return self.parser.parse_known_args(self.all_args, **kwargs) + + def parse_args(self): + return self.parser.parse_args(self.all_args) + + +class WebSocketsWrapper: + """Wraps websockets module to use in non-async scopes""" + pool = None + + def __init__(self, url, headers=None, connect=True): + self.loop = asyncio.new_event_loop() + # XXX: "loop" is deprecated + self.conn = websockets.connect( + url, extra_headers=headers, ping_interval=None, + close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf')) + if connect: + self.__enter__() + 
atexit.register(self.__exit__, None, None, None) + + def __enter__(self): + if not self.pool: + self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop) + return self + + def send(self, *args): + self.run_with_loop(self.pool.send(*args), self.loop) + + def recv(self, *args): + return self.run_with_loop(self.pool.recv(*args), self.loop) + + def __exit__(self, type, value, traceback): + try: + return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop) + finally: + self.loop.close() + self._cancel_all_tasks(self.loop) + + # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications + # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class + @staticmethod + def run_with_loop(main, loop): + if not asyncio.iscoroutine(main): + raise ValueError(f'a coroutine was expected, got {main!r}') + + try: + return loop.run_until_complete(main) + finally: + loop.run_until_complete(loop.shutdown_asyncgens()) + if hasattr(loop, 'shutdown_default_executor'): + loop.run_until_complete(loop.shutdown_default_executor()) + + @staticmethod + def _cancel_all_tasks(loop): + to_cancel = asyncio.all_tasks(loop) + + if not to_cancel: + return + + for task in to_cancel: + task.cancel() + + # XXX: "loop" is removed in python 3.10+ + loop.run_until_complete( + asyncio.gather(*to_cancel, loop=loop, return_exceptions=True)) + + for task in to_cancel: + if task.cancelled(): + continue + if task.exception() is not None: + loop.call_exception_handler({ + 'message': 'unhandled exception during asyncio.run() shutdown', + 'exception': task.exception(), + 'task': task, + }) + + +def merge_headers(*dicts): + """Merge dicts of http headers case insensitively, prioritizing the latter ones""" + return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))} + + +def cached_method(f): + """Cache a method""" + signature = inspect.signature(f) + + 
class classproperty:
    """Descriptor that computes an attribute from the owning class

    Usable bare (`@classproperty`) or with options (`@classproperty(cache=True)`).
    With `cache=True` the wrapped function runs once per class and the result is reused.
    """
    def __new__(cls, func=None, *args, **kwargs):
        if func:
            return super().__new__(cls)
        # Called with options only: hand back a decorator awaiting the function
        return functools.partial(cls, *args, **kwargs)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        # `None` disables caching; otherwise results are stored per class
        self._cache = {} if cache else None

    def __get__(self, _, cls):
        store = self._cache
        if store is None:
            return self.func(cls)
        if cls not in store:
            store[cls] = self.func(cls)
        return store[cls]
class RetryManager:
    """Iterate over the attempts of an operation that may need retrying

    Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue

    Iteration stops once an attempt finishes without `error` being set,
    or when the number of attempts exceeds `retries`.
    """
    attempt = 0
    _error = None

    def __init__(self, _retries, _error_callback, **kwargs):
        self.retries = _retries or 0
        # Extra keyword arguments are forwarded to the callback on every call
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        # NO_DEFAULT marks "the last attempt ended without an error"
        if self._error is NO_DEFAULT:
            return False
        return self.attempt <= self.retries

    @property
    def error(self):
        return None if self._error is NO_DEFAULT else self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while True:
            if not self._should_retry():
                return
            self.error = NO_DEFAULT
            self.attempt += 1
            yield self
            # Only reached when the caller asks for the next attempt
            if self.error:
                self.error_callback(self.error, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries

        @param e           The error of the failed attempt
        @param count       Number of attempts made so far
        @param retries     Maximum number of retries
        @param sleep_func  Delay before the next attempt, or a callable taking `n` that returns it
        @param info, warn  Callables used for reporting
        @param error       Callable used when giving up; if not given, `e` is raised instead
        @param suffix      Text appended to the "Retrying" message
        """
        if count > retries:
            if not error:
                raise e
            if count > 1:
                return error(f'{e}. Giving up after {count - 1} retries')
            return error(str(e))

        if not count:
            return warn(e)

        if isinstance(e, ExtractorError):
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        if callable(sleep_func):
            delay = float_or_none(sleep_func(n=count - 1))
        else:
            delay = sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
*$' + + default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality', + 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec', + 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases + ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr', + 'height', 'width', 'proto', 'vext', 'abr', 'aext', + 'fps', 'fs_approx', 'source', 'id') + + settings = { + 'vcodec': {'type': 'ordered', 'regex': True, + 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']}, + 'acodec': {'type': 'ordered', 'regex': True, + 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']}, + 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range', + 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]}, + 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol', + 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']}, + 'vext': {'type': 'ordered', 'field': 'video_ext', + 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'), + 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')}, + 'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext', + 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'), + 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')}, + 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000}, + 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple', + 'field': ('vcodec', 'acodec'), + 'function': lambda it: int(any(v != 'none' for v in it))}, + 'ie_pref': {'priority': True, 'type': 'extractor'}, + 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)}, + 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)}, + 'lang': 
{'convert': 'float', 'field': 'language_preference', 'default': -1}, + 'quality': {'convert': 'float', 'default': -1}, + 'filesize': {'convert': 'bytes'}, + 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'}, + 'id': {'convert': 'string', 'field': 'format_id'}, + 'height': {'convert': 'float_none'}, + 'width': {'convert': 'float_none'}, + 'fps': {'convert': 'float_none'}, + 'channels': {'convert': 'float_none', 'field': 'audio_channels'}, + 'tbr': {'convert': 'float_none'}, + 'vbr': {'convert': 'float_none'}, + 'abr': {'convert': 'float_none'}, + 'asr': {'convert': 'float_none'}, + 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1}, + + 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')}, + 'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), + 'function': lambda it: next(filter(None, it), None)}, + 'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), + 'function': lambda it: next(filter(None, it), None)}, + 'ext': {'type': 'combined', 'field': ('vext', 'aext')}, + 'res': {'type': 'multiple', 'field': ('height', 'width'), + 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))}, + + # Actual field names + 'format_id': {'type': 'alias', 'field': 'id'}, + 'preference': {'type': 'alias', 'field': 'ie_pref'}, + 'language_preference': {'type': 'alias', 'field': 'lang'}, + 'source_preference': {'type': 'alias', 'field': 'source'}, + 'protocol': {'type': 'alias', 'field': 'proto'}, + 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'}, + 'audio_channels': {'type': 'alias', 'field': 'channels'}, + + # Deprecated + 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True}, + 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True}, + 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True}, + 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True}, + 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True}, + 'video_bitrate': 
def __init__(self, ydl, field_preference):
    """Set up the sorter and compute the field order right away

    @param ydl               Object providing `params` and `write_debug`
    @param field_preference  Sort fields handed to `evaluate_params` as its `sort_extractor` argument
    """
    self.ydl = ydl
    self._order = []  # populated by evaluate_params
    self.evaluate_params(self.ydl.params, field_preference)
    if ydl.params.get('verbose'):
        self.print_verbose_info(self.ydl.write_debug)
'order': [], 'not_in_list': (None,)}.get(key, None) + propObj[key] = default + return propObj[key] + + def _resolve_field_value(self, field, value, convertNone=False): + if value is None: + if not convertNone: + return None + else: + value = value.lower() + conversion = self._get_field_setting(field, 'convert') + if conversion == 'ignore': + return None + if conversion == 'string': + return value + elif conversion == 'float_none': + return float_or_none(value) + elif conversion == 'bytes': + return parse_bytes(value) + elif conversion == 'order': + order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order') + use_regex = self._get_field_setting(field, 'regex') + list_length = len(order_list) + empty_pos = order_list.index('') if '' in order_list else list_length + 1 + if use_regex and value is not None: + for i, regex in enumerate(order_list): + if regex and re.match(regex, value): + return list_length - i + return list_length - empty_pos # not in list + else: # not regex or value = None + return list_length - (order_list.index(value) if value in order_list else empty_pos) + else: + if value.isnumeric(): + return float(value) + else: + self.settings[field]['convert'] = 'string' + return value + + def evaluate_params(self, params, sort_extractor): + self._use_free_order = params.get('prefer_free_formats', False) + self._sort_user = params.get('format_sort', []) + self._sort_extractor = sort_extractor + + def add_item(field, reverse, closest, limit_text): + field = field.lower() + if field in self._order: + return + self._order.append(field) + limit = self._resolve_field_value(field, limit_text) + data = { + 'reverse': reverse, + 'closest': False if limit is None else closest, + 'limit_text': limit_text, + 'limit': limit} + if field in self.settings: + self.settings[field].update(data) + else: + self.settings[field] = data + + sort_list = ( + tuple(field for field in self.default if 
self._get_field_setting(field, 'forced')) + + (tuple() if params.get('format_sort_force', False) + else tuple(field for field in self.default if self._get_field_setting(field, 'priority'))) + + tuple(self._sort_user) + tuple(sort_extractor) + self.default) + + for item in sort_list: + match = re.match(self.regex, item) + if match is None: + raise ExtractorError('Invalid format sort string "%s" given by extractor' % item) + field = match.group('field') + if field is None: + continue + if self._get_field_setting(field, 'type') == 'alias': + alias, field = field, self._get_field_setting(field, 'field') + if self._get_field_setting(alias, 'deprecated'): + self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may ' + f'be removed in a future version. Please use {field} instead') + reverse = match.group('reverse') is not None + closest = match.group('separator') == '~' + limit_text = match.group('limit') + + has_limit = limit_text is not None + has_multiple_fields = self._get_field_setting(field, 'type') == 'combined' + has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit') + + fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,) + limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple() + limit_count = len(limits) + for (i, f) in enumerate(fields): + add_item(f, reverse, closest, + limits[i] if i < limit_count + else limits[0] if has_limit and not has_multiple_limits + else None) + + def print_verbose_info(self, write_debug): + if self._sort_user: + write_debug('Sort order given by user: %s' % ', '.join(self._sort_user)) + if self._sort_extractor: + write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor)) + write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % ( + '+' if self._get_field_setting(field, 'reverse') else '', field, + '%s%s(%s)' % ('~' if self._get_field_setting(field, 
def _calculate_field_preference_from_value(self, format, field, type, value):
    """Turn one field value into a tuple that can be compared against other formats

    @param format  Not read by this method
    @param field   Name of the sort field; its settings supply reverse/closest/limit
    @param type    'extractor', 'boolean' and 'ordered' get special handling,
                   any other type uses `value` as it is
    @param value   The raw value of the field
    @returns       `(-10, 0)` for a missing value, otherwise a 3-tuple whose first item
                   is 1 for non-numeric values, 0 for numeric values and -1 for numeric
                   values below the limit of a reversed field
    """
    reverse = self._get_field_setting(field, 'reverse')
    closest = self._get_field_setting(field, 'closest')
    limit = self._get_field_setting(field, 'limit')

    if type == 'extractor':
        maximum = self._get_field_setting(field, 'max')
        # Missing values and values at or above the configured maximum collapse to -1
        if value is None or (maximum is not None and value >= maximum):
            value = -1
    elif type == 'boolean':
        in_list = self._get_field_setting(field, 'in_list')
        not_in_list = self._get_field_setting(field, 'not_in_list')
        # 0 if the value passes both membership tests, -1 otherwise
        value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
    elif type == 'ordered':
        value = self._resolve_field_value(field, value, True)

    # try to convert to number
    val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
    is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
    if is_num:
        value = val_num

    # The conditions below are evaluated top to bottom; later ones rely on the earlier ones having failed
    return ((-10, 0) if value is None
            else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
            else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
            else (0, value, 0) if not reverse and (limit is None or value <= limit)
            else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
            else (-1, value, 0))
def calculate_preference(self, format):
    """Fill in derivable fields of `format` and return its sort key

    `format` is updated in place: 'protocol', 'ext', 'video_ext', 'audio_ext',
    'preference', 'vbr', 'abr' and 'tbr' may be set.

    @returns  A tuple with one entry per field in `self._order`
    """
    # Determine missing protocol
    if not format.get('protocol'):
        format['protocol'] = determine_protocol(format)

    # Determine missing ext
    if not format.get('ext') and 'url' in format:
        format['ext'] = determine_ext(format['url'])

    has_video = format.get('vcodec') != 'none'
    has_audio = format.get('acodec') != 'none'
    if has_video:
        format['video_ext'] = format['ext']
        format['audio_ext'] = 'none'
    else:
        format['audio_ext'] = format['ext'] if has_audio else 'none'
        format['video_ext'] = 'none'

    # HEVC-over-FLV is out-of-spec by FLV's original spec
    # ref. https://trac.ffmpeg.org/ticket/6389
    # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
    is_hevc_in_flv = (
        format.get('preference') is None
        and format.get('ext') == 'flv'
        and re.match('[hx]265|he?vc?', format.get('vcodec') or ''))
    if is_hevc_in_flv:
        format['preference'] = -100

    # Determine missing bitrates
    if not has_video:
        format['vbr'] = 0
    if not has_audio:
        format['abr'] = 0
    if has_video and not format.get('vbr'):
        format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
    if has_audio and not format.get('abr'):
        format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
    if not format.get('tbr'):
        format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None

    return tuple(
        self._calculate_field_preference(format, name)
        for name in self._order)
+ - `set`: Requires the only item in the set to be a type or function, + like `{type}`/`{func}`. If a `type`, returns only values + of this type. If a function, returns `func(obj)`. + - `str`/`int`: Return `obj[key]`. For `re.Match`, return `obj.group(key)`. + - `slice`: Branch out and return all values in `obj[key]`. + - `Ellipsis`: Branch out and return a list of all values. + - `tuple`/`list`: Branch out and return a list of all matching values. + Read as: `[traverse_obj(obj, branch) for branch in branches]`. + - `function`: Branch out and return values filtered by the function. + Read as: `[value for key, value in obj if function(key, value)]`. + For `Iterable`s, `key` is the index of the value. + For `re.Match`es, `key` is the group number (0 = full match) + as well as additionally any group names, if given. + - `dict` Transform the current object and return a matching dict. + Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`. + + `tuple`, `list`, and `dict` all support nested paths and branches. + + @params paths Paths which to traverse by. + @param default Value to return if the paths do not match. + If the last key in the path is a `dict`, it will apply to each value inside + the dict instead, depth first. Try to avoid if using nested `dict` keys. + @param expected_type If a `type`, only accept final values of this type. + If any other callable, try to call the function on each result. + If the last key in the path is a `dict`, it will apply to each value inside + the dict instead, recursively. This does respect branching paths. + @param get_all If `False`, return the first matching result, otherwise all matching ones. + @param casesense If `False`, consider string dictionary keys as case insensitive. + + The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API + + @param is_user_input Whether the keys are generated from user input. + If `True` strings get converted to `int`/`slice` if needed. 
+ @param traverse_string Whether to traverse into objects as strings. + If `True`, any non-compatible object will first be + converted into a string and then traversed into. + The return value of that path will be a string instead, + not respecting any further branching. + + + @returns The result of the object traversal. + If successful, `get_all=True`, and the path branches at least once, + then a list of results is returned instead. + If no `default` is given and the last path branches, a `list` of results + is always returned. If a path ends on a `dict` that result will always be a `dict`. + """ + casefold = lambda k: k.casefold() if isinstance(k, str) else k + + if isinstance(expected_type, type): + type_test = lambda val: val if isinstance(val, expected_type) else None + else: + type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,)) + + def apply_key(key, obj, is_last): + branching = False + result = None + + if obj is None and traverse_string: + if key is ... or callable(key) or isinstance(key, slice): + branching = True + result = () + + elif key is None: + result = obj + + elif isinstance(key, set): + assert len(key) == 1, 'Set should only be used to wrap a single item' + item = next(iter(key)) + if isinstance(item, type): + if isinstance(obj, item): + result = obj + else: + result = try_call(item, args=(obj,)) + + elif isinstance(key, (list, tuple)): + branching = True + result = itertools.chain.from_iterable( + apply_path(obj, branch, is_last)[0] for branch in key) + + elif key is ...: + branching = True + if isinstance(obj, collections.abc.Mapping): + result = obj.values() + elif is_iterable_like(obj): + result = obj + elif isinstance(obj, re.Match): + result = obj.groups() + elif traverse_string: + branching = False + result = str(obj) + else: + result = () + + elif callable(key): + branching = True + if isinstance(obj, collections.abc.Mapping): + iter_obj = obj.items() + elif is_iterable_like(obj): + iter_obj = enumerate(obj) + elif 
isinstance(obj, re.Match): + iter_obj = itertools.chain( + enumerate((obj.group(), *obj.groups())), + obj.groupdict().items()) + elif traverse_string: + branching = False + iter_obj = enumerate(str(obj)) + else: + iter_obj = () + + result = (v for k, v in iter_obj if try_call(key, args=(k, v))) + if not branching: # string traversal + result = ''.join(result) + + elif isinstance(key, dict): + iter_obj = ((k, _traverse_obj(obj, v, False, is_last)) for k, v in key.items()) + result = { + k: v if v is not None else default for k, v in iter_obj + if v is not None or default is not NO_DEFAULT + } or None + + elif isinstance(obj, collections.abc.Mapping): + result = (try_call(obj.get, args=(key,)) if casesense or try_call(obj.__contains__, args=(key,)) else + next((v for k, v in obj.items() if casefold(k) == key), None)) + + elif isinstance(obj, re.Match): + if isinstance(key, int) or casesense: + with contextlib.suppress(IndexError): + result = obj.group(key) + + elif isinstance(key, str): + result = next((v for k, v in obj.groupdict().items() if casefold(k) == key), None) + + elif isinstance(key, (int, slice)): + if is_iterable_like(obj, collections.abc.Sequence): + branching = isinstance(key, slice) + with contextlib.suppress(IndexError): + result = obj[key] + elif traverse_string: + with contextlib.suppress(IndexError): + result = str(obj)[key] + + return branching, result if branching else (result,) + + def lazy_last(iterable): + iterator = iter(iterable) + prev = next(iterator, NO_DEFAULT) + if prev is NO_DEFAULT: + return + + for item in iterator: + yield False, prev + prev = item + + yield True, prev + + def apply_path(start_obj, path, test_type): + objs = (start_obj,) + has_branched = False + + key = None + for last, key in lazy_last(variadic(path, (str, bytes, dict, set))): + if is_user_input and isinstance(key, str): + if key == ':': + key = ... 
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Return the first usable value found in `d` under any of the given keys

    @param d                  Mapping queried through its `get` method
    @param key_or_keys        A single key or several keys, tried in order
    @param default            Returned when no key yields a usable value
    @param skip_false_values  If true, falsy values are skipped as well;
                              `None` values are always skipped
    """
    for key in variadic(key_or_keys):
        candidate = d.get(key)
        if candidate is None:
            continue
        if candidate or not skip_false_values:
            return candidate
    return default
+RELEASE_GIT_HEAD = '42f2d40b475db66486a4b4fe5b56751a640db5db' VARIANT = None