From 7d18fed8f1983fe6de4ddc810dfb2761ba5744ac Mon Sep 17 00:00:00 2001
From: Simon Sawicki
Date: Mon, 3 Mar 2025 00:10:01 +0100
Subject: [PATCH 01/81] [networking] Add `keep_header_casing` extension
 (#11652)

Authored by: coletdjnz, Grub4K

Co-authored-by: coletdjnz
---
 test/test_networking.py          |  13 +++
 test/test_utils.py               |  23 +++--
 test/test_websockets.py          |  22 +++--
 yt_dlp/networking/_requests.py   |   8 +-
 yt_dlp/networking/_urllib.py     |   8 +-
 yt_dlp/networking/_websockets.py |   8 +-
 yt_dlp/networking/common.py      |  19 ++++
 yt_dlp/networking/impersonate.py |  22 ++++-
 yt_dlp/utils/networking.py       | 148 +++++++++++++++++++++++++++----
 9 files changed, 230 insertions(+), 41 deletions(-)

diff --git a/test/test_networking.py b/test/test_networking.py
index d96624af18..63914bc4ba 100644
--- a/test/test_networking.py
+++ b/test/test_networking.py
@@ -720,6 +720,15 @@ def test_allproxy(self, handler):
             rh, Request(
                 f'http://127.0.0.1:{self.http_port}/headers', proxies={'all': 'http://10.255.255.255'})).close()

+    @pytest.mark.skip_handlers_if(lambda _, handler: handler not in ['Requests', 'CurlCFFI'], 'handler does not support keep_header_casing')
+    def test_keep_header_casing(self, handler):
+        with handler() as rh:
+            res = validate_and_send(
+                rh, Request(
+                    f'http://127.0.0.1:{self.http_port}/headers', headers={'X-test-heaDer': 'test'}, extensions={'keep_header_casing': True})).read().decode()
+
+        assert 'X-test-heaDer: test' in res
+

@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True)
class TestClientCertificate:
@@ -1289,6 +1298,7 @@ class HTTPSupportedRH(ValidationRH):
        ({'legacy_ssl': False}, False),
        ({'legacy_ssl': True}, False),
        ({'legacy_ssl': 'notabool'}, AssertionError),
+        ({'keep_header_casing': True}, UnsupportedRequest),
    ]),
    ('Requests', 'http', [
        ({'cookiejar': 'notacookiejar'}, AssertionError),
@@ -1299,6 +1309,9 @@ class HTTPSupportedRH(ValidationRH):
        ({'legacy_ssl': False}, False),
        ({'legacy_ssl': True}, False),
        ({'legacy_ssl': 'notabool'}, AssertionError),
+        ({'keep_header_casing': False}, False),
+        ({'keep_header_casing': True}, False),
+        ({'keep_header_casing': 'notabool'}, AssertionError),
    ]),
    ('CurlCFFI', 'http', [
        ({'cookiejar': 'notacookiejar'}, AssertionError),
diff --git a/test/test_utils.py b/test/test_utils.py
index 8f81d0b1b7..65f28db363 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -3,19 +3,20 @@
 # Allow direct execution
 import os
 import sys
-import unittest
-import unittest.mock
-import warnings
-import datetime as dt

 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

 import contextlib
+import datetime as dt
 import io
 import itertools
 import json
+import pickle
 import subprocess
+import unittest
+import unittest.mock
+import warnings
 import xml.etree.ElementTree

 from yt_dlp.compat import (
@@ -2087,21 +2088,26 @@ def test_http_header_dict(self):
        headers = HTTPHeaderDict()
        headers['ytdl-test'] = b'0'
        self.assertEqual(list(headers.items()), [('Ytdl-Test', '0')])
+        self.assertEqual(list(headers.sensitive().items()), [('ytdl-test', '0')])
        headers['ytdl-test'] = 1
        self.assertEqual(list(headers.items()), [('Ytdl-Test', '1')])
+        self.assertEqual(list(headers.sensitive().items()), [('ytdl-test', '1')])
        headers['Ytdl-test'] = '2'
        self.assertEqual(list(headers.items()), [('Ytdl-Test', '2')])
+        self.assertEqual(list(headers.sensitive().items()), [('Ytdl-test', '2')])
        self.assertTrue('ytDl-Test' in headers)
        self.assertEqual(str(headers), str(dict(headers)))
        self.assertEqual(repr(headers), str(dict(headers)))
headers.update({'X-dlp': 'data'}) self.assertEqual(set(headers.items()), {('Ytdl-Test', '2'), ('X-Dlp', 'data')}) + self.assertEqual(set(headers.sensitive().items()), {('Ytdl-test', '2'), ('X-dlp', 'data')}) self.assertEqual(dict(headers), {'Ytdl-Test': '2', 'X-Dlp': 'data'}) self.assertEqual(len(headers), 2) self.assertEqual(headers.copy(), headers) - headers2 = HTTPHeaderDict({'X-dlp': 'data3'}, **headers, **{'X-dlp': 'data2'}) + headers2 = HTTPHeaderDict({'X-dlp': 'data3'}, headers, **{'X-dlP': 'data2'}) self.assertEqual(set(headers2.items()), {('Ytdl-Test', '2'), ('X-Dlp', 'data2')}) + self.assertEqual(set(headers2.sensitive().items()), {('Ytdl-test', '2'), ('X-dlP', 'data2')}) self.assertEqual(len(headers2), 2) headers2.clear() self.assertEqual(len(headers2), 0) @@ -2109,16 +2115,23 @@ def test_http_header_dict(self): # ensure we prefer latter headers headers3 = HTTPHeaderDict({'Ytdl-TeSt': 1}, {'Ytdl-test': 2}) self.assertEqual(set(headers3.items()), {('Ytdl-Test', '2')}) + self.assertEqual(set(headers3.sensitive().items()), {('Ytdl-test', '2')}) del headers3['ytdl-tesT'] self.assertEqual(dict(headers3), {}) headers4 = HTTPHeaderDict({'ytdl-test': 'data;'}) self.assertEqual(set(headers4.items()), {('Ytdl-Test', 'data;')}) + self.assertEqual(set(headers4.sensitive().items()), {('ytdl-test', 'data;')}) # common mistake: strip whitespace from values # https://github.com/yt-dlp/yt-dlp/issues/8729 headers5 = HTTPHeaderDict({'ytdl-test': ' data; '}) self.assertEqual(set(headers5.items()), {('Ytdl-Test', 'data;')}) + self.assertEqual(set(headers5.sensitive().items()), {('ytdl-test', 'data;')}) + + # test if picklable + headers6 = HTTPHeaderDict(a=1, b=2) + self.assertEqual(pickle.loads(pickle.dumps(headers6)), headers6) def test_extract_basic_auth(self): assert extract_basic_auth('http://:foo.bar') == ('http://:foo.bar', None) diff --git a/test/test_websockets.py b/test/test_websockets.py index 06112cc0b8..dead5fe5c5 100644 --- a/test/test_websockets.py +++ b/test/test_websockets.py @@ -44,7 +44,7 @@ def websocket_handler(websocket): return websocket.send('2') elif isinstance(message, str): if message == 'headers': - return websocket.send(json.dumps(dict(websocket.request.headers))) + return websocket.send(json.dumps(dict(websocket.request.headers.raw_items()))) elif message == 'path': return websocket.send(websocket.request.path) elif message == 'source_address': @@ -266,18 +266,18 @@ def test_cookies(self, handler): with handler(cookiejar=cookiejar) as rh: ws = ws_validate_and_send(rh, Request(self.ws_base_url)) ws.send('headers') - assert json.loads(ws.recv())['cookie'] == 'test=ytdlp' + assert HTTPHeaderDict(json.loads(ws.recv()))['cookie'] == 'test=ytdlp' ws.close() with handler() as rh: ws = ws_validate_and_send(rh, Request(self.ws_base_url)) ws.send('headers') - assert 'cookie' not in json.loads(ws.recv()) + assert 'cookie' not in HTTPHeaderDict(json.loads(ws.recv())) ws.close() ws = ws_validate_and_send(rh, Request(self.ws_base_url, extensions={'cookiejar': cookiejar})) ws.send('headers') - assert json.loads(ws.recv())['cookie'] == 'test=ytdlp' + assert HTTPHeaderDict(json.loads(ws.recv()))['cookie'] == 'test=ytdlp' ws.close() @pytest.mark.skip_handler('Websockets', 'Set-Cookie not supported by websockets') @@ -287,7 +287,7 @@ def test_cookie_sync_only_cookiejar(self, handler): ws_validate_and_send(rh, Request(f'{self.ws_base_url}/get_cookie', extensions={'cookiejar': YoutubeDLCookieJar()})) ws = ws_validate_and_send(rh, Request(self.ws_base_url, extensions={'cookiejar': 
YoutubeDLCookieJar()})) ws.send('headers') - assert 'cookie' not in json.loads(ws.recv()) + assert 'cookie' not in HTTPHeaderDict(json.loads(ws.recv())) ws.close() @pytest.mark.skip_handler('Websockets', 'Set-Cookie not supported by websockets') @@ -298,12 +298,12 @@ def test_cookie_sync_delete_cookie(self, handler): ws_validate_and_send(rh, Request(f'{self.ws_base_url}/get_cookie')) ws = ws_validate_and_send(rh, Request(self.ws_base_url)) ws.send('headers') - assert json.loads(ws.recv())['cookie'] == 'test=ytdlp' + assert HTTPHeaderDict(json.loads(ws.recv()))['cookie'] == 'test=ytdlp' ws.close() cookiejar.clear_session_cookies() ws = ws_validate_and_send(rh, Request(self.ws_base_url)) ws.send('headers') - assert 'cookie' not in json.loads(ws.recv()) + assert 'cookie' not in HTTPHeaderDict(json.loads(ws.recv())) ws.close() def test_source_address(self, handler): @@ -341,6 +341,14 @@ def test_request_headers(self, handler): assert headers['test3'] == 'test3' ws.close() + def test_keep_header_casing(self, handler): + with handler(headers=HTTPHeaderDict({'x-TeSt1': 'test'})) as rh: + ws = ws_validate_and_send(rh, Request(self.ws_base_url, headers={'x-TeSt2': 'test'}, extensions={'keep_header_casing': True})) + ws.send('headers') + headers = json.loads(ws.recv()) + assert 'x-TeSt1' in headers + assert 'x-TeSt2' in headers + @pytest.mark.parametrize('client_cert', ( {'client_certificate': os.path.join(MTLS_CERT_DIR, 'clientwithkey.crt')}, { diff --git a/yt_dlp/networking/_requests.py b/yt_dlp/networking/_requests.py index 7de95ab3bf..23775845d6 100644 --- a/yt_dlp/networking/_requests.py +++ b/yt_dlp/networking/_requests.py @@ -296,6 +296,7 @@ def _check_extensions(self, extensions): extensions.pop('cookiejar', None) extensions.pop('timeout', None) extensions.pop('legacy_ssl', None) + extensions.pop('keep_header_casing', None) def _create_instance(self, cookiejar, legacy_ssl_support=None): session = RequestsSession() @@ -312,11 +313,12 @@ def _create_instance(self, cookiejar, legacy_ssl_support=None): session.trust_env = False # no need, we already load proxies from env return session - def _send(self, request): - - headers = self._merge_headers(request.headers) + def _prepare_headers(self, _, headers): add_accept_encoding_header(headers, SUPPORTED_ENCODINGS) + def _send(self, request): + + headers = self._get_headers(request) max_redirects_exceeded = False session = self._get_instance( diff --git a/yt_dlp/networking/_urllib.py b/yt_dlp/networking/_urllib.py index 510bb2a691..a188b35f57 100644 --- a/yt_dlp/networking/_urllib.py +++ b/yt_dlp/networking/_urllib.py @@ -379,13 +379,15 @@ def _create_instance(self, proxies, cookiejar, legacy_ssl_support=None): opener.addheaders = [] return opener - def _send(self, request): - headers = self._merge_headers(request.headers) + def _prepare_headers(self, _, headers): add_accept_encoding_header(headers, SUPPORTED_ENCODINGS) + + def _send(self, request): + headers = self._get_headers(request) urllib_req = urllib.request.Request( url=request.url, data=request.data, - headers=dict(headers), + headers=headers, method=request.method, ) diff --git a/yt_dlp/networking/_websockets.py b/yt_dlp/networking/_websockets.py index ec55567dae..7e5ab46004 100644 --- a/yt_dlp/networking/_websockets.py +++ b/yt_dlp/networking/_websockets.py @@ -116,6 +116,7 @@ def _check_extensions(self, extensions): extensions.pop('timeout', None) extensions.pop('cookiejar', None) extensions.pop('legacy_ssl', None) + extensions.pop('keep_header_casing', None) def close(self): # Remove the 
logging handler that contains a reference to our logger @@ -123,15 +124,16 @@ def close(self): for name, handler in self.__logging_handlers.items(): logging.getLogger(name).removeHandler(handler) - def _send(self, request): - timeout = self._calculate_timeout(request) - headers = self._merge_headers(request.headers) + def _prepare_headers(self, request, headers): if 'cookie' not in headers: cookiejar = self._get_cookiejar(request) cookie_header = cookiejar.get_cookie_header(request.url) if cookie_header: headers['cookie'] = cookie_header + def _send(self, request): + timeout = self._calculate_timeout(request) + headers = self._get_headers(request) wsuri = parse_uri(request.url) create_conn_kwargs = { 'source_address': (self.source_address, 0) if self.source_address else None, diff --git a/yt_dlp/networking/common.py b/yt_dlp/networking/common.py index e8951c7e7d..ddceaa9a97 100644 --- a/yt_dlp/networking/common.py +++ b/yt_dlp/networking/common.py @@ -206,6 +206,7 @@ class RequestHandler(abc.ABC): - `cookiejar`: Cookiejar to use for this request. - `timeout`: socket timeout to use for this request. - `legacy_ssl`: Enable legacy SSL options for this request. See legacy_ssl_support. + - `keep_header_casing`: Keep the casing of headers when sending the request. To enable these, add extensions.pop('', None) to _check_extensions Apart from the url protocol, proxies dict may contain the following keys: @@ -259,6 +260,23 @@ def _make_sslcontext(self, legacy_ssl_support=None): def _merge_headers(self, request_headers): return HTTPHeaderDict(self.headers, request_headers) + def _prepare_headers(self, request: Request, headers: HTTPHeaderDict) -> None: # noqa: B027 + """Additional operations to prepare headers before building. To be extended by subclasses. + @param request: Request object + @param headers: Merged headers to prepare + """ + + def _get_headers(self, request: Request) -> dict[str, str]: + """ + Get headers for external use. + Subclasses may define a _prepare_headers method to modify headers after merge but before building. 
+ """ + headers = self._merge_headers(request.headers) + self._prepare_headers(request, headers) + if request.extensions.get('keep_header_casing'): + return headers.sensitive() + return dict(headers) + def _calculate_timeout(self, request): return float(request.extensions.get('timeout') or self.timeout) @@ -317,6 +335,7 @@ def _check_extensions(self, extensions): assert isinstance(extensions.get('cookiejar'), (YoutubeDLCookieJar, NoneType)) assert isinstance(extensions.get('timeout'), (float, int, NoneType)) assert isinstance(extensions.get('legacy_ssl'), (bool, NoneType)) + assert isinstance(extensions.get('keep_header_casing'), (bool, NoneType)) def _validate(self, request): self._check_url_scheme(request) diff --git a/yt_dlp/networking/impersonate.py b/yt_dlp/networking/impersonate.py index 0626b3b491..b90d10b760 100644 --- a/yt_dlp/networking/impersonate.py +++ b/yt_dlp/networking/impersonate.py @@ -5,11 +5,11 @@ from dataclasses import dataclass from typing import Any -from .common import RequestHandler, register_preference +from .common import RequestHandler, register_preference, Request from .exceptions import UnsupportedRequest from ..compat.types import NoneType from ..utils import classproperty, join_nonempty -from ..utils.networking import std_headers +from ..utils.networking import std_headers, HTTPHeaderDict @dataclass(order=True, frozen=True) @@ -123,7 +123,17 @@ def _get_request_target(self, request): """Get the requested target for the request""" return self._resolve_target(request.extensions.get('impersonate') or self.impersonate) - def _get_impersonate_headers(self, request): + def _prepare_impersonate_headers(self, request: Request, headers: HTTPHeaderDict) -> None: # noqa: B027 + """Additional operations to prepare headers before building. To be extended by subclasses. + @param request: Request object + @param headers: Merged headers to prepare + """ + + def _get_impersonate_headers(self, request: Request) -> dict[str, str]: + """ + Get headers for external impersonation use. + Subclasses may define a _prepare_impersonate_headers method to modify headers after merge but before building. + """ headers = self._merge_headers(request.headers) if self._get_request_target(request) is not None: # remove all headers present in std_headers @@ -131,7 +141,11 @@ def _get_impersonate_headers(self, request): for k, v in std_headers.items(): if headers.get(k) == v: headers.pop(k) - return headers + + self._prepare_impersonate_headers(request, headers) + if request.extensions.get('keep_header_casing'): + return headers.sensitive() + return dict(headers) @register_preference(ImpersonateRequestHandler) diff --git a/yt_dlp/utils/networking.py b/yt_dlp/utils/networking.py index 933b164be9..542abace87 100644 --- a/yt_dlp/utils/networking.py +++ b/yt_dlp/utils/networking.py @@ -1,9 +1,16 @@ +from __future__ import annotations + import collections +import collections.abc import random +import typing import urllib.parse import urllib.request -from ._utils import remove_start +if typing.TYPE_CHECKING: + T = typing.TypeVar('T') + +from ._utils import NO_DEFAULT, remove_start def random_user_agent(): @@ -51,32 +58,141 @@ def random_user_agent(): return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS) -class HTTPHeaderDict(collections.UserDict, dict): +class HTTPHeaderDict(dict): """ Store and access keys case-insensitively. The constructor can take multiple dicts, in which keys in the latter are prioritised. 
+
+    Retains a case sensitive mapping of the headers, which can be accessed via `.sensitive()`.
    """
+    def __new__(cls, *args: typing.Any, **kwargs: typing.Any) -> typing.Self:
+        obj = dict.__new__(cls, *args, **kwargs)
+        obj.__sensitive_map = {}
+        return obj

-    def __init__(self, *args, **kwargs):
+    def __init__(self, /, *args, **kwargs):
        super().__init__()
-        for dct in args:
-            if dct is not None:
-                self.update(dct)
-        self.update(kwargs)
+        self.__sensitive_map = {}

-    def __setitem__(self, key, value):
-        if isinstance(value, bytes):
-            value = value.decode('latin-1')
-        super().__setitem__(key.title(), str(value).strip())
+        for dct in filter(None, args):
+            self.update(dct)
+        if kwargs:
+            self.update(kwargs)

-    def __getitem__(self, key):
+    def sensitive(self, /) -> dict[str, str]:
+        return {
+            self.__sensitive_map[key]: value
+            for key, value in self.items()
+        }
+
+    def __contains__(self, key: str, /) -> bool:
+        return super().__contains__(key.title() if isinstance(key, str) else key)
+
+    def __delitem__(self, key: str, /) -> None:
+        key = key.title()
+        del self.__sensitive_map[key]
+        super().__delitem__(key)
+
+    def __getitem__(self, key, /) -> str:
        return super().__getitem__(key.title())

-    def __delitem__(self, key):
-        super().__delitem__(key.title())
+    def __ior__(self, other, /):
+        if isinstance(other, type(self)):
+            other = other.sensitive()
+        if isinstance(other, dict):
+            self.update(other)
+            return self
+        return NotImplemented

-    def __contains__(self, key):
-        return super().__contains__(key.title() if isinstance(key, str) else key)
+    def __or__(self, other, /) -> typing.Self:
+        if isinstance(other, type(self)):
+            other = other.sensitive()
+        if isinstance(other, dict):
+            return type(self)(self.sensitive(), other)
+        return NotImplemented
+
+    def __ror__(self, other, /) -> typing.Self:
+        if isinstance(other, type(self)):
+            other = other.sensitive()
+        if isinstance(other, dict):
+            return type(self)(other, self.sensitive())
+        return NotImplemented
+
+    def __setitem__(self, key: str, value, /) -> None:
+        if isinstance(value, bytes):
+            value = value.decode('latin-1')
+        key_title = key.title()
+        self.__sensitive_map[key_title] = key
+        super().__setitem__(key_title, str(value).strip())
+
+    def clear(self, /) -> None:
+        self.__sensitive_map.clear()
+        super().clear()
+
+    def copy(self, /) -> typing.Self:
+        return type(self)(self.sensitive())
+
+    @typing.overload
+    def get(self, key: str, /) -> str | None: ...
+
+    @typing.overload
+    def get(self, key: str, /, default: T) -> str | T: ...
+
+    def get(self, key, /, default=NO_DEFAULT):
+        key = key.title()
+        if default is NO_DEFAULT:
+            return super().get(key)
+        return super().get(key, default)
+
+    @typing.overload
+    def pop(self, key: str, /) -> str: ...
+
+    @typing.overload
+    def pop(self, key: str, /, default: T) -> str | T: ...
+
+    def pop(self, key, /, default=NO_DEFAULT):
+        key = key.title()
+        if default is NO_DEFAULT:
+            self.__sensitive_map.pop(key)
+            return super().pop(key)
+        self.__sensitive_map.pop(key, default)
+        return super().pop(key, default)
+
+    def popitem(self) -> tuple[str, str]:
+        self.__sensitive_map.popitem()
+        return super().popitem()
+
+    @typing.overload
+    def setdefault(self, key: str, /) -> str: ...
+
+    @typing.overload
+    def setdefault(self, key: str, /, default) -> str: ...
+
+    def setdefault(self, key, /, default=None) -> str:
+        key = key.title()
+        if key in self.__sensitive_map:
+            return super().__getitem__(key)
+
+        self[key] = default or ''
+        return self[key]
+
+    def update(self, other, /, **kwargs) -> None:
+        if isinstance(other, type(self)):
+            other = other.sensitive()
+        if isinstance(other, collections.abc.Mapping):
+            for key, value in other.items():
+                self[key] = value
+
+        elif hasattr(other, 'keys'):
+            for key in other.keys():  # noqa: SIM118
+                self[key] = other[key]
+
+        else:
+            for key, value in other:
+                self[key] = value
+
+        for key, value in kwargs.items():
+            self[key] = value

 std_headers = HTTPHeaderDict({
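What this extension changes in practice, sketched only from the code in the patch above (the request URL below is hypothetical): `HTTPHeaderDict` keeps title-cased keys for case-insensitive lookup, the new `.sensitive()` view returns each key in the casing it was last written with, and `keep_header_casing` asks a supporting handler to send that preserved form.

    from yt_dlp.networking import Request
    from yt_dlp.utils.networking import HTTPHeaderDict

    headers = HTTPHeaderDict({'X-test-heaDer': 'test'})
    assert dict(headers) == {'X-Test-Header': 'test'}        # normalized view
    assert headers.sensitive() == {'X-test-heaDer': 'test'}  # original casing

    # Handlers that pop 'keep_header_casing' in _check_extensions (Requests,
    # CurlCFFI and Websockets in this patch) build their outgoing headers from
    # headers.sensitive() instead of the title-cased dict:
    req = Request('http://127.0.0.1:8080/headers', headers=headers,
                  extensions={'keep_header_casing': True})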
From 172d5fcd778bf2605db7647ebc56b29ed18d24ac Mon Sep 17 00:00:00 2001
From: sepro
Date: Mon, 3 Mar 2025 22:55:03 +0100
Subject: [PATCH 02/81] [ie/MagellanTV] Fix extractor (#12505)

Closes #12498
Authored by: seproDev
---
 yt_dlp/extractor/magellantv.py | 46 +++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 18 deletions(-)

diff --git a/yt_dlp/extractor/magellantv.py b/yt_dlp/extractor/magellantv.py
index 6f2524ba22..e7ae709cfc 100644
--- a/yt_dlp/extractor/magellantv.py
+++ b/yt_dlp/extractor/magellantv.py
@@ -1,35 +1,36 @@
 from .common import InfoExtractor
-from ..utils import parse_age_limit, parse_duration, traverse_obj
+from ..utils import parse_age_limit, parse_duration, url_or_none
+from ..utils.traversal import traverse_obj

 class MagellanTVIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?magellantv\.com/(?:watch|video)/(?P<id>[\w-]+)'
    _TESTS = [{
-        'url': 'https://www.magellantv.com/watch/my-dads-on-death-row?type=v',
+        'url': 'https://www.magellantv.com/watch/incas-the-new-story?type=v',
        'info_dict': {
-            'id': 'my-dads-on-death-row',
+            'id': 'incas-the-new-story',
            'ext': 'mp4',
-            'title': 'My Dad\'s On Death Row',
-            'description': 'md5:33ba23b9f0651fc4537ed19b1d5b0d7a',
-            'duration': 3780.0,
+            'title': 'Incas: The New Story',
+            'description': 'md5:936c7f6d711c02dfb9db22a067b586fe',
            'age_limit': 14,
-            'tags': ['Justice', 'Reality', 'United States', 'True Crime'],
+            'duration': 3060.0,
+            'tags': ['Ancient History', 'Archaeology', 'Anthropology'],
        },
        'params': {'skip_download': 'm3u8'},
    }, {
-        'url': 'https://www.magellantv.com/video/james-bulger-the-new-revelations',
+        'url': 'https://www.magellantv.com/video/tortured-to-death-murdering-the-nanny',
        'info_dict': {
-            'id': 'james-bulger-the-new-revelations',
+            'id': 'tortured-to-death-murdering-the-nanny',
            'ext': 'mp4',
-            'title': 'James Bulger: The New Revelations',
-            'description': 'md5:7b97922038bad1d0fe8d0470d8a189f2',
+            'title': 'Tortured to Death: Murdering the Nanny',
+            'description': 'md5:d87033594fa218af2b1a8b49f52511e5',
+            'age_limit': 14,
            'duration': 2640.0,
-            'age_limit': 0,
-            'tags': ['Investigation', 'True Crime', 'Justice', 'Europe'],
+            'tags': ['True Crime', 'Murder'],
        },
        'params': {'skip_download': 'm3u8'},
    }, {
-        'url': 'https://www.magellantv.com/watch/celebration-nation',
+        'url': 'https://www.magellantv.com/watch/celebration-nation?type=s',
        'info_dict': {
            'id': 'celebration-nation',
            'ext': 'mp4',
@@ -43,10 +44,19 @@ class MagellanTVIE(InfoExtractor):
    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
-        data = traverse_obj(self._search_nextjs_data(webpage, video_id), (
-            'props', 'pageProps', 'reactContext',
-            (('video', 'detail'), ('series', 'currentEpisode')), {dict}), get_all=False)
-        formats, subtitles = self._extract_m3u8_formats_and_subtitles(data['jwpVideoUrl'], video_id)
+        context = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['reactContext']
+        data = traverse_obj(context, ((('video', 'detail'), ('series', 'currentEpisode')), {dict}, any))
+
+        formats, subtitles = [], {}
+        for m3u8_url in set(traverse_obj(data, ((('manifests', ..., 'hls'), 'jwp_video_url'), {url_or_none}))):
+            fmts, subs = self._extract_m3u8_formats_and_subtitles(
+                m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False)
+            formats.extend(fmts)
+            self._merge_subtitles(subs, target=subtitles)
+        if not formats and (error := traverse_obj(context, ('errorDetailPage', 'errorMessage', {str}))):
+            if 'available in your country' in error:
+                self.raise_geo_restricted(msg=error)
+            self.raise_no_formats(f'{self.IE_NAME} said: {error}', expected=True)

        return {
            'id': video_id,
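The rewrite above leans on `traverse_obj` branch paths. A minimal sketch of how the `((('video', 'detail'), ('series', 'currentEpisode')), {dict}, any)` path resolves, using made-up input data:

    from yt_dlp.utils.traversal import traverse_obj

    context = {'series': {'currentEpisode': {'jwp_video_url': 'https://example.com/master.m3u8'}}}
    # Each inner tuple is tried as an alternative path; {dict} keeps only dict
    # results, and `any` returns the first match instead of a list of branches.
    data = traverse_obj(context, ((('video', 'detail'), ('series', 'currentEpisode')), {dict}, any))
    assert data == {'jwp_video_url': 'https://example.com/master.m3u8'}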
From 42b7440963866e31ff84a5b89030d1c596fa2e6e Mon Sep 17 00:00:00 2001
From: fries1234
Date: Mon, 3 Mar 2025 14:25:30 -0800
Subject: [PATCH 03/81] [ie/tvw] Add extractor (#12271)

Authored by: fries1234
---
 yt_dlp/extractor/_extractors.py | 1 +
 yt_dlp/extractor/tvw.py | 117 ++++++++++++++++++++++++++++
 2 files changed, 118 insertions(+)
 create mode 100644 yt_dlp/extractor/tvw.py

diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index 403e1f1f65..3ab0f5efa2 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -2224,6 +2224,7 @@
    TVPlayIE,
 )
 from .tvplayer import TVPlayerIE
+from .tvw import TvwIE
 from .tweakers import TweakersIE
 from .twentymin import TwentyMinutenIE
 from .twentythreevideo import TwentyThreeVideoIE
diff --git a/yt_dlp/extractor/tvw.py b/yt_dlp/extractor/tvw.py
new file mode 100644
index 0000000000..1c060cd7a0
--- /dev/null
+++ b/yt_dlp/extractor/tvw.py
@@ -0,0 +1,117 @@
+import json
+
+from .common import InfoExtractor
+from ..utils import clean_html, remove_end, unified_timestamp, url_or_none
+from ..utils.traversal import traverse_obj
+
+
+class TvwIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?tvw\.org/video/(?P<id>[^/?#]+)'
+
+    _TESTS = [{
+        'url': 'https://tvw.org/video/billy-frank-jr-statue-maquette-unveiling-ceremony-2024011211/',
+        'md5': '9ceb94fe2bb7fd726f74f16356825703',
+        'info_dict': {
+            'id': '2024011211',
+            'ext': 'mp4',
+            'title': 'Billy Frank Jr. Statue Maquette Unveiling Ceremony',
+            'thumbnail': r're:^https?://.*\.(?:jpe?g|png)$',
+            'description': 'md5:58a8150017d985b4f377e11ee8f6f36e',
+            'timestamp': 1704902400,
+            'upload_date': '20240110',
+            'location': 'Legislative Building',
+            'display_id': 'billy-frank-jr-statue-maquette-unveiling-ceremony-2024011211',
+            'categories': ['General Interest'],
+        },
+    }, {
+        'url': 'https://tvw.org/video/ebeys-landing-state-park-2024081007/',
+        'md5': '71e87dae3deafd65d75ff3137b9a32fc',
+        'info_dict': {
+            'id': '2024081007',
+            'ext': 'mp4',
+            'title': 'Ebey\'s Landing State Park',
+            'thumbnail': r're:^https?://.*\.(?:jpe?g|png)$',
+            'description': 'md5:50c5bd73bde32fa6286a008dbc853386',
+            'timestamp': 1724310900,
+            'upload_date': '20240822',
+            'location': 'Ebey’s Landing State Park',
+            'display_id': 'ebeys-landing-state-park-2024081007',
+            'categories': ['Washington State Parks'],
+        },
+    }, {
+        'url': 'https://tvw.org/video/home-warranties-workgroup-2',
+        'md5': 'f678789bf94d07da89809f213cf37150',
+        'info_dict': {
+            'id': '1999121000',
+            'ext': 'mp4',
+            'title': 'Home Warranties Workgroup',
+            'thumbnail': r're:^https?://.*\.(?:jpe?g|png)$',
+            'description': 'md5:861396cc523c9641d0dce690bc5c35f3',
+            'timestamp': 946389600,
+            'upload_date': '19991228',
+            'display_id': 'home-warranties-workgroup-2',
+            'categories': ['Legislative'],
+        },
+    }, {
+        'url': 'https://tvw.org/video/washington-to-washington-a-new-space-race-2022041111/?eventID=2022041111',
+        'md5': '6f5551090b351aba10c0d08a881b4f30',
+        'info_dict': {
+            'id': '2022041111',
+            'ext': 'mp4',
+            'title': 'Washington to Washington - A New Space Race',
+            'thumbnail': r're:^https?://.*\.(?:jpe?g|png)$',
+            'description': 'md5:f65a24eec56107afbcebb3aa5cd26341',
+            'timestamp': 1650394800,
+            'upload_date': '20220419',
+            'location': 'Hayner Media Center',
+            'display_id': 'washington-to-washington-a-new-space-race-2022041111',
+            'categories': ['Washington to Washington', 'General Interest'],
+        },
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        client_id = self._html_search_meta('clientID', webpage, fatal=True)
+        video_id = self._html_search_meta('eventID', webpage, fatal=True)
+
+        video_data = self._download_json(
+            'https://api.v3.invintus.com/v2/Event/getDetailed', video_id,
+            headers={
+                'authorization': 'embedder',
+                'wsc-api-key': '7WhiEBzijpritypp8bqcU7pfU9uicDR',
+            },
+            data=json.dumps({
+                'clientID': client_id,
+                'eventID': video_id,
+                'showStreams': True,
+            }).encode())['data']
+
+        formats = []
+        subtitles = {}
+        for stream_url in traverse_obj(video_data, ('streamingURIs', ..., {url_or_none})):
+            fmts, subs = self._extract_m3u8_formats_and_subtitles(
+                stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False)
+            formats.extend(fmts)
+            self._merge_subtitles(subs, target=subtitles)
+        if caption_url := traverse_obj(video_data, ('captionPath', {url_or_none})):
+            subtitles.setdefault('en', []).append({'url': caption_url, 'ext': 'vtt'})
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'formats': formats,
+            'subtitles': subtitles,
+            'title': remove_end(self._og_search_title(webpage, default=None), ' - TVW'),
+            'description': self._og_search_description(webpage, default=None),
+            **traverse_obj(video_data, {
+                'title': ('title', {str}),
+                'description': ('description', {clean_html}),
+                'categories': ('categories', ..., {str}),
+                'thumbnail': ('videoThumbnail', {url_or_none}),
+                'timestamp': ('startDateTime', {unified_timestamp}),
+                'location': ('locationName', {str}),
+                'is_live': ('eventStatus', {lambda x: x == 'live'}),
+            }),
+        }

From 8eb9c1bf3b9908cca22ef043602aa24fb9f352c6 Mon Sep 17 00:00:00 2001
From: sepro
Date: Tue, 4 Mar 2025 00:46:18 +0100
Subject: [PATCH 04/81] [ie/RTP] Rework extractor (#11638)

Closes #4661, Closes #10393, Closes #11244
Authored by: seproDev, vallovic, red-acid, pferreir, somini

Co-authored-by: vallovic
Co-authored-by: red-acid <161967284+red-acid@users.noreply.github.com>
Co-authored-by: Pedro Ferreira
Co-authored-by: somini
---
 yt_dlp/extractor/rtp.py | 226 +++++++++++++++++++++++++++-------------
 1 file changed, 154 insertions(+), 72 deletions(-)

diff --git a/yt_dlp/extractor/rtp.py b/yt_dlp/extractor/rtp.py
index 26aec2e4cc..03e9859403 100644
--- a/yt_dlp/extractor/rtp.py
+++ b/yt_dlp/extractor/rtp.py
@@ -3,12 +3,20 @@
 import re
 import urllib.parse

-from .common import InfoExtractor
-from ..utils import js_to_json
+from .common import InfoExtractor, Request
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    js_to_json,
+    parse_duration,
+    parse_iso8601,
+    url_or_none,
+)
+from ..utils.traversal import traverse_obj

 class RTPIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/(?:(?:estudoemcasa|palco|zigzag)/)?p(?P<program_id>[0-9]+)/(?P<id>[^/?#]+)'
+    _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/(?:[^/#?]+/)?p(?P<program_id>\d+)/(?P<id>e\d+)'
    _TESTS = [{
        'url': 'http://www.rtp.pt/play/p405/e174042/paixoes-cruzadas',
        'md5': 'e736ce0c665e459ddb818546220b4ef8',
        'info_dict': {
            'id': 'e174042',
            'ext': 'mp3',
            'title': 'Paixões Cruzadas',
-            'description': 'As paixões musicais de António Cartaxo e António Macedo',
+            'description': 'md5:af979e58ba0ab73f78435fc943fdb070',
            'thumbnail': r're:^https?://.*\.jpg',
+            'series': 'Paixões Cruzadas',
+            'duration': 2950.0,
+            'modified_timestamp': 1553693464,
+            'modified_date': '20190327',
+            'timestamp': 1417219200,
+            'upload_date': '20141129',
        },
    }, {
        'url': 'https://www.rtp.pt/play/zigzag/p13166/e757904/25-curiosidades-25-de-abril',
-        'md5': '9a81ed53f2b2197cfa7ed455b12f8ade',
+        'md5': '5b4859940e3adef61247a77dfb76046a',
        'info_dict': {
            'id': 'e757904',
            'ext': 'mp4',
-            'title': '25 Curiosidades, 25 de Abril',
-            'description': 'Estudar ou não estudar - Em cada um dos episódios descobrimos uma curiosidade acerca de como era viver em Portugal antes da revolução do 25 de abr',
+            'title': 'Estudar ou não estudar',
+            'description': 'md5:3bfd7eb8bebfd5711a08df69c9c14c35',
            'thumbnail': r're:^https?://.*\.jpg',
+            'timestamp': 1711958401,
+            'duration': 146.0,
+            'upload_date': '20240401',
+            'modified_timestamp': 1712242991,
+            'series': '25 Curiosidades, 25 de Abril',
+            'episode_number': 2,
+            'episode': 'Estudar ou não estudar',
+            'modified_date': '20240404',
        },
    }, {
-        'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas',
-        'only_matching': True,
-    }, {
-        'url': 'https://www.rtp.pt/play/estudoemcasa/p7776/portugues-1-ano',
-        'only_matching': True,
-    }, {
-        'url': 'https://www.rtp.pt/play/palco/p13785/l7nnon',
-        'only_matching': True,
+        # Episode not accessible through API
+        'url': 'https://www.rtp.pt/play/estudoemcasa/p7776/e500050/portugues-1-ano',
+        'md5': '57660c0b46db9f22118c52cbd65975e4',
+        'info_dict': {
+            'id': 'e500050',
+            'ext': 'mp4',
+            'title': 'Português - 1.º ano',
+            'duration': 1669.0,
+            'description': 'md5:be68925c81269f8c6886589f25fe83ea',
+            'upload_date': '20201020',
+            'timestamp': 1603180799,
+            'thumbnail': 'https://cdn-images.rtp.pt/EPG/imagens/39482_59449_64850.png?v=3&w=860',
+        },
    }]

+    _USER_AGENT = 'rtpplay/2.0.66 (pt.rtp.rtpplay; build:2066;
iOS 15.8.3) Alamofire/5.9.1' + _AUTH_TOKEN = None + + def _fetch_auth_token(self): + if self._AUTH_TOKEN: + return self._AUTH_TOKEN + self._AUTH_TOKEN = traverse_obj(self._download_json(Request( + 'https://rtpplayapi.rtp.pt/play/api/2/token-manager', + headers={ + 'Accept': '*/*', + 'rtp-play-auth': 'RTPPLAY_MOBILE_IOS', + 'rtp-play-auth-hash': 'fac9c328b2f27e26e03d7f8942d66c05b3e59371e16c2a079f5c83cc801bd3ee', + 'rtp-play-auth-timestamp': '2145973229682', + 'User-Agent': self._USER_AGENT, + }, extensions={'keep_header_casing': True}), None, + note='Fetching guest auth token', errnote='Could not fetch guest auth token', + fatal=False), ('token', 'token', {str})) + return self._AUTH_TOKEN + + @staticmethod + def _cleanup_media_url(url): + if urllib.parse.urlparse(url).netloc == 'streaming-ondemand.rtp.pt': + return None + return url.replace('/drm-fps/', '/hls/').replace('/drm-dash/', '/dash/') + + def _extract_formats(self, media_urls, episode_id): + formats = [] + subtitles = {} + for media_url in set(traverse_obj(media_urls, (..., {url_or_none}, {self._cleanup_media_url}))): + ext = determine_ext(media_url) + if ext == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + media_url, episode_id, m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + elif ext == 'mpd': + fmts, subs = self._extract_mpd_formats_and_subtitles( + media_url, episode_id, mpd_id='dash', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + formats.append({ + 'url': media_url, + 'format_id': 'http', + }) + return formats, subtitles + + def _extract_from_api(self, program_id, episode_id): + auth_token = self._fetch_auth_token() + if not auth_token: + return + episode_data = traverse_obj(self._download_json( + f'https://www.rtp.pt/play/api/1/get-episode/{program_id}/{episode_id[1:]}', episode_id, + query={'include_assets': 'true', 'include_webparams': 'true'}, + headers={ + 'Accept': '*/*', + 'Authorization': f'Bearer {auth_token}', + 'User-Agent': self._USER_AGENT, + }, fatal=False), 'result', {dict}) + if not episode_data: + return + asset_urls = traverse_obj(episode_data, ('assets', 0, 'asset_url', {dict})) + media_urls = traverse_obj(asset_urls, ( + ((('hls', 'dash'), 'stream_url'), ('multibitrate', ('url_hls', 'url_dash'))),)) + formats, subtitles = self._extract_formats(media_urls, episode_id) + + for sub_data in traverse_obj(asset_urls, ('subtitles', 'vtt_list', lambda _, v: url_or_none(v['file']))): + subtitles.setdefault(sub_data.get('code') or 'pt', []).append({ + 'url': sub_data['file'], + 'name': sub_data.get('language'), + }) + + return { + 'id': episode_id, + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': traverse_obj(episode_data, ('assets', 0, 'asset_thumbnail', {url_or_none})), + **traverse_obj(episode_data, ('episode', { + 'title': (('episode_title', 'program_title'), {str}, filter, any), + 'alt_title': ('episode_subtitle', {str}, filter), + 'description': (('episode_description', 'episode_summary'), {str}, filter, any), + 'timestamp': ('episode_air_date', {parse_iso8601(delimiter=' ')}), + 'modified_timestamp': ('episode_lastchanged', {parse_iso8601(delimiter=' ')}), + 'duration': ('episode_duration_complete', {parse_duration}), + 'episode': ('episode_title', {str}, filter), + 'episode_number': ('episode_number', {int_or_none}), + 'season': ('program_season', {str}, filter), + 'series': ('program_title', {str}, filter), + })), + } + _RX_OBFUSCATION = re.compile(r'''(?xs) 
atob\s*\(\s*decodeURIComponent\s*\(\s*
 (\[[0-9A-Za-z%,'"]*\])
 \s*\.\s*join\(\s*(?:""|'')\s*\)\s*\)\s*\)
 ''')

-    def __unobfuscate(self, data, *, video_id):
-        if data.startswith('{'):
-            data = self._RX_OBFUSCATION.sub(
-                lambda m: json.dumps(
-                    base64.b64decode(urllib.parse.unquote(
-                        ''.join(self._parse_json(m.group(1), video_id)),
-                    )).decode('iso-8859-1')),
-                data)
-        return js_to_json(data)
+    def __unobfuscate(self, data):
+        return self._RX_OBFUSCATION.sub(
+            lambda m: json.dumps(
+                base64.b64decode(urllib.parse.unquote(
+                    ''.join(json.loads(m.group(1))),
+                )).decode('iso-8859-1')),
+            data)

-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, video_id)
-        title = self._html_search_meta(
-            'twitter:title', webpage, display_name='title', fatal=True)
-
-        f, config = self._search_regex(
-            r'''(?sx)
-                (?:var\s+f\s*=\s*(?P<f>".*?"|{[^;]+?});\s*)?
-                var\s+player1\s+=\s+new\s+RTPPlayer\s*\((?P<config>{(?:(?!\*/).)+?})\);(?!\s*\*/)
-            ''', webpage,
-            'player config', group=('f', 'config'))
-
-        config = self._parse_json(
-            config, video_id,
-            lambda data: self.__unobfuscate(data, video_id=video_id))
-        f = config['file'] if not f else self._parse_json(
-            f, video_id,
-            lambda data: self.__unobfuscate(data, video_id=video_id))
+    def _extract_from_html(self, url, episode_id):
+        webpage = self._download_webpage(url, episode_id)

        formats = []
-        if isinstance(f, dict):
-            f_hls = f.get('hls')
-            if f_hls is not None:
-                formats.extend(self._extract_m3u8_formats(
-                    f_hls, video_id, 'mp4', 'm3u8_native', m3u8_id='hls'))
-
-            f_dash = f.get('dash')
-            if f_dash is not None:
-                formats.extend(self._extract_mpd_formats(f_dash, video_id, mpd_id='dash'))
-        else:
-            formats.append({
-                'format_id': 'f',
-                'url': f,
-                'vcodec': 'none' if config.get('mediaType') == 'audio' else None,
-            })
        subtitles = {}
-
-        vtt = config.get('vtt')
-        if vtt is not None:
-            for lcode, lname, url in vtt:
-                subtitles.setdefault(lcode, []).append({
-                    'name': lname,
-                    'url': url,
-                })
+        media_urls = traverse_obj(re.findall(r'(?:var\s+f\s*=|RTPPlayer\({[^}]+file:)\s*({[^}]+}|"[^"]+")', webpage), (
+            -1, (({self.__unobfuscate}, {js_to_json}, {json.loads}, {dict.values}, ...), {json.loads})))
+        formats, subtitles = self._extract_formats(media_urls, episode_id)

        return {
-            'id': video_id,
-            'title': title,
+            'id': episode_id,
            'formats': formats,
-            'description': self._html_search_meta(['description', 'twitter:description'], webpage),
-            'thumbnail': config.get('poster') or self._og_search_thumbnail(webpage),
            'subtitles': subtitles,
+            'description': self._html_search_meta(['og:description', 'twitter:description'], webpage, default=None),
+            'thumbnail': self._html_search_meta(['og:image', 'twitter:image'], webpage, default=None),
+            **self._search_json_ld(webpage, episode_id, default={}),
+            'title': self._html_search_meta(['og:title', 'twitter:title'], webpage, default=None),
        }
+
+    def _real_extract(self, url):
+        program_id, episode_id = self._match_valid_url(url).group('program_id', 'id')
+        return self._extract_from_api(program_id, episode_id) or self._extract_from_html(url, episode_id)
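For reference, `__unobfuscate` reverses the page's `atob(decodeURIComponent([...].join("")))` wrapping: the captured array holds percent-encodable base64 chunks that are joined, URI-decoded and base64-decoded into latin-1 text. A standalone sketch of that decode step, with a made-up payload:

    import base64
    import urllib.parse

    # hypothetical chunk array as captured from the page source
    chunks = ['aHR0cHM6Ly9l', 'eGFtcGxlLmNvbS9tYW5pZmVzdC5tM3U4']
    decoded = base64.b64decode(urllib.parse.unquote(''.join(chunks))).decode('iso-8859-1')
    assert decoded == 'https://example.com/manifest.m3u8'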
From 9d70abe4de401175cbbaaa36017806f16b2df9af Mon Sep 17 00:00:00 2001
From: u-spec-png <54671367+u-spec-png@users.noreply.github.com>
Date: Tue, 4 Mar 2025 01:51:23 +0100
Subject: [PATCH 05/81] [ie/N1] Fix extraction of newer articles (#12514)

Authored by: u-spec-png
---
 yt_dlp/extractor/n1.py | 56 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 49 insertions(+), 7 deletions(-)

diff --git a/yt_dlp/extractor/n1.py b/yt_dlp/extractor/n1.py
index bbb327e750..e0e49161bd 100644
--- a/yt_dlp/extractor/n1.py
+++ b/yt_dlp/extractor/n1.py
@@ -4,7 +4,9 @@
 from ..utils import (
    extract_attributes,
    unified_timestamp,
+    url_or_none,
 )
+from ..utils.traversal import traverse_obj

 class N1InfoAssetIE(InfoExtractor):
@@ -35,9 +37,9 @@ class N1InfoIIE(InfoExtractor):
    IE_NAME = 'N1Info:article'
    _VALID_URL = r'https?://(?:(?:\w+\.)?n1info\.\w+|nova\.rs)/(?:[^/?#]+/){1,2}(?P<id>[^/?#]+)'
    _TESTS = [{
-        # Youtube embedded
+        # YouTube embedded
        'url': 'https://rs.n1info.com/sport-klub/tenis/kako-je-djokovic-propustio-istorijsku-priliku-video/',
-        'md5': '01ddb6646d0fd9c4c7d990aa77fe1c5a',
+        'md5': '987ce6fd72acfecc453281e066b87973',
        'info_dict': {
            'id': 'L5Hd4hQVUpk',
            'ext': 'mp4',
            'title': 'Ozmo i USO21, ep. 13: Novak Đoković – Danil Medvedev | Ključevi Poraza, Budućnost | SPORT KLUB TENIS',
            'description': 'md5:467f330af1effedd2e290f10dc31bb8e',
            'uploader': 'Sport Klub',
-            'uploader_id': 'sportklub',
+            'uploader_id': '@sportklub',
+            'uploader_url': 'https://www.youtube.com/@sportklub',
+            'channel': 'Sport Klub',
+            'channel_id': 'UChpzBje9Ro6CComXe3BgNaw',
+            'channel_url': 'https://www.youtube.com/channel/UChpzBje9Ro6CComXe3BgNaw',
+            'channel_is_verified': True,
+            'channel_follower_count': int,
+            'comment_count': int,
+            'view_count': int,
+            'like_count': int,
+            'age_limit': 0,
+            'duration': 1049,
+            'thumbnail': 'https://i.ytimg.com/vi/L5Hd4hQVUpk/maxresdefault.jpg',
+            'chapters': 'count:9',
+            'categories': ['Sports'],
+            'tags': 'count:10',
+            'timestamp': 1631522787,
+            'playable_in_embed': True,
+            'availability': 'public',
+            'live_status': 'not_live',
        },
    }, {
        'url': 'https://rs.n1info.com/vesti/djilas-los-plan-za-metro-nece-resiti-nijedan-saobracajni-problem/',
        'info_dict': {
            'title': 'Đilas: Predlog izgradnje metroa besmislen; SNS odbacuje navode',
            'upload_date': '20210924',
            'timestamp': 1632481347,
+            'thumbnail': 'http://n1info.rs/wp-content/themes/ucnewsportal-n1/dist/assets/images/placeholder-image-video.jpg',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'info_dict': {
            'title': 'Zadnji dnevi na kopališču Ilirija: “Ilirija ni umrla, ubili so jo”',
            'timestamp': 1632567630,
            'upload_date': '20210925',
+            'thumbnail': 'https://n1info.si/wp-content/uploads/2021/09/06/1630945843-tomaz3.png',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'info_dict': {
            'upload_date': '20210924',
            'timestamp': 1632448649.0,
            'uploader': 'YouLotWhatDontStop',
+            'display_id': 'pu9wbx',
+            'channel_id': 'serbia',
+            'comment_count': int,
+            'like_count': int,
+            'dislike_count': int,
+            'age_limit': 0,
+            'duration': 134,
+            'thumbnail': 'https://external-preview.redd.it/5nmmawSeGx60miQM3Iq-ueC9oyCLTLjjqX-qqY8uRsc.png?format=pjpg&auto=webp&s=2f973400b04d23f871b608b178e47fc01f9b8f1d',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'info_dict': {
            'title': 'Žaklina Tatalović Ani Brnabić: Pričate laži (VIDEO)',
            'upload_date': '20211102',
            'timestamp': 1635861677,
+            'thumbnail': 'https://nova.rs/wp-content/uploads/2021/11/02/1635860298-TNJG_Ana_Brnabic_i_Zaklina_Tatalovic_100_dana_Vlade_GP.jpg',
        },
    }, {
        'url': 'https://n1info.rs/vesti/cuta-biti-u-kosovskoj-mitrovici-znaci-da-te-docekaju-eksplozivnim-napravama/',
        'info_dict': {
            'id': '1332368',
            'ext': 'mp4',
            'title': 'Ćuta: Biti u Kosovskoj Mitrovici znači da te dočekaju eksplozivnim napravama',
            'upload_date': '20230620',
            'timestamp': 1687290536,
            'thumbnail': 'https://cdn.brid.tv/live/partners/26827/snapshot/1332368_th_6492013a8356f_1687290170.jpg',
        },
+    }, {
+        'url': 'https://n1info.rs/vesti/vuciceva-turneja-po-srbiji-najavljuje-kontrarevoluciju-preti-svom-narodu-vredja-novinare/',
+        'info_dict': {
+            'id': '2025974',
+            'ext': 'mp4',
+            'title': 'Vučićeva turneja po Srbiji: Najavljuje kontrarevoluciju, preti svom narodu, vređa novinare',
+            'thumbnail': 'https://cdn-uc.brid.tv/live/partners/26827/snapshot/2025974_fhd_67c4a23280a81_1740939826.jpg',
+            'timestamp': 1740939936,
+            'upload_date': '20250302',
+        },
    }, {
        'url': 'https://hr.n1info.com/vijesti/pravobraniteljica-o-ubojstvu-u-zagrebu-radi-se-o-doista-nezapamcenoj-situaciji/',
        'only_matching': True,
@@ -115,11 +157,11 @@ def _real_extract(self, url):
        title = self._html_search_regex(r'<h1[^>]+>(.+?)</h1>', webpage, 'title')
        timestamp = unified_timestamp(self._html_search_meta('article:published_time', webpage))
-        plugin_data = self._html_search_meta('BridPlugin', webpage)
+        plugin_data = re.findall(r'\$bp\("(?:Brid|TargetVideo)_\d+",\s(.+)\);', webpage)
        entries = []
        if plugin_data:
            site_id = self._html_search_regex(r'site:(\d+)', webpage, 'site id')
-            for video_data in re.findall(r'\$bp\("Brid_\d+", (.+)\);', webpage):
+            for video_data in plugin_data:
                video_id = self._parse_json(video_data, title)['video']
                entries.append({
                    'id': video_id,
@@ -140,7 +182,7 @@ def _real_extract(self, url):
                    'url': video_data.get('data-url'),
                    'id': video_data.get('id'),
                    'title': title,
-                    'thumbnail': video_data.get('data-thumbnail'),
+                    'thumbnail': traverse_obj(video_data, (('data-thumbnail', 'data-default_thumbnail'), {url_or_none}, any)),
                    'timestamp': timestamp,
                    'ie_key': 'N1InfoAsset',
                })
@@ -152,7 +194,7 @@ def _real_extract(self, url):
            if url.startswith('https://www.youtube.com'):
                entries.append(self.url_result(url, ie='Youtube'))
            elif url.startswith('https://www.redditmedia.com'):
-                entries.append(self.url_result(url, ie='RedditR'))
+                entries.append(self.url_result(url, ie='Reddit'))

        return {
            '_type': 'playlist',

From b8b47547049f5ebc3dd680fc7de70ed0ca9c0d70 Mon Sep 17 00:00:00 2001
From: bashonly <88596187+bashonly@users.noreply.github.com>
Date: Wed, 5 Mar 2025 00:22:52 -0600
Subject: [PATCH 06/81] [ie/twitter] Fix syndication token generation (#12537)

Fix 14cd7f3443c6da4d49edaefcc12da9dee86e243e

Authored by: bashonly
---
 yt_dlp/extractor/twitter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py
index d32ae3f18f..f3695cc061 100644
--- a/yt_dlp/extractor/twitter.py
+++ b/yt_dlp/extractor/twitter.py
@@ -1334,7 +1334,7 @@ def _build_graphql_query(self, media_id):
    def _generate_syndication_token(self, twid):
        # ((Number(twid) / 1e15) * Math.PI).toString(36).replace(/(0+|\.)/g, '')
        translation = str.maketrans(dict.fromkeys('0.'))
-        return js_number_to_string((int(twid) / 1e15) * math.PI, 36).translate(translation)
+        return js_number_to_string((int(twid) / 1e15) * math.pi, 36).translate(translation)

    def _call_syndication_api(self, twid):
        self.report_warning(

From bd0a66816934de70312eea1e71c59c13b401dc3a Mon Sep 17 00:00:00 2001
From: bashonly <88596187+bashonly@users.noreply.github.com>
Date: Wed, 5 Mar 2025 00:38:23 -0600
Subject: [PATCH 07/81] [ie/pinterest] Fix extractor (#12538)

Closes #12529
Authored by: mikf

Co-authored-by: Mike Fährmann
---
 yt_dlp/extractor/pinterest.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/yt_dlp/extractor/pinterest.py b/yt_dlp/extractor/pinterest.py
index f0b38893b2..b2fe7494b3 100644
--- a/yt_dlp/extractor/pinterest.py
+++ b/yt_dlp/extractor/pinterest.py
@@ -23,9 +23,9
@@ class PinterestBaseIE(InfoExtractor): def _call_api(self, resource, video_id, options): return self._download_json( f'https://www.pinterest.com/resource/{resource}Resource/get/', - video_id, f'Download {resource} JSON metadata', query={ - 'data': json.dumps({'options': options}), - })['resource_response'] + video_id, f'Download {resource} JSON metadata', + query={'data': json.dumps({'options': options})}, + headers={'X-Pinterest-PWS-Handler': 'www/[username].js'})['resource_response'] def _extract_video(self, data, extract_formats=True): video_id = data['id'] From 05c8023a27dd37c49163c0498bf98e3e3c1cb4b9 Mon Sep 17 00:00:00 2001 From: sepro Date: Fri, 7 Mar 2025 22:14:38 +0100 Subject: [PATCH 08/81] [ie/vk] Improve metadata extraction (#12510) Closes #12509 Authored by: seproDev --- yt_dlp/extractor/vk.py | 51 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 4b36e41ffb..faf3e60b0b 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -116,6 +116,7 @@ class VKIE(VKBaseIE): 'id': '-77521_162222515', 'ext': 'mp4', 'title': 'ProtivoGunz - Хуёвая песня', + 'description': 'Видео из официальной группы Noize MC\nhttp://vk.com/noizemc', 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*', 'uploader_id': '39545378', 'duration': 195, @@ -165,6 +166,7 @@ class VKIE(VKBaseIE): 'id': '-93049196_456239755', 'ext': 'mp4', 'title': '8 серия (озвучка)', + 'description': 'Видео из официальной группы Noize MC\nhttp://vk.com/noizemc', 'duration': 8383, 'comment_count': int, 'uploader': 'Dizi2021', @@ -240,6 +242,7 @@ class VKIE(VKBaseIE): 'upload_date': '20221005', 'uploader': 'Шальная Императрица', 'uploader_id': '-74006511', + 'description': 'md5:f9315f7786fa0e84e75e4f824a48b056', }, }, { @@ -278,6 +281,25 @@ class VKIE(VKBaseIE): }, 'skip': 'No formats found', }, + { + 'note': 'video has chapters', + 'url': 'https://vkvideo.ru/video-18403220_456239696', + 'info_dict': { + 'id': '-18403220_456239696', + 'ext': 'mp4', + 'title': 'Трамп отменяет гранты // DeepSeek - Революция в ИИ // Илон Маск читер', + 'description': 'md5:b112ea9de53683b6d03d29076f62eec2', + 'uploader': 'Руслан Усачев', + 'uploader_id': '-18403220', + 'comment_count': int, + 'like_count': int, + 'duration': 1983, + 'thumbnail': r're:https?://.+\.jpg', + 'chapters': 'count:21', + 'timestamp': 1738252883, + 'upload_date': '20250130', + }, + }, { # live stream, hls and rtmp links, most likely already finished live # stream by the time you are reading this comment @@ -449,7 +471,6 @@ def _real_extract(self, url): return self.url_result(opts_url) data = player['params'][0] - title = unescapeHTML(data['md_title']) # 2 = live # 3 = post live (finished live) @@ -507,17 +528,29 @@ def _real_extract(self, url): return { 'id': video_id, 'formats': formats, - 'title': title, - 'thumbnail': data.get('jpg'), - 'uploader': data.get('md_author'), - 'uploader_id': str_or_none(data.get('author_id') or mv_data.get('authorId')), - 'duration': int_or_none(data.get('duration') or mv_data.get('duration')), + 'subtitles': subtitles, + **traverse_obj(mv_data, { + 'title': ('title', {unescapeHTML}), + 'description': ('desc', {clean_html}, filter), + 'duration': ('duration', {int_or_none}), + 'like_count': ('likes', {int_or_none}), + 'comment_count': ('commcount', {int_or_none}), + }), + **traverse_obj(data, { + 'title': ('md_title', {unescapeHTML}), + 'description': ('description', {clean_html}, filter), + 'thumbnail': ('jpg', {url_or_none}), 
+ 'uploader': ('md_author', {str}), + 'uploader_id': (('author_id', 'authorId'), {str_or_none}, any), + 'duration': ('duration', {int_or_none}), + 'chapters': ('time_codes', lambda _, v: isinstance(v['time'], int), { + 'title': ('text', {str}), + 'start_time': 'time', + }), + }), 'timestamp': timestamp, 'view_count': view_count, - 'like_count': int_or_none(mv_data.get('likes')), - 'comment_count': int_or_none(mv_data.get('commcount')), 'is_live': is_live, - 'subtitles': subtitles, '_format_sort_fields': ('res', 'source'), } From 4432a9390c79253ac830702b226d2e558b636725 Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Thu, 13 Mar 2025 17:37:33 +1300 Subject: [PATCH 09/81] [ie/youtube] Split into package (#12557) Authored by: coletdjnz --- yt_dlp/extractor/youtube/__init__.py | 50 + yt_dlp/extractor/youtube/_base.py | 1145 +++++ yt_dlp/extractor/youtube/_clip.py | 66 + yt_dlp/extractor/youtube/_mistakes.py | 69 + yt_dlp/extractor/youtube/_notifications.py | 98 + yt_dlp/extractor/youtube/_redirect.py | 247 + yt_dlp/extractor/youtube/_search.py | 167 + yt_dlp/extractor/youtube/_tab.py | 2348 ++++++++++ .../{youtube.py => youtube/_video.py} | 4078 +---------------- 9 files changed, 4203 insertions(+), 4065 deletions(-) create mode 100644 yt_dlp/extractor/youtube/__init__.py create mode 100644 yt_dlp/extractor/youtube/_base.py create mode 100644 yt_dlp/extractor/youtube/_clip.py create mode 100644 yt_dlp/extractor/youtube/_mistakes.py create mode 100644 yt_dlp/extractor/youtube/_notifications.py create mode 100644 yt_dlp/extractor/youtube/_redirect.py create mode 100644 yt_dlp/extractor/youtube/_search.py create mode 100644 yt_dlp/extractor/youtube/_tab.py rename yt_dlp/extractor/{youtube.py => youtube/_video.py} (51%) diff --git a/yt_dlp/extractor/youtube/__init__.py b/yt_dlp/extractor/youtube/__init__.py new file mode 100644 index 0000000000..892d860b0f --- /dev/null +++ b/yt_dlp/extractor/youtube/__init__.py @@ -0,0 +1,50 @@ +# flake8: noqa: F401 +from ._base import YoutubeBaseInfoExtractor +from ._clip import YoutubeClipIE +from ._mistakes import YoutubeTruncatedIDIE, YoutubeTruncatedURLIE +from ._notifications import YoutubeNotificationsIE +from ._redirect import ( + YoutubeConsentRedirectIE, + YoutubeFavouritesIE, + YoutubeFeedsInfoExtractor, + YoutubeHistoryIE, + YoutubeLivestreamEmbedIE, + YoutubeRecommendedIE, + YoutubeShortsAudioPivotIE, + YoutubeSubscriptionsIE, + YoutubeWatchLaterIE, + YoutubeYtBeIE, + YoutubeYtUserIE, +) +from ._search import YoutubeMusicSearchURLIE, YoutubeSearchDateIE, YoutubeSearchIE, YoutubeSearchURLIE +from ._tab import YoutubePlaylistIE, YoutubeTabBaseInfoExtractor, YoutubeTabIE +from ._video import YoutubeIE + +# Hack to allow plugin overrides work +for _cls in [ + YoutubeBaseInfoExtractor, + YoutubeClipIE, + YoutubeTruncatedIDIE, + YoutubeTruncatedURLIE, + YoutubeNotificationsIE, + YoutubeConsentRedirectIE, + YoutubeFavouritesIE, + YoutubeFeedsInfoExtractor, + YoutubeHistoryIE, + YoutubeLivestreamEmbedIE, + YoutubeRecommendedIE, + YoutubeShortsAudioPivotIE, + YoutubeSubscriptionsIE, + YoutubeWatchLaterIE, + YoutubeYtBeIE, + YoutubeYtUserIE, + YoutubeMusicSearchURLIE, + YoutubeSearchDateIE, + YoutubeSearchIE, + YoutubeSearchURLIE, + YoutubePlaylistIE, + YoutubeTabBaseInfoExtractor, + YoutubeTabIE, + YoutubeIE, +]: + _cls.__module__ = 'yt_dlp.extractor.youtube' diff --git a/yt_dlp/extractor/youtube/_base.py b/yt_dlp/extractor/youtube/_base.py new file mode 100644 index 0000000000..ba28189a69 --- /dev/null +++ b/yt_dlp/extractor/youtube/_base.py @@ -0,0 +1,1145 
@@ +import calendar +import copy +import datetime as dt +import enum +import functools +import hashlib +import json +import re +import time +import urllib.parse + +from ..common import InfoExtractor +from ...networking.exceptions import HTTPError, network_exceptions +from ...utils import ( + ExtractorError, + bug_reports_message, + datetime_from_str, + filter_dict, + get_first, + int_or_none, + is_html, + join_nonempty, + parse_count, + qualities, + str_to_int, + traverse_obj, + try_call, + try_get, + unified_timestamp, + url_or_none, + variadic, +) + + +class _PoTokenContext(enum.Enum): + PLAYER = 'player' + GVS = 'gvs' + + +# any clients starting with _ cannot be explicitly requested by the user +INNERTUBE_CLIENTS = { + 'web': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'WEB', + 'clientVersion': '2.20241126.01.00', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 1, + 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], + 'SUPPORTS_COOKIES': True, + }, + # Safari UA returns pre-merged video+audio 144p/240p/360p/720p/1080p HLS formats + 'web_safari': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'WEB', + 'clientVersion': '2.20241126.01.00', + 'userAgent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15,gzip(gfe)', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 1, + 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], + 'SUPPORTS_COOKIES': True, + }, + 'web_embedded': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'WEB_EMBEDDED_PLAYER', + 'clientVersion': '1.20241201.00.00', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 56, + 'SUPPORTS_COOKIES': True, + }, + 'web_music': { + 'INNERTUBE_HOST': 'music.youtube.com', + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'WEB_REMIX', + 'clientVersion': '1.20241127.01.00', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 67, + 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], + 'SUPPORTS_COOKIES': True, + }, + # This client now requires sign-in for every video + 'web_creator': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'WEB_CREATOR', + 'clientVersion': '1.20241203.01.00', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 62, + 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], + 'REQUIRE_AUTH': True, + 'SUPPORTS_COOKIES': True, + }, + 'android': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'ANDROID', + 'clientVersion': '19.44.38', + 'androidSdkVersion': 30, + 'userAgent': 'com.google.android.youtube/19.44.38 (Linux; U; Android 11) gzip', + 'osName': 'Android', + 'osVersion': '11', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 3, + 'REQUIRE_JS_PLAYER': False, + 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], + }, + # This client now requires sign-in for every video + 'android_music': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'ANDROID_MUSIC', + 'clientVersion': '7.27.52', + 'androidSdkVersion': 30, + 'userAgent': 'com.google.android.apps.youtube.music/7.27.52 (Linux; U; Android 11) gzip', + 'osName': 'Android', + 'osVersion': '11', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 21, + 'REQUIRE_JS_PLAYER': False, + 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], + 'REQUIRE_AUTH': True, + }, + # This client now requires sign-in for every video + 'android_creator': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'ANDROID_CREATOR', + 'clientVersion': '24.45.100', + 'androidSdkVersion': 30, + 'userAgent': 'com.google.android.apps.youtube.creator/24.45.100 (Linux; U; Android 11) gzip', + 'osName': 
'Android', + 'osVersion': '11', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 14, + 'REQUIRE_JS_PLAYER': False, + 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], + 'REQUIRE_AUTH': True, + }, + # YouTube Kids videos aren't returned on this client for some reason + 'android_vr': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'ANDROID_VR', + 'clientVersion': '1.60.19', + 'deviceMake': 'Oculus', + 'deviceModel': 'Quest 3', + 'androidSdkVersion': 32, + 'userAgent': 'com.google.android.apps.youtube.vr.oculus/1.60.19 (Linux; U; Android 12L; eureka-user Build/SQ3A.220605.009.A1) gzip', + 'osName': 'Android', + 'osVersion': '12L', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 28, + 'REQUIRE_JS_PLAYER': False, + }, + # iOS clients have HLS live streams. Setting device model to get 60fps formats. + # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680#issuecomment-1002724558 + 'ios': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'IOS', + 'clientVersion': '20.03.02', + 'deviceMake': 'Apple', + 'deviceModel': 'iPhone16,2', + 'userAgent': 'com.google.ios.youtube/20.03.02 (iPhone16,2; U; CPU iOS 18_2_1 like Mac OS X;)', + 'osName': 'iPhone', + 'osVersion': '18.2.1.22C161', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 5, + 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], + 'REQUIRE_JS_PLAYER': False, + }, + # This client now requires sign-in for every video + 'ios_music': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'IOS_MUSIC', + 'clientVersion': '7.27.0', + 'deviceMake': 'Apple', + 'deviceModel': 'iPhone16,2', + 'userAgent': 'com.google.ios.youtubemusic/7.27.0 (iPhone16,2; U; CPU iOS 18_1_0 like Mac OS X;)', + 'osName': 'iPhone', + 'osVersion': '18.1.0.22B83', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 26, + 'REQUIRE_JS_PLAYER': False, + 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], + 'REQUIRE_AUTH': True, + }, + # This client now requires sign-in for every video + 'ios_creator': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'IOS_CREATOR', + 'clientVersion': '24.45.100', + 'deviceMake': 'Apple', + 'deviceModel': 'iPhone16,2', + 'userAgent': 'com.google.ios.ytcreator/24.45.100 (iPhone16,2; U; CPU iOS 18_1_0 like Mac OS X;)', + 'osName': 'iPhone', + 'osVersion': '18.1.0.22B83', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 15, + 'REQUIRE_JS_PLAYER': False, + 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], + 'REQUIRE_AUTH': True, + }, + # mweb has 'ultralow' formats + # See: https://github.com/yt-dlp/yt-dlp/pull/557 + 'mweb': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'MWEB', + 'clientVersion': '2.20241202.07.00', + # mweb previously did not require PO Token with this UA + 'userAgent': 'Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 2, + 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], + 'SUPPORTS_COOKIES': True, + }, + 'tv': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'TVHTML5', + 'clientVersion': '7.20250120.19.00', + 'userAgent': 'Mozilla/5.0 (ChromiumStylePlatform) Cobalt/Version', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 7, + 'SUPPORTS_COOKIES': True, + }, + # This client now requires sign-in for every video + # It was previously an age-gate workaround for videos that were `playable_in_embed` + # It may still be useful if signed into an EU account that is not age-verified + 'tv_embedded': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 
'TVHTML5_SIMPLY_EMBEDDED_PLAYER', + 'clientVersion': '2.0', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 85, + 'REQUIRE_AUTH': True, + 'SUPPORTS_COOKIES': True, + }, +} + + +def _split_innertube_client(client_name): + variant, *base = client_name.rsplit('.', 1) + if base: + return variant, base[0], variant + base, *variant = client_name.split('_', 1) + return client_name, base, variant[0] if variant else None + + +def short_client_name(client_name): + main, *parts = _split_innertube_client(client_name)[0].split('_') + return join_nonempty(main[:4], ''.join(x[0] for x in parts)).upper() + + +def build_innertube_clients(): + THIRD_PARTY = { + 'embedUrl': 'https://www.youtube.com/', # Can be any valid URL + } + BASE_CLIENTS = ('ios', 'web', 'tv', 'mweb', 'android') + priority = qualities(BASE_CLIENTS[::-1]) + + for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()): + ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com') + ytcfg.setdefault('REQUIRE_JS_PLAYER', True) + ytcfg.setdefault('PO_TOKEN_REQUIRED_CONTEXTS', []) + ytcfg.setdefault('REQUIRE_AUTH', False) + ytcfg.setdefault('SUPPORTS_COOKIES', False) + ytcfg.setdefault('PLAYER_PARAMS', None) + ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en') + + _, base_client, variant = _split_innertube_client(client) + ytcfg['priority'] = 10 * priority(base_client) + + if variant == 'embedded': + ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = THIRD_PARTY + ytcfg['priority'] -= 2 + elif variant: + ytcfg['priority'] -= 3 + + +build_innertube_clients() + + +class BadgeType(enum.Enum): + AVAILABILITY_UNLISTED = enum.auto() + AVAILABILITY_PRIVATE = enum.auto() + AVAILABILITY_PUBLIC = enum.auto() + AVAILABILITY_PREMIUM = enum.auto() + AVAILABILITY_SUBSCRIPTION = enum.auto() + LIVE_NOW = enum.auto() + VERIFIED = enum.auto() + + +CONFIGURATION_ARG_KEY = 'youtube' + + +class YoutubeBaseInfoExtractor(InfoExtractor): + """Provide base functions for Youtube extractors""" + + _RESERVED_NAMES = ( + r'channel|c|user|playlist|watch|w|v|embed|e|live|watch_popup|clip|' + r'shorts|movies|results|search|shared|hashtag|trending|explore|feed|feeds|' + r'browse|oembed|get_video_info|iframe_api|s/player|source|' + r'storefront|oops|index|account|t/terms|about|upload|signin|logout') + + _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)' + + # _NETRC_MACHINE = 'youtube' + + # If True it will raise an error if no login info is provided + _LOGIN_REQUIRED = False + + _INVIDIOUS_SITES = ( + # invidious-redirect websites + r'(?:www\.)?redirect\.invidious\.io', + r'(?:(?:www|dev)\.)?invidio\.us', + # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/docs/instances.md + r'(?:www\.)?invidious\.pussthecat\.org', + r'(?:www\.)?invidious\.zee\.li', + r'(?:www\.)?invidious\.ethibox\.fr', + r'(?:www\.)?iv\.ggtyler\.dev', + r'(?:www\.)?inv\.vern\.i2p', + r'(?:www\.)?am74vkcrjp2d5v36lcdqgsj2m6x36tbrkhsruoegwfcizzabnfgf5zyd\.onion', + r'(?:www\.)?inv\.riverside\.rocks', + r'(?:www\.)?invidious\.silur\.me', + r'(?:www\.)?inv\.bp\.projectsegfau\.lt', + r'(?:www\.)?invidious\.g4c3eya4clenolymqbpgwz3q3tawoxw56yhzk4vugqrl6dtu3ejvhjid\.onion', + r'(?:www\.)?invidious\.slipfox\.xyz', + r'(?:www\.)?invidious\.esmail5pdn24shtvieloeedh7ehz3nrwcdivnfhfcedl7gf4kwddhkqd\.onion', + r'(?:www\.)?inv\.vernccvbvyi5qhfzyqengccj7lkove6bjot2xhh5kajhwvidqafczrad\.onion', + r'(?:www\.)?invidious\.tiekoetter\.com', + r'(?:www\.)?iv\.odysfvr23q5wgt7i456o5t3trw2cw5dgn56vbjfbq2m7xsc5vqbqpcyd\.onion', + 
r'(?:www\.)?invidious\.nerdvpn\.de', + r'(?:www\.)?invidious\.weblibre\.org', + r'(?:www\.)?inv\.odyssey346\.dev', + r'(?:www\.)?invidious\.dhusch\.de', + r'(?:www\.)?iv\.melmac\.space', + r'(?:www\.)?watch\.thekitty\.zone', + r'(?:www\.)?invidious\.privacydev\.net', + r'(?:www\.)?ng27owmagn5amdm7l5s3rsqxwscl5ynppnis5dqcasogkyxcfqn7psid\.onion', + r'(?:www\.)?invidious\.drivet\.xyz', + r'(?:www\.)?vid\.priv\.au', + r'(?:www\.)?euxxcnhsynwmfidvhjf6uzptsmh4dipkmgdmcmxxuo7tunp3ad2jrwyd\.onion', + r'(?:www\.)?inv\.vern\.cc', + r'(?:www\.)?invidious\.esmailelbob\.xyz', + r'(?:www\.)?invidious\.sethforprivacy\.com', + r'(?:www\.)?yt\.oelrichsgarcia\.de', + r'(?:www\.)?yt\.artemislena\.eu', + r'(?:www\.)?invidious\.flokinet\.to', + r'(?:www\.)?invidious\.baczek\.me', + r'(?:www\.)?y\.com\.sb', + r'(?:www\.)?invidious\.epicsite\.xyz', + r'(?:www\.)?invidious\.lidarshield\.cloud', + r'(?:www\.)?yt\.funami\.tech', + r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion', + r'(?:www\.)?osbivz6guyeahrwp2lnwyjk2xos342h4ocsxyqrlaopqjuhwn2djiiyd\.onion', + r'(?:www\.)?u2cvlit75owumwpy4dj2hsmvkq7nvrclkpht7xgyye2pyoxhpmclkrad\.onion', + # youtube-dl invidious instances list + r'(?:(?:www|no)\.)?invidiou\.sh', + r'(?:(?:www|fi)\.)?invidious\.snopyta\.org', + r'(?:www\.)?invidious\.kabi\.tk', + r'(?:www\.)?invidious\.mastodon\.host', + r'(?:www\.)?invidious\.zapashcanon\.fr', + r'(?:www\.)?(?:invidious(?:-us)?|piped)\.kavin\.rocks', + r'(?:www\.)?invidious\.tinfoil-hat\.net', + r'(?:www\.)?invidious\.himiko\.cloud', + r'(?:www\.)?invidious\.reallyancient\.tech', + r'(?:www\.)?invidious\.tube', + r'(?:www\.)?invidiou\.site', + r'(?:www\.)?invidious\.site', + r'(?:www\.)?invidious\.xyz', + r'(?:www\.)?invidious\.nixnet\.xyz', + r'(?:www\.)?invidious\.048596\.xyz', + r'(?:www\.)?invidious\.drycat\.fr', + r'(?:www\.)?inv\.skyn3t\.in', + r'(?:www\.)?tube\.poal\.co', + r'(?:www\.)?tube\.connect\.cafe', + r'(?:www\.)?vid\.wxzm\.sx', + r'(?:www\.)?vid\.mint\.lgbt', + r'(?:www\.)?vid\.puffyan\.us', + r'(?:www\.)?yewtu\.be', + r'(?:www\.)?yt\.elukerio\.org', + r'(?:www\.)?yt\.lelux\.fi', + r'(?:www\.)?invidious\.ggc-project\.de', + r'(?:www\.)?yt\.maisputain\.ovh', + r'(?:www\.)?ytprivate\.com', + r'(?:www\.)?invidious\.13ad\.de', + r'(?:www\.)?invidious\.toot\.koeln', + r'(?:www\.)?invidious\.fdn\.fr', + r'(?:www\.)?watch\.nettohikari\.com', + r'(?:www\.)?invidious\.namazso\.eu', + r'(?:www\.)?invidious\.silkky\.cloud', + r'(?:www\.)?invidious\.exonip\.de', + r'(?:www\.)?invidious\.riverside\.rocks', + r'(?:www\.)?invidious\.blamefran\.net', + r'(?:www\.)?invidious\.moomoo\.de', + r'(?:www\.)?ytb\.trom\.tf', + r'(?:www\.)?yt\.cyberhost\.uk', + r'(?:www\.)?kgg2m7yk5aybusll\.onion', + r'(?:www\.)?qklhadlycap4cnod\.onion', + r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion', + r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion', + r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion', + r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion', + r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p', + r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion', + r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion', + r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion', + r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion', + 
r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion', + # piped instances from https://github.com/TeamPiped/Piped/wiki/Instances + r'(?:www\.)?piped\.kavin\.rocks', + r'(?:www\.)?piped\.tokhmi\.xyz', + r'(?:www\.)?piped\.syncpundit\.io', + r'(?:www\.)?piped\.mha\.fi', + r'(?:www\.)?watch\.whatever\.social', + r'(?:www\.)?piped\.garudalinux\.org', + r'(?:www\.)?piped\.rivo\.lol', + r'(?:www\.)?piped-libre\.kavin\.rocks', + r'(?:www\.)?yt\.jae\.fi', + r'(?:www\.)?piped\.mint\.lgbt', + r'(?:www\.)?il\.ax', + r'(?:www\.)?piped\.esmailelbob\.xyz', + r'(?:www\.)?piped\.projectsegfau\.lt', + r'(?:www\.)?piped\.privacydev\.net', + r'(?:www\.)?piped\.palveluntarjoaja\.eu', + r'(?:www\.)?piped\.smnz\.de', + r'(?:www\.)?piped\.adminforge\.de', + r'(?:www\.)?watch\.whatevertinfoil\.de', + r'(?:www\.)?piped\.qdi\.fi', + r'(?:(?:www|cf)\.)?piped\.video', + r'(?:www\.)?piped\.aeong\.one', + r'(?:www\.)?piped\.moomoo\.me', + r'(?:www\.)?piped\.chauvet\.pro', + r'(?:www\.)?watch\.leptons\.xyz', + r'(?:www\.)?pd\.vern\.cc', + r'(?:www\.)?piped\.hostux\.net', + r'(?:www\.)?piped\.lunar\.icu', + # Hyperpipe instances from https://hyperpipe.codeberg.page/ + r'(?:www\.)?hyperpipe\.surge\.sh', + r'(?:www\.)?hyperpipe\.esmailelbob\.xyz', + r'(?:www\.)?listen\.whatever\.social', + r'(?:www\.)?music\.adminforge\.de', + ) + + # extracted from account/account_menu ep + # XXX: These are the supported YouTube UI and API languages, + # which is slightly different from languages supported for translation in YouTube studio + _SUPPORTED_LANG_CODES = [ + 'af', 'az', 'id', 'ms', 'bs', 'ca', 'cs', 'da', 'de', 'et', 'en-IN', 'en-GB', 'en', 'es', + 'es-419', 'es-US', 'eu', 'fil', 'fr', 'fr-CA', 'gl', 'hr', 'zu', 'is', 'it', 'sw', 'lv', + 'lt', 'hu', 'nl', 'no', 'uz', 'pl', 'pt-PT', 'pt', 'ro', 'sq', 'sk', 'sl', 'sr-Latn', 'fi', + 'sv', 'vi', 'tr', 'be', 'bg', 'ky', 'kk', 'mk', 'mn', 'ru', 'sr', 'uk', 'el', 'hy', 'iw', + 'ur', 'ar', 'fa', 'ne', 'mr', 'hi', 'as', 'bn', 'pa', 'gu', 'or', 'ta', 'te', 'kn', 'ml', + 'si', 'th', 'lo', 'my', 'ka', 'am', 'km', 'zh-CN', 'zh-TW', 'zh-HK', 'ja', 'ko', + ] + + _IGNORED_WARNINGS = { + 'Unavailable videos will be hidden during playback', + 'Unavailable videos are hidden', + } + + _YT_HANDLE_RE = r'@[\w.-]{3,30}' # https://support.google.com/youtube/answer/11585688?hl=en + _YT_CHANNEL_UCID_RE = r'UC[\w-]{22}' + + _NETRC_MACHINE = 'youtube' + + def ucid_or_none(self, ucid): + return self._search_regex(rf'^({self._YT_CHANNEL_UCID_RE})$', ucid, 'UC-id', default=None) + + def handle_or_none(self, handle): + return self._search_regex(rf'^({self._YT_HANDLE_RE})$', urllib.parse.unquote(handle or ''), + '@-handle', default=None) + + def handle_from_url(self, url): + return self._search_regex(rf'^(?:https?://(?:www\.)?youtube\.com)?/({self._YT_HANDLE_RE})', + urllib.parse.unquote(url or ''), 'channel handle', default=None) + + def ucid_from_url(self, url): + return self._search_regex(rf'^(?:https?://(?:www\.)?youtube\.com)?/({self._YT_CHANNEL_UCID_RE})', + url, 'channel id', default=None) + + @functools.cached_property + def _preferred_lang(self): + """ + Returns a language code supported by YouTube for the user preferred language. + Returns None if no preferred language set. + """ + preferred_lang = self._configuration_arg('lang', ie_key='Youtube', casesense=True, default=[''])[0] + if not preferred_lang: + return + if preferred_lang not in self._SUPPORTED_LANG_CODES: + raise ExtractorError( + f'Unsupported language code: {preferred_lang}. 
Supported language codes (case-sensitive): {join_nonempty(*self._SUPPORTED_LANG_CODES, delim=", ")}.', + expected=True) + elif preferred_lang != 'en': + self.report_warning( + f'Preferring "{preferred_lang}" translated fields. Note that some metadata extraction may fail or be incorrect.') + return preferred_lang + + def _initialize_consent(self): + cookies = self._get_cookies('https://www.youtube.com/') + if cookies.get('__Secure-3PSID'): + return + socs = cookies.get('SOCS') + if socs and not socs.value.startswith('CAA'): # not consented + return + self._set_cookie('.youtube.com', 'SOCS', 'CAI', secure=True) # accept all (required for mixes) + + def _initialize_pref(self): + cookies = self._get_cookies('https://www.youtube.com/') + pref_cookie = cookies.get('PREF') + pref = {} + if pref_cookie: + try: + pref = dict(urllib.parse.parse_qsl(pref_cookie.value)) + except ValueError: + self.report_warning('Failed to parse user PREF cookie' + bug_reports_message()) + pref.update({'hl': self._preferred_lang or 'en', 'tz': 'UTC'}) + self._set_cookie('.youtube.com', name='PREF', value=urllib.parse.urlencode(pref)) + + def _initialize_cookie_auth(self): + yt_sapisid, yt_1psapisid, yt_3psapisid = self._get_sid_cookies() + if yt_sapisid or yt_1psapisid or yt_3psapisid: + self.write_debug('Found YouTube account cookies') + + def _real_initialize(self): + self._initialize_pref() + self._initialize_consent() + self._initialize_cookie_auth() + self._check_login_required() + + def _perform_login(self, username, password): + if username.startswith('oauth'): + raise ExtractorError( + f'Login with OAuth is no longer supported. {self._youtube_login_hint}', expected=True) + + self.report_warning( + f'Login with password is not supported for YouTube. {self._youtube_login_hint}') + + @property + def _youtube_login_hint(self): + return (f'{self._login_hint(method="cookies")}. Also see ' + 'https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies ' + 'for tips on effectively exporting YouTube cookies') + + def _check_login_required(self): + if self._LOGIN_REQUIRED and not self.is_authenticated: + self.raise_login_required( + f'Login details are needed to download this content. 
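
As a quick illustration of the PREF handling in _initialize_pref above: the cookie value is parsed with urllib.parse.parse_qsl, the language and timezone keys are forced, and the result is re-serialized with urlencode. A minimal standalone sketch; rebuild_pref is a hypothetical helper and the input cookie value is made up, neither is part of the patch:

import urllib.parse

def rebuild_pref(raw_value, hl='en'):
    # Parse existing key=value pairs; fall back to empty prefs on a malformed cookie,
    # mirroring the try/except around parse_qsl in _initialize_pref
    try:
        pref = dict(urllib.parse.parse_qsl(raw_value))
    except ValueError:
        pref = {}
    # Pin interface language and timezone so extracted metadata is deterministic
    pref.update({'hl': hl, 'tz': 'UTC'})
    return urllib.parse.urlencode(pref)

rebuild_pref('f1=50000000&hl=de')  # -> 'f1=50000000&hl=en&tz=UTC'
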
{self._youtube_login_hint}', method=None) + + _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=' + _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=' + + def _get_default_ytcfg(self, client='web'): + return copy.deepcopy(INNERTUBE_CLIENTS[client]) + + def _get_innertube_host(self, client='web'): + return INNERTUBE_CLIENTS[client]['INNERTUBE_HOST'] + + def _ytcfg_get_safe(self, ytcfg, getter, expected_type=None, default_client='web'): + # try_get but with fallback to default ytcfg client values when present + _func = lambda y: try_get(y, getter, expected_type) + return _func(ytcfg) or _func(self._get_default_ytcfg(default_client)) + + def _extract_client_name(self, ytcfg, default_client='web'): + return self._ytcfg_get_safe( + ytcfg, (lambda x: x['INNERTUBE_CLIENT_NAME'], + lambda x: x['INNERTUBE_CONTEXT']['client']['clientName']), str, default_client) + + def _extract_client_version(self, ytcfg, default_client='web'): + return self._ytcfg_get_safe( + ytcfg, (lambda x: x['INNERTUBE_CLIENT_VERSION'], + lambda x: x['INNERTUBE_CONTEXT']['client']['clientVersion']), str, default_client) + + def _select_api_hostname(self, req_api_hostname, default_client=None): + return (self._configuration_arg('innertube_host', [''], ie_key=CONFIGURATION_ARG_KEY)[0] + or req_api_hostname or self._get_innertube_host(default_client or 'web')) + + def _extract_context(self, ytcfg=None, default_client='web'): + context = get_first( + (ytcfg, self._get_default_ytcfg(default_client)), 'INNERTUBE_CONTEXT', expected_type=dict) + # Enforce language and tz for extraction + client_context = traverse_obj(context, 'client', expected_type=dict, default={}) + client_context.update({'hl': self._preferred_lang or 'en', 'timeZone': 'UTC', 'utcOffsetMinutes': 0}) + return context + + @staticmethod + def _make_sid_authorization(scheme, sid, origin, additional_parts): + timestamp = str(round(time.time())) + + hash_parts = [] + if additional_parts: + hash_parts.append(':'.join(additional_parts.values())) + hash_parts.extend([timestamp, sid, origin]) + sidhash = hashlib.sha1(' '.join(hash_parts).encode()).hexdigest() + + parts = [timestamp, sidhash] + if additional_parts: + parts.append(''.join(additional_parts)) + + return f'{scheme} {"_".join(parts)}' + + def _get_sid_cookies(self): + """ + Get SAPISID, 1PSAPISID, 3PSAPISID cookie values + @returns sapisid, 1psapisid, 3psapisid + """ + yt_cookies = self._get_cookies('https://www.youtube.com') + yt_sapisid = try_call(lambda: yt_cookies['SAPISID'].value) + yt_3papisid = try_call(lambda: yt_cookies['__Secure-3PAPISID'].value) + yt_1papisid = try_call(lambda: yt_cookies['__Secure-1PAPISID'].value) + + # Sometimes SAPISID cookie isn't present but __Secure-3PAPISID is. + # YouTube also falls back to __Secure-3PAPISID if SAPISID is missing. + # See: https://github.com/yt-dlp/yt-dlp/issues/393 + + return yt_sapisid or yt_3papisid, yt_1papisid, yt_3papisid + + def _get_sid_authorization_header(self, origin='https://www.youtube.com', user_session_id=None): + """ + Generate API Session ID Authorization for Innertube requests. Assumes all requests are secure (https). 
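
The SAPISIDHASH scheme implemented by _make_sid_authorization above boils down to: sha1 over 'timestamp sid origin', prefixed with the same timestamp. A condensed sketch for the common case with no additional session parts; sapisidhash is a hypothetical helper, not part of the patch, and the cookie value would come from the user's browser:

import hashlib
import time

def sapisidhash(sapisid, origin='https://www.youtube.com'):
    # Same recipe as _make_sid_authorization with empty additional_parts
    timestamp = str(round(time.time()))
    digest = hashlib.sha1(f'{timestamp} {sapisid} {origin}'.encode()).hexdigest()
    return f'SAPISIDHASH {timestamp}_{digest}'

# Typical use: headers['Authorization'] = sapisidhash(sapisid_cookie_value)
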
+ @param origin: Origin URL + @param user_session_id: Optional User Session ID + @return: Authorization header value + """ + + authorizations = [] + additional_parts = {} + if user_session_id: + additional_parts['u'] = user_session_id + + yt_sapisid, yt_1psapisid, yt_3psapisid = self._get_sid_cookies() + + for scheme, sid in (('SAPISIDHASH', yt_sapisid), + ('SAPISID1PHASH', yt_1psapisid), + ('SAPISID3PHASH', yt_3psapisid)): + if sid: + authorizations.append(self._make_sid_authorization(scheme, sid, origin, additional_parts)) + + if not authorizations: + return None + + return ' '.join(authorizations) + + def _call_api(self, ep, query, video_id, fatal=True, headers=None, + note='Downloading API JSON', errnote='Unable to download API page', + context=None, api_key=None, api_hostname=None, default_client='web'): + + data = {'context': context} if context else {'context': self._extract_context(default_client=default_client)} + data.update(query) + real_headers = self.generate_api_headers(default_client=default_client) + real_headers.update({'content-type': 'application/json'}) + if headers: + real_headers.update(headers) + return self._download_json( + f'https://{self._select_api_hostname(api_hostname, default_client)}/youtubei/v1/{ep}', + video_id=video_id, fatal=fatal, note=note, errnote=errnote, + data=json.dumps(data).encode('utf8'), headers=real_headers, + query=filter_dict({ + 'key': self._configuration_arg( + 'innertube_key', [api_key], ie_key=CONFIGURATION_ARG_KEY, casesense=True)[0], + 'prettyPrint': 'false', + }, cndn=lambda _, v: v)) + + def extract_yt_initial_data(self, item_id, webpage, fatal=True): + return self._search_json(self._YT_INITIAL_DATA_RE, webpage, 'yt initial data', item_id, fatal=fatal) + + @staticmethod + def _extract_session_index(*data): + """ + Index of current account in account list. + See: https://github.com/yt-dlp/yt-dlp/pull/519 + """ + for ytcfg in data: + session_index = int_or_none(try_get(ytcfg, lambda x: x['SESSION_INDEX'])) + if session_index is not None: + return session_index + + @staticmethod + def _parse_data_sync_id(data_sync_id): + """ + Parse data_sync_id into delegated_session_id and user_session_id. + + data_sync_id is of the form "delegated_session_id||user_session_id" for secondary channel + and just "user_session_id||" for primary channel. + + @param data_sync_id: data_sync_id string + @return: Tuple of (delegated_session_id, user_session_id) + """ + if not data_sync_id: + return None, None + first, _, second = data_sync_id.partition('||') + if second: + return first, second + return None, first + + def _extract_delegated_session_id(self, *args): + """ + Extract current delegated session ID required to download private playlists of secondary channels + @params response and/or ytcfg + @return: delegated session ID + """ + # ytcfg includes channel_syncid if on secondary channel + if delegated_sid := traverse_obj(args, (..., 'DELEGATED_SESSION_ID', {str}, any)): + return delegated_sid + + data_sync_id = self._extract_data_sync_id(*args) + return self._parse_data_sync_id(data_sync_id)[0] + + def _extract_user_session_id(self, *args): + """ + Extract current user session ID + @params response and/or ytcfg + @return: user session ID + """ + if user_sid := traverse_obj(args, (..., 'USER_SESSION_ID', {str}, any)): + return user_sid + + data_sync_id = self._extract_data_sync_id(*args) + return self._parse_data_sync_id(data_sync_id)[1] + + def _extract_data_sync_id(self, *args): + """ + Extract current account dataSyncId. 
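
The dataSyncId splitting handled by _parse_data_sync_id above is a single str.partition. A standalone sketch with made-up session IDs; parse_data_sync_id mirrors the patch's logic but is not part of it:

def parse_data_sync_id(data_sync_id):
    first, _, second = data_sync_id.partition('||')
    if second:
        # 'delegated||user': secondary channel
        return first, second
    # 'user||': primary channel, so there is no delegated session
    return None, first

parse_data_sync_id('delegated123||user456')  # -> ('delegated123', 'user456')
parse_data_sync_id('user456||')              # -> (None, 'user456')
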
+ In the format DELEGATED_SESSION_ID||USER_SESSION_ID or USER_SESSION_ID|| + @params response and/or ytcfg + """ + if data_sync_id := self._configuration_arg('data_sync_id', [None], ie_key=CONFIGURATION_ARG_KEY, casesense=True)[0]: + return data_sync_id + + return traverse_obj( + args, (..., ('DATASYNC_ID', ('responseContext', 'mainAppWebResponseContext', 'datasyncId')), {str}, any)) + + def _extract_visitor_data(self, *args): + """ + Extracts visitorData from an API response or ytcfg + Appears to be used to track session state + """ + if visitor_data := self._configuration_arg('visitor_data', [None], ie_key=CONFIGURATION_ARG_KEY, casesense=True)[0]: + return visitor_data + return get_first( + args, [('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))], + expected_type=str) + + @functools.cached_property + def is_authenticated(self): + return bool(self._get_sid_authorization_header()) + + def extract_ytcfg(self, video_id, webpage): + if not webpage: + return {} + return self._parse_json( + self._search_regex( + r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg', + default='{}'), video_id, fatal=False) or {} + + def _generate_cookie_auth_headers(self, *, ytcfg=None, delegated_session_id=None, user_session_id=None, session_index=None, origin=None, **kwargs): + headers = {} + delegated_session_id = delegated_session_id or self._extract_delegated_session_id(ytcfg) + if delegated_session_id: + headers['X-Goog-PageId'] = delegated_session_id + if session_index is None: + session_index = self._extract_session_index(ytcfg) + if delegated_session_id or session_index is not None: + headers['X-Goog-AuthUser'] = session_index if session_index is not None else 0 + + auth = self._get_sid_authorization_header(origin, user_session_id=user_session_id or self._extract_user_session_id(ytcfg)) + if auth is not None: + headers['Authorization'] = auth + headers['X-Origin'] = origin + + if traverse_obj(ytcfg, 'LOGGED_IN', expected_type=bool): + headers['X-Youtube-Bootstrap-Logged-In'] = 'true' + + return headers + + def generate_api_headers( + self, *, ytcfg=None, delegated_session_id=None, user_session_id=None, session_index=None, + visitor_data=None, api_hostname=None, default_client='web', **kwargs): + + origin = 'https://' + (self._select_api_hostname(api_hostname, default_client)) + headers = { + 'X-YouTube-Client-Name': str( + self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT_CLIENT_NAME'], default_client=default_client)), + 'X-YouTube-Client-Version': self._extract_client_version(ytcfg, default_client), + 'Origin': origin, + 'X-Goog-Visitor-Id': visitor_data or self._extract_visitor_data(ytcfg), + 'User-Agent': self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT']['client']['userAgent'], default_client=default_client), + **self._generate_cookie_auth_headers( + ytcfg=ytcfg, + delegated_session_id=delegated_session_id, + user_session_id=user_session_id, + session_index=session_index, + origin=origin), + } + return filter_dict(headers) + + def _download_webpage_with_retries(self, *args, retry_fatal=False, retry_on_status=None, **kwargs): + for retry in self.RetryManager(fatal=retry_fatal): + try: + return self._download_webpage(*args, **kwargs) + except ExtractorError as e: + if isinstance(e.cause, network_exceptions): + if not isinstance(e.cause, HTTPError) or e.cause.status not in (retry_on_status or (403, 429)): + retry.error = e + continue + self._error_or_warning(e, fatal=retry_fatal) + break + + def _download_ytcfg(self, client, 
video_id): + url = { + 'web': 'https://www.youtube.com', + 'web_music': 'https://music.youtube.com', + 'web_embedded': f'https://www.youtube.com/embed/{video_id}?html5=1', + 'tv': 'https://www.youtube.com/tv', + }.get(client) + if not url: + return {} + webpage = self._download_webpage_with_retries( + url, video_id, note=f'Downloading {client.replace("_", " ").strip()} client config', + headers=traverse_obj(self._get_default_ytcfg(client), { + 'User-Agent': ('INNERTUBE_CONTEXT', 'client', 'userAgent', {str}), + })) + return self.extract_ytcfg(video_id, webpage) or {} + + @staticmethod + def _build_api_continuation_query(continuation, ctp=None): + query = { + 'continuation': continuation, + } + # TODO: Inconsistency with clickTrackingParams. + # Currently we have a fixed ctp contained within context (from ytcfg) + # and a ctp in root query for continuation. + if ctp: + query['clickTracking'] = {'clickTrackingParams': ctp} + return query + + @classmethod + def _extract_next_continuation_data(cls, renderer): + next_continuation = try_get( + renderer, (lambda x: x['continuations'][0]['nextContinuationData'], + lambda x: x['continuation']['reloadContinuationData']), dict) + if not next_continuation: + return + continuation = next_continuation.get('continuation') + if not continuation: + return + ctp = next_continuation.get('clickTrackingParams') + return cls._build_api_continuation_query(continuation, ctp) + + @classmethod + def _extract_continuation_ep_data(cls, continuation_ep: dict): + if isinstance(continuation_ep, dict): + continuation = try_get( + continuation_ep, lambda x: x['continuationCommand']['token'], str) + if not continuation: + return + ctp = continuation_ep.get('clickTrackingParams') + return cls._build_api_continuation_query(continuation, ctp) + + @classmethod + def _extract_continuation(cls, renderer): + next_continuation = cls._extract_next_continuation_data(renderer) + if next_continuation: + return next_continuation + + return traverse_obj(renderer, ( + ('contents', 'items', 'rows'), ..., 'continuationItemRenderer', + ('continuationEndpoint', ('button', 'buttonRenderer', 'command')), + ), get_all=False, expected_type=cls._extract_continuation_ep_data) + + @classmethod + def _extract_alerts(cls, data): + for alert_dict in try_get(data, lambda x: x['alerts'], list) or []: + if not isinstance(alert_dict, dict): + continue + for alert in alert_dict.values(): + alert_type = alert.get('type') + if not alert_type: + continue + message = cls._get_text(alert, 'text') + if message: + yield alert_type, message + + def _report_alerts(self, alerts, expected=True, fatal=True, only_once=False): + errors, warnings = [], [] + for alert_type, alert_message in alerts: + if alert_type.lower() == 'error' and fatal: + errors.append([alert_type, alert_message]) + elif alert_message not in self._IGNORED_WARNINGS: + warnings.append([alert_type, alert_message]) + + for alert_type, alert_message in (warnings + errors[:-1]): + self.report_warning(f'YouTube said: {alert_type} - {alert_message}', only_once=only_once) + if errors: + raise ExtractorError(f'YouTube said: {errors[-1][1]}', expected=expected) + + def _extract_and_report_alerts(self, data, *args, **kwargs): + return self._report_alerts(self._extract_alerts(data), *args, **kwargs) + + def _extract_badges(self, badge_list: list): + """ + Extract known BadgeType's from a list of badge renderers. 
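
For context on the continuation helpers above: a continuationItemRenderer carries a token plus click-tracking params, which _extract_continuation_ep_data and _build_api_continuation_query reshape into the query for the next API page. A sketch with a made-up renderer snippet; the token and params values are fabricated for illustration:

renderer = {
    'contents': [{
        'continuationItemRenderer': {
            'continuationEndpoint': {
                'clickTrackingParams': 'CBQQ7zAiEwj...',
                'continuationCommand': {'token': '4qmFsgKdARIYVUM...'},
            },
        },
    }],
}
# _extract_continuation(renderer) walks to the endpoint and yields:
# {
#     'continuation': '4qmFsgKdARIYVUM...',
#     'clickTracking': {'clickTrackingParams': 'CBQQ7zAiEwj...'},
# }
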
+ @returns [{'type': BadgeType}] + """ + icon_type_map = { + 'PRIVACY_UNLISTED': BadgeType.AVAILABILITY_UNLISTED, + 'PRIVACY_PRIVATE': BadgeType.AVAILABILITY_PRIVATE, + 'PRIVACY_PUBLIC': BadgeType.AVAILABILITY_PUBLIC, + 'CHECK_CIRCLE_THICK': BadgeType.VERIFIED, + 'OFFICIAL_ARTIST_BADGE': BadgeType.VERIFIED, + 'CHECK': BadgeType.VERIFIED, + } + + badge_style_map = { + 'BADGE_STYLE_TYPE_MEMBERS_ONLY': BadgeType.AVAILABILITY_SUBSCRIPTION, + 'BADGE_STYLE_TYPE_PREMIUM': BadgeType.AVAILABILITY_PREMIUM, + 'BADGE_STYLE_TYPE_LIVE_NOW': BadgeType.LIVE_NOW, + 'BADGE_STYLE_TYPE_VERIFIED': BadgeType.VERIFIED, + 'BADGE_STYLE_TYPE_VERIFIED_ARTIST': BadgeType.VERIFIED, + } + + label_map = { + 'unlisted': BadgeType.AVAILABILITY_UNLISTED, + 'private': BadgeType.AVAILABILITY_PRIVATE, + 'members only': BadgeType.AVAILABILITY_SUBSCRIPTION, + 'live': BadgeType.LIVE_NOW, + 'premium': BadgeType.AVAILABILITY_PREMIUM, + 'verified': BadgeType.VERIFIED, + 'official artist channel': BadgeType.VERIFIED, + } + + badges = [] + for badge in traverse_obj(badge_list, (..., lambda key, _: re.search(r'[bB]adgeRenderer$', key))): + badge_type = ( + icon_type_map.get(traverse_obj(badge, ('icon', 'iconType'), expected_type=str)) + or badge_style_map.get(traverse_obj(badge, 'style')) + ) + if badge_type: + badges.append({'type': badge_type}) + continue + + # fallback, won't work in some languages + label = traverse_obj( + badge, 'label', ('accessibilityData', 'label'), 'tooltip', 'iconTooltip', get_all=False, expected_type=str, default='') + for match, label_badge_type in label_map.items(): + if match in label.lower(): + badges.append({'type': label_badge_type}) + break + + return badges + + @staticmethod + def _has_badge(badges, badge_type): + return bool(traverse_obj(badges, lambda _, v: v['type'] == badge_type)) + + @staticmethod + def _get_text(data, *path_list, max_runs=None): + for path in path_list or [None]: + if path is None: + obj = [data] + else: + obj = traverse_obj(data, path, default=[]) + if not any(key is ... or isinstance(key, (list, tuple)) for key in variadic(path)): + obj = [obj] + for item in obj: + text = try_get(item, lambda x: x['simpleText'], str) + if text: + return text + runs = try_get(item, lambda x: x['runs'], list) or [] + if not runs and isinstance(item, list): + runs = item + + runs = runs[:min(len(runs), max_runs or len(runs))] + text = ''.join(traverse_obj(runs, (..., 'text'), expected_type=str)) + if text: + return text + + def _get_count(self, data, *path_list): + count_text = self._get_text(data, *path_list) or '' + count = parse_count(count_text) + if count is None: + count = str_to_int( + self._search_regex(r'^([\d,]+)', re.sub(r'\s', '', count_text), 'count', default=None)) + return count + + @staticmethod + def _extract_thumbnails(data, *path_list, final_key='thumbnails'): + """ + Extract thumbnails from thumbnails dict + @param path_list: path list to level that contains 'thumbnails' key + """ + thumbnails = [] + for path in path_list or [()]: + for thumbnail in traverse_obj(data, (*variadic(path), final_key, ...)): + thumbnail_url = url_or_none(thumbnail.get('url')) + if not thumbnail_url: + continue + # Sometimes youtube gives a wrong thumbnail URL. 
See:
+                # https://github.com/yt-dlp/yt-dlp/issues/233
+                # https://github.com/ytdl-org/youtube-dl/issues/28023
+                if 'maxresdefault' in thumbnail_url:
+                    thumbnail_url = thumbnail_url.split('?')[0]
+                thumbnails.append({
+                    'url': thumbnail_url,
+                    'height': int_or_none(thumbnail.get('height')),
+                    'width': int_or_none(thumbnail.get('width')),
+                })
+        return thumbnails
+
+    @staticmethod
+    def extract_relative_time(relative_time_text):
+        """
+        Extracts a relative time from string and converts to dt object
+        e.g. 'streamed 6 days ago', '5 seconds ago (edited)', 'updated today', '8 yr ago'
+        """
+
+        # XXX: this could be moved to a general function in utils/_utils.py
+        # The relative time text strings are roughly the same as what
+        # Javascript's Intl.RelativeTimeFormat function generates.
+        # See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Intl/RelativeTimeFormat
+        mobj = re.search(
+            r'(?P<start>today|yesterday|now)|(?P
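
The section cuts off inside extract_relative_time's regex above. As a rough standalone illustration of the technique it describes, matching a count and unit with named groups and converting to an offset from now, here is a simplified version; this is not the patch's actual pattern or unit coverage, and months/years are approximated:

import datetime as dt
import re

_UNIT_SECONDS = {
    'sec': 1, 'second': 1, 'min': 60, 'minute': 60, 'hour': 3600,
    'day': 86400, 'week': 604800, 'month': 2592000, 'year': 31536000,
}

def relative_time(text):
    # Match e.g. '6 days ago' or '5 seconds ago (edited)'
    mobj = re.search(
        r'(?P<time>\d+)\s*(?P<unit>sec(?:ond)?|min(?:ute)?|hour|day|week|month|year)s?\s*ago',
        text)
    if not mobj:
        return None
    seconds = int(mobj.group('time')) * _UNIT_SECONDS[mobj.group('unit')]
    return dt.datetime.now(dt.timezone.utc) - dt.timedelta(seconds=seconds)

relative_time('streamed 6 days ago')  # -> an aware datetime roughly six days back
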