diff --git a/README.md b/README.md index e476c0084b..925ebd8c5b 100644 --- a/README.md +++ b/README.md @@ -1799,6 +1799,7 @@ #### youtube * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively * `player_client`: Clients to extract video data from. The currently available clients are `web`, `web_safari`, `web_embedded`, `web_music`, `web_creator`, `mweb`, `ios`, `android`, `android_vr`, `tv`, `tv_simply` and `tv_embedded`. By default, `tv,ios,web` is used, or `tv,web` is used when authenticating with cookies. The `web_music` client is added for `music.youtube.com` URLs when logged-in cookies are used. The `web_embedded` client is added for age-restricted videos but only works if the video is embeddable. The `tv_embedded` and `web_creator` clients are added for age-restricted videos if account age-verification is required. Some clients, such as `web` and `web_music`, require a `po_token` for their formats to be downloadable. Some clients, such as `web_creator`, will only work with authentication. Not all clients support authentication via cookies. You can use `default` for the default clients, or you can use `all` for all clients (not recommended). You can prefix a client with `-` to exclude it, e.g. `youtube:player_client=default,-ios` * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player), `initial_data` (skip initial data/next ep request). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause issues such as missing formats or metadata. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) and [#12826](https://github.com/yt-dlp/yt-dlp/issues/12826) for more details +* `webpage_skip`: Skip extraction of embedded webpage data. One or both of `player_response`, `initial_data`. These options are for testing purposes and don't skip any network requests * `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp. * `player_js_variant`: The player javascript variant to use for signature and nsig deciphering. The known variants are: `main`, `tce`, `tv`, `tv_es6`, `phone`, `tablet`. Only `main` is recommended as a possible workaround; the others are for debugging purposes. The default is to use what is prescribed by the site, and can be selected with `actual` * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) @@ -1900,6 +1901,10 @@ #### sonylivseries #### tver * `backend`: Backend API to use for extraction - one of `streaks` (default) or `brightcove` (deprecated) +#### vimeo +* `client`: Client to extract video data from. One of `android` (default), `ios` or `web`. The `ios` client only works with previously cached OAuth tokens. The `web` client only works when authenticated with credentials or account cookies +* `original_format_policy`: Policy for when to try extracting original formats. One of `always`, `never`, or `auto`. 
The default `auto` policy tries to avoid exceeding the API rate-limit by only making an extra request when Vimeo publicizes the video's downloadability + **Note**: These options may be changed/removed in the future without concern for backward compatibility diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index c9f70431f7..40dd05e136 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -1959,6 +1959,37 @@ def test_search_nextjs_data(self): with self.assertWarns(DeprecationWarning): self.assertEqual(self.ie._search_nextjs_data('', None, default='{}'), {}) + def test_search_nextjs_v13_data(self): + HTML = R''' + + + + + + + + ''' + EXPECTED = { + '18': { + 'foo': 'bar', + }, + '16': { + 'meta': { + 'dateCreated': 1730489700, + 'uuid': '40cac41d-8d29-4ef5-aa11-75047b9f0907', + }, + }, + '19': { + 'duplicated_field_name': {'x': 1}, + }, + '20': { + 'duplicated_field_name': {'y': 2}, + }, + } + self.assertEqual(self.ie._search_nextjs_v13_data(HTML, None), EXPECTED) + self.assertEqual(self.ie._search_nextjs_v13_data('', None, fatal=False), {}) + self.assertEqual(self.ie._search_nextjs_v13_data(None, None, fatal=False), {}) + def test_search_nuxt_json(self): HTML_TMPL = '' VALID_DATA = ''' diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 294fdbb083..c9172fef78 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -201,7 +201,6 @@ BanByeChannelIE, BanByeIE, ) -from .bandaichannel import BandaiChannelIE from .bandcamp import ( BandcampAlbumIE, BandcampIE, @@ -229,7 +228,6 @@ from .beatport import BeatportIE from .beeg import BeegIE from .behindkink import BehindKinkIE -from .bellmedia import BellMediaIE from .berufetv import BerufeTVIE from .bet import BetIE from .bfi import BFIPlayerIE @@ -309,6 +307,7 @@ BrilliantpalaClassesIE, BrilliantpalaElearnIE, ) +from .btvplus import BTVPlusIE from .bundesliga import BundesligaIE from .bundestag import BundestagIE from .bunnycdn import BunnyCdnIE @@ -446,7 +445,6 @@ CSpanIE, ) from .ctsnews import CtsNewsIE -from .ctv import CTVIE from .ctvnews import CTVNewsIE from .cultureunplugged import CultureUnpluggedIE from .curiositystream import ( @@ -928,7 +926,6 @@ JioSaavnSongIE, ) from .joj import JojIE -from .joqrag import JoqrAgIE from .jove import JoveIE from .jstream import JStreamIE from .jtbc import ( @@ -1031,11 +1028,6 @@ LikeeIE, LikeeUserIE, ) -from .limelight import ( - LimelightChannelIE, - LimelightChannelListIE, - LimelightMediaIE, -) from .linkedin import ( LinkedInEventsIE, LinkedInIE, @@ -1168,6 +1160,10 @@ MixcloudPlaylistIE, MixcloudUserIE, ) +from .mixlr import ( + MixlrIE, + MixlrRecoringIE, +) from .mlb import ( MLBIE, MLBTVIE, @@ -1378,7 +1374,6 @@ from .noice import NoicePodcastIE from .nonktube import NonkTubeIE from .noodlemagazine import NoodleMagazineIE -from .noovo import NoovoIE from .nosnl import NOSNLArticleIE from .nova import ( NovaEmbedIE, @@ -2286,6 +2281,7 @@ ) from .umg import UMGDeIE from .unistra import UnistraIE +from .unitednations import UnitedNationsWebTvIE from .unity import UnityIE from .unsupported import ( KnownDRMIE, diff --git a/yt_dlp/extractor/bandaichannel.py b/yt_dlp/extractor/bandaichannel.py deleted file mode 100644 index d7fcf44bd9..0000000000 --- a/yt_dlp/extractor/bandaichannel.py +++ /dev/null @@ -1,33 +0,0 @@ -from .brightcove import BrightcoveNewBaseIE -from ..utils import extract_attributes - - -class BandaiChannelIE(BrightcoveNewBaseIE): - IE_NAME = 'bandaichannel' - _VALID_URL = 
r'https?://(?:www\.)?b-ch\.com/titles/(?P<id>\d+/\d+)'
-    _TESTS = [{
-        'url': 'https://www.b-ch.com/titles/514/001',
-        'md5': 'a0f2d787baa5729bed71108257f613a4',
-        'info_dict': {
-            'id': '6128044564001',
-            'ext': 'mp4',
-            'title': 'メタルファイターMIKU 第1話',
-            'timestamp': 1580354056,
-            'uploader_id': '5797077852001',
-            'upload_date': '20200130',
-            'duration': 1387.733,
-        },
-        'params': {
-            'skip_download': True,
-        },
-    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-        attrs = extract_attributes(self._search_regex(
-            r'(<video-js[^>]+\bid="bcplayer"[^>]*>)', webpage, 'player'))
-        bc = self._download_json(
-            'https://pbifcd.b-ch.com/v1/playbackinfo/ST/70/' + attrs['data-info'],
-            video_id, headers={'X-API-KEY': attrs['data-auth'].strip()})['bc']
-        return self._parse_brightcove_metadata(bc, bc['id'])
diff --git a/yt_dlp/extractor/bellmedia.py b/yt_dlp/extractor/bellmedia.py
deleted file mode 100644
index ac45dd4779..0000000000
--- a/yt_dlp/extractor/bellmedia.py
+++ /dev/null
@@ -1,91 +0,0 @@
-from .common import InfoExtractor
-
-
-class BellMediaIE(InfoExtractor):
-    _VALID_URL = r'''(?x)https?://(?:www\.)?
-        (?P<domain>
-            (?:
-                ctv|
-                tsn|
-                bnn(?:bloomberg)?|
-                thecomedynetwork|
-                discovery|
-                discoveryvelocity|
-                sciencechannel|
-                investigationdiscovery|
-                animalplanet|
-                bravo|
-                mtv|
-                space|
-                etalk|
-                marilyn
-            )\.ca|
-            (?:much|cp24)\.com
-        )/.*?(?:\b(?:vid(?:eoid)?|clipId)=|-vid|~|%7E|/(?:episode)?)(?P<id>[0-9]{6,})'''
-    _TESTS = [{
-        'url': 'https://www.bnnbloomberg.ca/video/david-cockfield-s-top-picks~1403070',
-        'md5': '3e5b8e38370741d5089da79161646635',
-        'info_dict': {
-            'id': '1403070',
-            'ext': 'flv',
-            'title': 'David Cockfield\'s Top Picks',
-            'description': 'md5:810f7f8c6a83ad5b48677c3f8e5bb2c3',
-            'upload_date': '20180525',
-            'timestamp': 1527288600,
-            'season_id': '73997',
-            'season': '2018',
-            'thumbnail': 'http://images2.9c9media.com/image_asset/2018_5_25_baf30cbd-b28d-4a18-9903-4bb8713b00f5_PNG_956x536.jpg',
-            'tags': [],
-            'categories': ['ETFs'],
-            'season_number': 8,
-            'duration': 272.038,
-            'series': 'Market Call Tonight',
-        },
-    }, {
-        'url': 'http://www.thecomedynetwork.ca/video/player?vid=923582',
-        'only_matching': True,
-    }, {
-        'url': 'http://www.tsn.ca/video/expectations-high-for-milos-raonic-at-us-open~939549',
-        'only_matching': True,
-    }, {
-        'url': 'http://www.bnn.ca/video/berman-s-call-part-two-viewer-questions~939654',
-        'only_matching': True,
-    }, {
-        'url': 'http://www.ctv.ca/YourMorning/Video/S1E6-Monday-August-29-2016-vid938009',
-        'only_matching': True,
-    }, {
-        'url': 'http://www.much.com/shows/atmidnight/episode948007/tuesday-september-13-2016',
-        'only_matching': True,
-    }, {
-        'url': 'http://www.much.com/shows/the-almost-impossible-gameshow/928979/episode-6',
-        'only_matching': True,
-    }, {
-        'url': 'http://www.ctv.ca/DCs-Legends-of-Tomorrow/Video/S2E11-Turncoat-vid1051430',
-        'only_matching': True,
-    }, {
-        'url': 'http://www.etalk.ca/video?videoid=663455',
-        'only_matching': True,
-    }, {
-        'url': 'https://www.cp24.com/video?clipId=1982548',
-        'only_matching': True,
-    }]
-    _DOMAINS = {
-        'thecomedynetwork': 'comedy',
-        'discoveryvelocity': 'discvel',
-        'sciencechannel': 'discsci',
-        'investigationdiscovery': 'invdisc',
-        'animalplanet': 'aniplan',
-        'etalk': 'ctv',
-        'bnnbloomberg': 'bnn',
-        'marilyn': 'ctv_marilyn',
-    }
-
-    def _real_extract(self, url):
-        domain, video_id = self._match_valid_url(url).groups()
-        domain = domain.split('.')[0]
-        return {
-            '_type': 'url_transparent',
-            'id': video_id,
-            'url': f'9c9media:{self._DOMAINS.get(domain, domain)}_web:{video_id}',
-            'ie_key': 'NineCNineMedia',
-        }
diff --git a/yt_dlp/extractor/btvplus.py b/yt_dlp/extractor/btvplus.py
new file mode 100644
index 0000000000..531ace1471
--- /dev/null
+++ b/yt_dlp/extractor/btvplus.py
@@ -0,0 +1,73 @@
+from .common import InfoExtractor
+from ..utils import (
+    bug_reports_message,
+    clean_html,
+    get_element_by_class,
+    js_to_json,
+    mimetype2ext,
+    strip_or_none,
+    url_or_none,
+    urljoin,
+)
+from ..utils.traversal import traverse_obj
+
+
+class BTVPlusIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?btvplus\.bg/produkt/(?:predavaniya|seriali|novini)/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://btvplus.bg/produkt/predavaniya/67271/btv-reporterite/btv-reporterite-12-07-2025-g',
+        'info_dict': {
+            'ext': 'mp4',
+            'id': '67271',
+            'title': 'bTV Репортерите - 12.07.2025 г.',
+            'thumbnail': 'https://cdn.btv.bg/media/images/940x529/Jul2025/2113606319.jpg',
+        },
+    }, {
+        'url': 'https://btvplus.bg/produkt/seriali/66942/sezon-2/plen-sezon-2-epizod-55',
+        'info_dict': {
+            'ext': 'mp4',
+            'id': '66942',
+            'title': 'Плен - сезон 2, епизод 55',
+            'thumbnail': 'https://cdn.btv.bg/media/images/940x529/Jun2025/2113595104.jpg',
+        },
+    }, {
+        'url': 'https://btvplus.bg/produkt/novini/67270/btv-novinite-centralna-emisija-12-07-2025',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        player_url = self._search_regex(
+            r'var\s+videoUrl\s*=\s*[\'"]([^\'"]+)[\'"]',
+            webpage, 'player URL')
+
+        player_config = self._download_json(
+            urljoin('https://btvplus.bg', player_url), video_id)['config']
+
+        videojs_data = self._search_json(
+            r'videojs\(["\'][^"\']+["\'],', player_config, 'videojs data',
+            video_id, transform_source=js_to_json)
+        formats = []
+        subtitles = {}
+        for src in traverse_obj(videojs_data, ('sources', lambda _, v: url_or_none(v['src']))):
+            ext = mimetype2ext(src.get('type'))
+            if ext == 'm3u8':
+                fmts, subs = self._extract_m3u8_formats_and_subtitles(
+                    src['src'], video_id, 'mp4', m3u8_id='hls', fatal=False)
+                formats.extend(fmts)
+                self._merge_subtitles(subs, target=subtitles)
+            else:
+                self.report_warning(f'Unknown format type {ext}{bug_reports_message()}')
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'subtitles': subtitles,
+            'title': (
+                strip_or_none(self._og_search_title(webpage, default=None))
+                or clean_html(get_element_by_class('product-title', webpage))),
+            'thumbnail': self._og_search_thumbnail(webpage, default=None),
+            'description': self._og_search_description(webpage, default=None),
+        }
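Reviewer note: the `transform_source=js_to_json` step above exists because the bTV player config embeds a JS object literal, not JSON. A minimal standalone sketch of that conversion (the literal and URL are made up for illustration):

```python
import json

from yt_dlp.utils import js_to_json

# A JS object literal as it might appear in a player config
# (unquoted keys, single quotes) -- not valid JSON as-is
player_js = "{sources: [{src: 'https://example.invalid/stream.m3u8', type: 'application/x-mpegURL'}]}"

# js_to_json rewrites the literal into strict JSON so json.loads accepts it
data = json.loads(js_to_json(player_js))
print(data['sources'][0]['type'])  # application/x-mpegURL
```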
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index b75e806233..d601e17514 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1783,6 +1783,59 @@ def _search_nextjs_data(self, webpage, video_id, *, fatal=True, default=NO_DEFAU
             r'<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>', webpage, 'next.js data',
             video_id, end_pattern='</script>', fatal=fatal, default=default, **kw)
 
+    def _search_nextjs_v13_data(self, webpage, video_id, fatal=True):
+        """Parses Next.js app router flight data that was introduced in Next.js v13"""
+        nextjs_data = {}
+        if not fatal and not isinstance(webpage, str):
+            return nextjs_data
+
+        def flatten(flight_data):
+            if not isinstance(flight_data, list):
+                return
+            if len(flight_data) == 4 and flight_data[0] == '$':
+                _, name, _, data = flight_data
+                if not isinstance(data, dict):
+                    return
+                children = data.pop('children', None)
+                if data and isinstance(name, str) and re.fullmatch(r'\$L[0-9a-f]+', name):
+                    # It is useful hydration JSON data
+                    nextjs_data[name[2:]] = data
+                flatten(children)
+                return
+            for f in flight_data:
+                flatten(f)
+
+        flight_text = ''
+        # The pattern for the surrounding JS/tag should be strict as it's a hardcoded string in the next.js source
+        # Ref: https://github.com/vercel/next.js/blob/5a4a08fdc/packages/next/src/server/app-render/use-flight-response.tsx#L189
+        for flight_segment in re.findall(r'<script[^>]*>self\.__next_f\.push\((\[.+?\])\)</script>', webpage):
+            segment = self._parse_json(flight_segment, video_id, fatal=fatal, errnote=None if fatal else False)
+            # Some earlier versions of next.js "optimized" away this array structure; this is unsupported
+            # Ref: https://github.com/vercel/next.js/commit/0123a9d5c9a9a77a86f135b7ae30b46ca986d761
+            if not isinstance(segment, list) or len(segment) != 2:
+                self.write_debug(
+                    f'{video_id}: Unsupported next.js flight data structure detected', only_once=True)
+                continue
+            # Only use the relevant payload type (1 == data)
+            # Ref: https://github.com/vercel/next.js/blob/5a4a08fdc/packages/next/src/server/app-render/use-flight-response.tsx#L11-L14
+            payload_type, chunk = segment
+            if payload_type == 1:
+                flight_text += chunk
+
+        for f in flight_text.splitlines():
+            prefix, _, body = f.lstrip().partition(':')
+            if not re.fullmatch(r'[0-9a-f]+', prefix):
+                continue
+            # The body still isn't guaranteed to be valid JSON, so parsing should always be non-fatal
+            if body.startswith('[') and body.endswith(']'):
+                flatten(self._parse_json(body, video_id, fatal=False, errnote=False))
+            elif body.startswith('{') and body.endswith('}'):
+                data = self._parse_json(body, video_id, fatal=False, errnote=False)
+                if data is not None:
+                    nextjs_data[prefix] = data
+
+        return nextjs_data
+
     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
         rectx = re.escape(context_name)
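For context, this is roughly how an extractor is expected to consume the new helper. A minimal sketch: `ExampleNextJsIE`, its URL and the `('video', 'title')` path are hypothetical, not part of this PR.

```python
from yt_dlp.extractor.common import InfoExtractor
from yt_dlp.utils.traversal import get_first


class ExampleNextJsIE(InfoExtractor):  # hypothetical extractor, for illustration only
    _VALID_URL = r'https?://example\.invalid/watch/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        # Returns a dict keyed by flight-chunk id, e.g. {'16': {...}, '18': {...}}
        nextjs_data = self._search_nextjs_v13_data(webpage, video_id)
        # Scan every chunk for the first dict exposing the assumed 'video' key
        title = get_first(nextjs_data, ('video', 'title', {str}))
        return {'id': video_id, 'title': title}  # formats omitted for brevity
```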
diff --git a/yt_dlp/extractor/ctv.py b/yt_dlp/extractor/ctv.py
deleted file mode 100644
index a41dab11b1..0000000000
--- a/yt_dlp/extractor/ctv.py
+++ /dev/null
@@ -1,49 +0,0 @@
-from .common import InfoExtractor
-
-
-class CTVIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?ctv\.ca/(?P<id>(?:show|movie)s/[^/]+/[^/?#&]+)'
-    _TESTS = [{
-        'url': 'https://www.ctv.ca/shows/your-morning/wednesday-december-23-2020-s5e88',
-        'info_dict': {
-            'id': '2102249',
-            'ext': 'flv',
-            'title': 'Wednesday, December 23, 2020',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'description': 'Your Morning delivers original perspectives and unique insights into the headlines of the day.',
-            'timestamp': 1608732000,
-            'upload_date': '20201223',
-            'series': 'Your Morning',
-            'season': '2020-2021',
-            'season_number': 5,
-            'episode_number': 88,
-            'tags': ['Your Morning'],
-            'categories': ['Talk Show'],
-            'duration': 7467.126,
-        },
-    }, {
-        'url': 'https://www.ctv.ca/movies/adam-sandlers-eight-crazy-nights/adam-sandlers-eight-crazy-nights',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        display_id = self._match_id(url)
-        content = self._download_json(
-            'https://www.ctv.ca/space-graphql/graphql', display_id, query={
-                'query': '''{
-    resolvedPath(path: "/%s") {
-        lastSegment {
-            content {
-                ... on AxisContent {
-                    axisId
-                    videoPlayerDestCode
-                }
-            }
-        }
-    }
-}''' % display_id,  # noqa: UP031
-            })['data']['resolvedPath']['lastSegment']['content']
-        video_id = content['axisId']
-        return self.url_result(
-            '9c9media:{}:{}'.format(content['videoPlayerDestCode'], video_id),
-            'NineCNineMedia', video_id)
diff --git a/yt_dlp/extractor/francetv.py b/yt_dlp/extractor/francetv.py
index 5c9f8e36dd..54c2c53aca 100644
--- a/yt_dlp/extractor/francetv.py
+++ b/yt_dlp/extractor/francetv.py
@@ -1,4 +1,3 @@
-import json
 import re
 import urllib.parse
 
@@ -19,7 +18,11 @@
     unsmuggle_url,
     url_or_none,
 )
-from ..utils.traversal import find_element, traverse_obj
+from ..utils.traversal import (
+    find_element,
+    get_first,
+    traverse_obj,
+)
 
 
 class FranceTVBaseInfoExtractor(InfoExtractor):
@@ -121,9 +124,10 @@ def _extract_video(self, video_id, hostname=None):
             elif code := traverse_obj(dinfo, ('code', {int})):
                 if code == 2009:
                     self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
-                elif code in (2015, 2017):
+                elif code in (2015, 2017, 2019):
                     # 2015: L'accès à cette vidéo est impossible. (DRM-only)
                     # 2017: Cette vidéo n'est pas disponible depuis le site web mobile (b/c DRM)
+                    # 2019: L'accès à cette vidéo est incompatible avec votre configuration. (DRM-only)
                     drm_formats = True
                     continue
             self.report_warning(
@@ -258,7 +262,7 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor):
     _TESTS = [{
         'url': 'https://www.france.tv/france-2/13h15-le-dimanche/140921-les-mysteres-de-jesus.html',
         'info_dict': {
-            'id': 'ec217ecc-0733-48cf-ac06-af1347b849d1',  # old: c5bda21d-2c6f-4470-8849-3d8327adb2ba'
+            'id': 'b2cf9fd8-e971-4757-8651-848f2772df61',  # old: ec217ecc-0733-48cf-ac06-af1347b849d1
             'ext': 'mp4',
             'title': '13h15, le dimanche... - Les mystères de Jésus',
             'timestamp': 1502623500,
@@ -269,7 +273,7 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor):
         'params': {
             'skip_download': True,
         },
-        'add_ie': [FranceTVIE.ie_key()],
+        'skip': 'Unfortunately, this video is no longer available',
    }, {
        # geo-restricted
        'url': 'https://www.france.tv/enfants/six-huit-ans/foot2rue/saison-1/3066387-duel-au-vieux-port.html',
@@ -287,7 +291,7 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor):
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 1441,
        },
-        'skip': 'No longer available',
+        'skip': 'Unfortunately, this video is no longer available',
    }, {
        # geo-restricted livestream (workflow == 'token-akamai')
        'url': 'https://www.france.tv/france-4/direct.html',
@@ -308,6 +312,19 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor):
            'live_status': 'is_live',
        },
        'params': {'skip_download': 'livestream'},
+    }, {
+        # Not geo-restricted
+        'url': 'https://www.france.tv/france-2/la-maison-des-maternelles/5574051-nous-sommes-amis-et-nous-avons-fait-un-enfant-ensemble.html',
+        'info_dict': {
+            'id': 'b448bfe4-9fe7-11ee-97d8-2ba3426fa3df',
+            'ext': 'mp4',
+            'title': 'Nous sommes amis et nous avons fait un enfant ensemble - Émission du jeudi 21 décembre 2023',
+            'duration': 1065,
+            'thumbnail': r're:https?://.+/.+\.jpg',
+            'timestamp': 1703147921,
+            'upload_date': '20231221',
+        },
+        'params': {'skip_download': 'm3u8'},
    }, {
        # france3
        'url': 'https://www.france.tv/france-3/des-chiffres-et-des-lettres/139063-emission-du-mardi-9-mai-2017.html',
@@ -342,30 +359,16 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor):
         'only_matching': True,
     }]
 
-    # XXX: For parsing next.js v15+ data; see also yt_dlp.extractor.goplay
-    def _find_json(self, s):
-        return self._search_json(
-            r'\w+\s*:\s*', s, 'next js data', None, contains_pattern=r'\[(?s:.+)\]', default=None)
-
     def _real_extract(self, url):
         display_id = self._match_id(url)
 
         webpage = self._download_webpage(url, display_id)
+        nextjs_data = self._search_nextjs_v13_data(webpage, display_id)
 
-        nextjs_data = traverse_obj(
-            re.findall(r'<script[^>]*>\s*self\.__next_f\.push\(\s*(\[.+?\])\s*\);?\s*</script>', webpage),
-            (..., {json.loads}, ..., {self._find_json}, ..., 'children', ..., ..., 'children', ..., ..., 'children'))
-
-        if traverse_obj(nextjs_data, (..., ..., 'children', ..., 'isLive', {bool}, any)):
+        if get_first(nextjs_data, ('isLive', {bool})):
             # For livestreams we need the id of the stream instead of the currently airing episode id
-            video_id = traverse_obj(nextjs_data, (
-                ..., ..., 'children', ..., 'children', ..., 'children', ..., 'children', ..., ...,
-                'children', ..., ..., 'children', ..., ..., 'children', (..., (..., ...)),
-                'options', 'id', {str}, any))
+            video_id = get_first(nextjs_data, ('options', 'id', {str}))
         else:
-            video_id = traverse_obj(nextjs_data, (
-                ..., ..., ..., 'children',
-                lambda _, v: v['video']['url'] == urllib.parse.urlparse(url).path,
-                'video', ('playerReplayId', 'siId'), {str}, any))
+            video_id = get_first(nextjs_data, ('video', ('playerReplayId', 'siId'), {str}))
 
         if not video_id:
             raise ExtractorError('Unable to extract video ID')
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py
index 721d04e317..d9a666f991 100644
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -1481,30 +1481,6 @@ class GenericIE(InfoExtractor):
             },
             'add_ie': ['SenateISVP'],
         },
-        {
-            # Limelight embeds (1 channel embed + 4 media embeds)
-            'url': 'http://www.sedona.com/FacilitatorTraining2017',
-            'info_dict': {
-                'id': 'FacilitatorTraining2017',
-                'title': 'Facilitator Training 2017',
-            },
-            'playlist_mincount': 5,
-        },
-        {
-            # Limelight embed (LimelightPlayerUtil.embed)
-            'url': 'https://tv5.ca/videos?v=xuu8qowr291ri',
-            'info_dict': {
-                'id': '95d035dc5c8a401588e9c0e6bd1e9c92',
-                'ext': 'mp4',
-                'title': '07448641',
-                'timestamp': 1499890639,
-                'upload_date': '20170712',
-            },
-            'params': {
-                'skip_download': True,
-            },
-            'add_ie': ['LimelightMedia'],
-        },
         {
             'url': 'http://kron4.com/2017/04/28/standoff-with-walnut-creek-murder-suspect-ends-with-arrest/',
             'info_dict': {
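The FranceTV hunk above and the GoPlay/NineNow hunks below all lean on `get_first`, which replaces the deep `'children'` traversals of the old flight-data format. A toy demonstration (the data values are fabricated, shaped like `_search_nextjs_v13_data` output):

```python
from yt_dlp.utils.traversal import get_first

# chunk-id -> hydration dict, as returned by _search_nextjs_v13_data
nextjs_data = {
    '16': {'meta': {'uuid': '40cac41d'}},
    '19': {'isLive': True, 'options': {'id': 'stream-1'}},
}
# get_first(obj, path) is shorthand for traverse_obj(obj, (..., *path), get_all=False):
# it tries the path inside every top-level value and returns the first hit
assert get_first(nextjs_data, ('isLive', {bool})) is True
assert get_first(nextjs_data, ('options', 'id', {str})) == 'stream-1'
```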
diff --git a/yt_dlp/extractor/goplay.py b/yt_dlp/extractor/goplay.py
index c654c757c6..2e959cead2 100644
--- a/yt_dlp/extractor/goplay.py
+++ b/yt_dlp/extractor/goplay.py
@@ -5,16 +5,11 @@
 import hmac
 import json
 import os
-import re
 import urllib.parse
 
 from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-    int_or_none,
-    remove_end,
-    traverse_obj,
-)
+from ..utils import ExtractorError, int_or_none
+from ..utils.traversal import get_first, traverse_obj
 
 
 class GoPlayIE(InfoExtractor):
@@ -27,10 +22,10 @@ class GoPlayIE(InfoExtractor):
         'info_dict': {
             'id': '2baa4560-87a0-421b-bffc-359914e3c387',
             'ext': 'mp4',
-            'title': 'S22 - Aflevering 1',
+            'title': 'De Slimste Mens ter Wereld - S22 - Aflevering 1',
             'description': r're:In aflevering 1 nemen Daan Alferink, Tess Elst en Xander De Rycke .{66}',
             'series': 'De Slimste Mens ter Wereld',
-            'episode': 'Episode 1',
+            'episode': 'Wordt aangekondigd',
             'season_number': 22,
             'episode_number': 1,
             'season': 'Season 22',
@@ -52,7 +47,7 @@ class GoPlayIE(InfoExtractor):
         'info_dict': {
             'id': 'ecb79672-92b9-4cd9-a0d7-e2f0250681ee',
             'ext': 'mp4',
-            'title': 'S11 - Aflevering 1',
+            'title': 'De Mol - S11 - Aflevering 1',
             'description': r're:Tien kandidaten beginnen aan hun verovering van Amerika en ontmoeten .{102}',
             'episode': 'Episode 1',
             'series': 'De Mol',
@@ -75,21 +70,13 @@ def _real_initialize(self):
         if not self._id_token:
             raise self.raise_login_required(method='password')
 
-    # XXX: For parsing next.js v15+ data; see also yt_dlp.extractor.francetv
-    def _find_json(self, s):
-        return self._search_json(
-            r'\w+\s*:\s*', s, 'next js data', None, contains_pattern=r'\[(?s:.+)\]', default=None)
-
     def _real_extract(self, url):
         display_id = self._match_id(url)
         webpage = self._download_webpage(url, display_id)
 
-        nextjs_data = traverse_obj(
-            re.findall(r'<script[^>]*>\s*self\.__next_f\.push\(\s*(\[.+?\])\s*\);?\s*</script>', webpage),
-            (..., {json.loads}, ..., {self._find_json}, ...))
-        meta = traverse_obj(nextjs_data, (
-            ..., ..., 'children', ..., ..., 'children',
-            lambda _, v: v['video']['path'] == urllib.parse.urlparse(url).path, 'video', any))
+        nextjs_data = self._search_nextjs_v13_data(webpage, display_id)
+        meta = get_first(nextjs_data, (
+            lambda k, v: k in ('video', 'meta') and v['path'] == urllib.parse.urlparse(url).path))
 
         video_id = meta['uuid']
         info_dict = traverse_obj(meta, {
@@ -98,19 +85,18 @@ def _real_extract(self, url):
         })
 
         if traverse_obj(meta, ('program', 'subtype')) != 'movie':
-            for season_data in traverse_obj(nextjs_data, (..., 'children', ..., 'playlists', ...)):
-                episode_data = traverse_obj(
-                    season_data, ('videos', lambda _, v: v['videoId'] == video_id, any))
+            for season_data in traverse_obj(nextjs_data, (..., 'playlists', ..., {dict})):
+                episode_data = traverse_obj(season_data, ('videos', lambda _, v: v['videoId'] == video_id, any))
                 if not episode_data:
                     continue
 
-                episode_title = traverse_obj(
-                    episode_data, 'contextualTitle', 'episodeTitle', expected_type=str)
+                season_number = traverse_obj(season_data, ('season', {int_or_none}))
                 info_dict.update({
-                    'title': episode_title or info_dict.get('title'),
-                    'series': remove_end(info_dict.get('title'), f' - {episode_title}'),
-                    'season_number': traverse_obj(season_data, ('season', {int_or_none})),
+                    'episode': traverse_obj(episode_data, ('episodeTitle', {str})),
                     'episode_number': traverse_obj(episode_data, ('episodeNumber', {int_or_none})),
+                    'season_number': season_number,
+                    'series': self._search_regex(
+                        fr'^(.+)? 
- S{season_number} - ', info_dict.get('title'), 'series', default=None), }) break diff --git a/yt_dlp/extractor/hotstar.py b/yt_dlp/extractor/hotstar.py index 891bcc8731..b280fb53ab 100644 --- a/yt_dlp/extractor/hotstar.py +++ b/yt_dlp/extractor/hotstar.py @@ -12,8 +12,11 @@ ExtractorError, OnDemandPagedList, determine_ext, + filter_dict, int_or_none, join_nonempty, + jwt_decode_hs256, + parse_iso8601, str_or_none, url_or_none, ) @@ -21,35 +24,48 @@ class HotStarBaseIE(InfoExtractor): + _TOKEN_NAME = 'userUP' _BASE_URL = 'https://www.hotstar.com' _API_URL = 'https://api.hotstar.com' _API_URL_V2 = 'https://apix.hotstar.com/v2' _AKAMAI_ENCRYPTION_KEY = b'\x05\xfc\x1a\x01\xca\xc9\x4b\xc4\x12\xfc\x53\x12\x07\x75\xf9\xee' + _FREE_HEADERS = { + 'user-agent': 'Hotstar;in.startv.hotstar/25.06.30.0.11580 (Android/12)', + 'x-hs-client': 'platform:android;app_id:in.startv.hotstar;app_version:25.06.30.0;os:Android;os_version:12;schema_version:0.0.1523', + 'x-hs-platform': 'android', + } + _SUB_HEADERS = { + 'user-agent': 'Disney+;in.startv.hotstar.dplus.tv/23.08.14.4.2915 (Android/13)', + 'x-hs-client': 'platform:androidtv;app_id:in.startv.hotstar.dplus.tv;app_version:23.08.14.4;os:Android;os_version:13;schema_version:0.0.970', + 'x-hs-platform': 'androidtv', + } + + def _has_active_subscription(self, cookies, server_time): + expiry = traverse_obj(cookies, ( + self._TOKEN_NAME, 'value', {jwt_decode_hs256}, 'sub', {json.loads}, + 'subscriptions', 'in', ..., 'expiry', {parse_iso8601}, all, {max})) or 0 + return expiry > server_time + def _call_api_v1(self, path, *args, **kwargs): return self._download_json( f'{self._API_URL}/o/v1/{path}', *args, **kwargs, headers={'x-country-code': 'IN', 'x-platform-code': 'PCTV'}) - def _call_api_impl(self, path, video_id, query, st=None, cookies=None): - if not cookies or not cookies.get('userUP'): - self.raise_login_required() - + def _call_api_impl(self, path, video_id, query, cookies=None, st=None): st = int_or_none(st) or int(time.time()) exp = st + 6000 auth = f'st={st}~exp={exp}~acl=/*' auth += '~hmac=' + hmac.new(self._AKAMAI_ENCRYPTION_KEY, auth.encode(), hashlib.sha256).hexdigest() response = self._download_json( f'{self._API_URL_V2}/{path}', video_id, query=query, - headers={ - 'user-agent': 'Disney+;in.startv.hotstar.dplus.tv/23.08.14.4.2915 (Android/13)', + headers=filter_dict({ + **(self._SUB_HEADERS if self._has_active_subscription(cookies, st) else self._FREE_HEADERS), 'hotstarauth': auth, - 'x-hs-usertoken': cookies['userUP'].value, + 'x-hs-usertoken': traverse_obj(cookies, (self._TOKEN_NAME, 'value')), 'x-hs-device-id': traverse_obj(cookies, ('deviceId', 'value')) or str(uuid.uuid4()), - 'x-hs-client': 'platform:androidtv;app_id:in.startv.hotstar.dplus.tv;app_version:23.08.14.4;os:Android;os_version:13;schema_version:0.0.970', - 'x-hs-platform': 'androidtv', 'content-type': 'application/json', - }) + })) if not traverse_obj(response, ('success', {dict})): raise ExtractorError('API call was unsuccessful') @@ -61,21 +77,22 @@ def _call_api_v2(self, path, video_id, content_type, cookies=None, st=None): 'filters': f'content_type={content_type}', 'client_capabilities': json.dumps({ 'package': ['dash', 'hls'], - 'container': ['fmp4br', 'fmp4'], + 'container': ['fmp4', 'fmp4br', 'ts'], 'ads': ['non_ssai', 'ssai'], - 'audio_channel': ['atmos', 'dolby51', 'stereo'], + 'audio_channel': ['stereo', 'dolby51', 'atmos'], 'encryption': ['plain', 'widevine'], # wv only so we can raise appropriate error - 'video_codec': ['h265', 'h264'], - 'ladder': ['tv', 'full'], - 
'resolution': ['4k', 'hd'],
-                'true_resolution': ['4k', 'hd'],
-                'dynamic_range': ['hdr', 'sdr'],
+                'video_codec': ['h264', 'h265'],
+                'video_codec_non_secure': ['h264', 'h265', 'vp9'],
+                'ladder': ['phone', 'tv', 'full'],
+                'resolution': ['hd', '4k'],
+                'true_resolution': ['hd', '4k'],
+                'dynamic_range': ['sdr', 'hdr'],
             }, separators=(',', ':')),
             'drm_parameters': json.dumps({
                 'widevine_security_level': ['SW_SECURE_DECODE', 'SW_SECURE_CRYPTO'],
                 'hdcp_version': ['HDCP_V2_2', 'HDCP_V2_1', 'HDCP_V2', 'HDCP_V1'],
             }, separators=(',', ':')),
-        }, st=st, cookies=cookies)
+        }, cookies=cookies, st=st)
 
     @staticmethod
     def _parse_metadata_v1(video_data):
@@ -274,6 +291,8 @@ def _real_extract(self, url):
         video_id, video_type = self._match_valid_url(url).group('id', 'type')
         video_type = self._TYPE[video_type]
         cookies = self._get_cookies(url)  # Cookies before any request
+        if not cookies or not cookies.get(self._TOKEN_NAME):
+            self.raise_login_required()
 
         video_data = traverse_obj(
             self._call_api_v1(f'{video_type}/detail', video_id, fatal=False, query={
@@ -292,7 +311,7 @@ def _real_extract(self, url):
         # See https://github.com/yt-dlp/yt-dlp/issues/396
         st = self._request_webpage(
             f'{self._BASE_URL}/in', video_id, 'Fetching server time').get_header('x-origin-date')
-        watch = self._call_api_v2('pages/watch', video_id, content_type, cookies=cookies, st=st)
+        watch = self._call_api_v2('pages/watch', video_id, content_type, cookies, st)
         player_config = traverse_obj(watch, (
             'page', 'spaces', 'player', 'widget_wrappers', lambda _, v: v['template'] == 'PlayerWidget',
             'widget', 'data', 'player_config', {dict}, any, {require('player config')}))
@@ -364,10 +383,13 @@ def _real_extract(self, url):
             formats.extend(current_formats)
             subs = self._merge_subtitles(subs, current_subs)
 
-        if not formats and geo_restricted:
-            self.raise_geo_restricted(countries=['IN'], metadata_available=True)
-        elif not formats and has_drm:
-            self.report_drm(video_id)
+        if not formats:
+            if geo_restricted:
+                self.raise_geo_restricted(countries=['IN'], metadata_available=True)
+            elif has_drm:
+                self.report_drm(video_id)
+            elif not self._has_active_subscription(cookies, st):
+                self.raise_no_formats('Your account does not have access to this content', expected=True)
         self._remove_duplicate_formats(formats)
         for f in formats:
             f.setdefault('http_headers', {}).update(headers)
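The new `_has_active_subscription` helper above decodes the `userUP` cookie as a JWT whose `sub` claim is itself a JSON string carrying subscription expiries. A standalone sketch of that traversal; the payload shape is inferred from the traversal path in the diff, and the values are fabricated:

```python
import json

from yt_dlp.utils import parse_iso8601, traverse_obj

# Illustrative payload only: mirrors the already-decoded JWT body
decoded_jwt = {'sub': json.dumps({
    'subscriptions': {'in': [{'expiry': '2026-01-01T00:00:00.000Z'}]},
})}

# Same chain as the extractor: parse 'sub', collect every expiry, keep the latest
expiry = traverse_obj(decoded_jwt, (
    'sub', {json.loads}, 'subscriptions', 'in', ..., 'expiry',
    {parse_iso8601}, all, {max})) or 0
print(expiry)               # 1767225600 (epoch seconds)
print(expiry > 1750000000)  # True -> treated as an active subscription
```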
diff --git a/yt_dlp/extractor/joqrag.py b/yt_dlp/extractor/joqrag.py
deleted file mode 100644
index 7a91d4a235..0000000000
--- a/yt_dlp/extractor/joqrag.py
+++ /dev/null
@@ -1,112 +0,0 @@
-import datetime as dt
-import urllib.parse
-
-from .common import InfoExtractor
-from ..utils import (
-    clean_html,
-    datetime_from_str,
-    unified_timestamp,
-    urljoin,
-)
-
-
-class JoqrAgIE(InfoExtractor):
-    IE_DESC = '超!A&G+ 文化放送 (f.k.a. AGQR) Nippon Cultural Broadcasting, Inc. (JOQR)'
-    _VALID_URL = [r'https?://www\.uniqueradio\.jp/agplayer5/(?:player|inc-player-hls)\.php',
-                  r'https?://(?:www\.)?joqr\.co\.jp/ag/',
-                  r'https?://(?:www\.)?joqr\.co\.jp/qr/ag(?:daily|regular)program/?(?:$|[#?])']
-    _TESTS = [{
-        'url': 'https://www.uniqueradio.jp/agplayer5/player.php',
-        'info_dict': {
-            'id': 'live',
-            'title': str,
-            'channel': '超!A&G+',
-            'description': str,
-            'live_status': 'is_live',
-            'release_timestamp': int,
-        },
-        'params': {
-            'skip_download': True,
-            'ignore_no_formats_error': True,
-        },
-    }, {
-        'url': 'https://www.uniqueradio.jp/agplayer5/inc-player-hls.php',
-        'only_matching': True,
-    }, {
-        'url': 'https://www.joqr.co.jp/ag/article/103760/',
-        'only_matching': True,
-    }, {
-        'url': 'http://www.joqr.co.jp/qr/agdailyprogram/',
-        'only_matching': True,
-    }, {
-        'url': 'http://www.joqr.co.jp/qr/agregularprogram/',
-        'only_matching': True,
-    }]
-
-    def _extract_metadata(self, variable, html):
-        return clean_html(urllib.parse.unquote_plus(self._search_regex(
-            rf'var\s+{variable}\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
-            html, 'metadata', group='value', default=''))) or None
-
-    def _extract_start_timestamp(self, video_id, is_live):
-        def extract_start_time_from(date_str):
-            dt_ = datetime_from_str(date_str) + dt.timedelta(hours=9)
-            date = dt_.strftime('%Y%m%d')
-            start_time = self._search_regex(
-                r'<h3[^>]+\bclass="dailyProgram-itemHeaderTime"[^>]*>[\s\d:]+–\s*(\d{1,2}:\d{1,2})',
-                self._download_webpage(
-                    f'https://www.joqr.co.jp/qr/agdailyprogram/?date={date}', video_id,
-                    note=f'Downloading program list of {date}', fatal=False,
-                    errnote=f'Failed to download program list of {date}') or '',
-                'start time', default=None)
-            if start_time:
-                return unified_timestamp(f'{dt_.strftime("%Y/%m/%d")} {start_time} +09:00')
-            return None
-
-        start_timestamp = extract_start_time_from('today')
-        if not start_timestamp:
-            return None
-
-        if not is_live or start_timestamp < datetime_from_str('now').timestamp():
-            return start_timestamp
-        else:
-            return extract_start_time_from('yesterday')
-
-    def _real_extract(self, url):
-        video_id = 'live'
-
-        metadata = self._download_webpage(
-            'https://www.uniqueradio.jp/aandg', video_id,
-            note='Downloading metadata', errnote='Failed to download metadata')
-        title = self._extract_metadata('Program_name', metadata)
-
-        if not title or title == '放送休止':
-            formats = []
-            live_status = 'is_upcoming'
-            release_timestamp = self._extract_start_timestamp(video_id, False)
-            msg = 'This stream is not currently live'
-            if release_timestamp:
-                msg += (' and will start at '
-                        + dt.datetime.fromtimestamp(release_timestamp).strftime('%Y-%m-%d %H:%M:%S'))
-            self.raise_no_formats(msg, expected=True)
-        else:
-            m3u8_path = self._search_regex(
-                r'<source[^>]*\bsrc="([^"]+)"',
-                self._download_webpage(
-                    'https://www.uniqueradio.jp/agplayer5/inc-player-hls.php', video_id,
-                    note='Downloading player data', errnote='Failed to download player data'),
-                'm3u8 url')
-            formats = self._extract_m3u8_formats(
-                urljoin('https://www.uniqueradio.jp/', m3u8_path), video_id)
-            live_status = 'is_live'
-            release_timestamp = self._extract_start_timestamp(video_id, True)
-
-        return {
-            'id': video_id,
-            'title': title,
-            'channel': '超!A&G+',
-            'description': self._extract_metadata('Program_text', metadata),
-            'formats': formats,
-            'live_status': live_status,
-            'release_timestamp': release_timestamp,
-        }
diff --git a/yt_dlp/extractor/limelight.py b/yt_dlp/extractor/limelight.py
deleted file mode 100644
index 763a01448c..0000000000
--- a/yt_dlp/extractor/limelight.py
+++ /dev/null
@@ -1,358 +0,0 @@
-import re
-
-from .common import InfoExtractor
-from ..networking.exceptions import HTTPError
-from ..utils import (
-    ExtractorError,
-    determine_ext,
-    float_or_none,
-    int_or_none,
-    smuggle_url,
-    try_get,
-    unsmuggle_url,
-)
-
-
-class LimelightBaseIE(InfoExtractor):
-    _PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/%s/%s/%s'
-
-    @classmethod
-    def _extract_embed_urls(cls, url, webpage):
-        lm = {
-            'Media': 'media',
-            'Channel': 'channel',
-            'ChannelList': 'channel_list',
-        }
-
-        def smuggle(url):
-            return smuggle_url(url, {'source_url': url})
-
-        entries = []
-        for kind, video_id in re.findall(
-                r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})',
-                webpage):
-            entries.append(cls.url_result(
-                smuggle(f'limelight:{lm[kind]}:{video_id}'),
-                f'Limelight{kind}', video_id))
-        for mobj in re.finditer(
-                # As per [1] class attribute should be exactly equal to
-                # LimelightEmbeddedPlayerFlash but numerous examples seen
-                # that don't exactly match it (e.g. [2]).
-                # 1. http://support.3playmedia.com/hc/en-us/articles/227732408-Limelight-Embedding-the-Captions-Plugin-with-the-Limelight-Player-on-Your-Webpage
-                # 2. http://www.sedona.com/FacilitatorTraining2017
-                r'''(?sx)
-                    <object[^>]+class=(["\'])(?:(?!\1).)*\bLimelightEmbeddedPlayerFlash\b(?:(?!\1).)*\1[^>]*>.*?
-                        <param[^>]+
-                            name=(["\'])flashVars\2[^>]+
-                            value=(["\'])(?:(?!\3).)*(?P<kind>media|channel(?:List)?)Id=(?P<id>[a-z0-9]{32})
-                ''', webpage):
-            kind, video_id = mobj.group('kind'), mobj.group('id')
-            entries.append(cls.url_result(
-                smuggle(f'limelight:{kind}:{video_id}'),
-                f'Limelight{kind.capitalize()}', video_id))
-        # http://support.3playmedia.com/hc/en-us/articles/115009517327-Limelight-Embedding-the-Audio-Description-Plugin-with-the-Limelight-Player-on-Your-Web-Page)
-        for video_id in re.findall(
-                r'(?s)LimelightPlayerUtil\.embed\s*\(\s*{.*?\bmediaId["\']\s*:\s*["\'](?P<id>[a-z0-9]{32})',
-                webpage):
-            entries.append(cls.url_result(
-                smuggle(f'limelight:media:{video_id}'),
-                LimelightMediaIE.ie_key(), video_id))
-        return entries
-
-    def _call_playlist_service(self, item_id, method, fatal=True, referer=None):
-        headers = {}
-        if referer:
-            headers['Referer'] = referer
-        try:
-            return self._download_json(
-                self._PLAYLIST_SERVICE_URL % (self._PLAYLIST_SERVICE_PATH, item_id, method),
-                item_id, f'Downloading PlaylistService {method} JSON',
-                fatal=fatal, headers=headers)
-        except ExtractorError as e:
-            if isinstance(e.cause, HTTPError) and e.cause.status == 403:
-                error = self._parse_json(e.cause.response.read().decode(), item_id)['detail']['contentAccessPermission']
-                if error == 'CountryDisabled':
-                    self.raise_geo_restricted()
-                raise ExtractorError(error, expected=True)
-            raise
-
-    def _extract(self, item_id, pc_method, mobile_method, referer=None):
-        pc = self._call_playlist_service(item_id, pc_method, referer=referer)
-        mobile = self._call_playlist_service(
-            item_id, mobile_method, fatal=False, referer=referer)
-        return pc, mobile
-
-    def _extract_info(self, pc, mobile, i, referer):
-        get_item = lambda x, y: try_get(x, lambda x: x[y][i], dict) or {}
-        pc_item = get_item(pc, 'playlistItems')
-        mobile_item = get_item(mobile, 'mediaList')
-        video_id = pc_item.get('mediaId') or mobile_item['mediaId']
-        title = pc_item.get('title') or mobile_item['title']
-
-        formats = []
-        urls = []
-        for stream in pc_item.get('streams', []):
-            stream_url = stream.get('url')
-            if not stream_url or stream_url in urls:
-                continue
-            if not self.get_param('allow_unplayable_formats') and stream.get('drmProtected'):
-                continue
-            urls.append(stream_url)
-            ext = determine_ext(stream_url)
-            if ext == 'f4m':
-                formats.extend(self._extract_f4m_formats(
-                    stream_url, video_id, f4m_id='hds', fatal=False))
-            else:
-                fmt = {
-                    'url': stream_url,
-                    'abr': float_or_none(stream.get('audioBitRate')),
-                    'fps': float_or_none(stream.get('videoFrameRate')),
-                    'ext': ext,
-                }
-                width = int_or_none(stream.get('videoWidthInPixels'))
-                height = int_or_none(stream.get('videoHeightInPixels'))
-                vbr = float_or_none(stream.get('videoBitRate'))
-                if width or height or vbr:
-                    fmt.update({
-                        'width': width,
-                        'height': height,
-                        'vbr': vbr,
-                    })
-                else:
-                    fmt['vcodec'] = 'none'
-                rtmp = re.search(r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+))/(?P<playpath>mp[34]:.+)$', stream_url)
-                if rtmp:
-                    format_id = 'rtmp'
-                    if stream.get('videoBitRate'):
-                        format_id += '-%d' % int_or_none(stream['videoBitRate'])
-                    http_format_id = format_id.replace('rtmp', 'http')
-
-                    CDN_HOSTS = (
-                        ('delvenetworks.com', 'cpl.delvenetworks.com'),
-                        ('video.llnw.net', 's2.content.video.llnw.net'),
-                    )
-                    for cdn_host, http_host in CDN_HOSTS:
-                        if cdn_host not in rtmp.group('host').lower():
-                            continue
-                        http_url = 'http://{}/{}'.format(http_host, rtmp.group('playpath')[4:])
-                        urls.append(http_url)
-                        if self._is_valid_url(http_url, video_id, http_format_id):
-                            http_fmt = fmt.copy()
-                            http_fmt.update({
-                                'url': http_url,
-                                'format_id': http_format_id,
-                            })
-                            formats.append(http_fmt)
-                            break
-
-                    fmt.update({
-                        'url': rtmp.group('url'),
-                        'play_path': rtmp.group('playpath'),
-                        'app': rtmp.group('app'),
-                        'ext': 'flv',
-                        'format_id': format_id,
-                    })
-                formats.append(fmt)
-
-        for mobile_url in mobile_item.get('mobileUrls', []):
-            media_url = mobile_url.get('mobileUrl')
-            format_id = mobile_url.get('targetMediaPlatform')
-            if not media_url or media_url in urls:
-                continue
-            if (format_id in ('Widevine', 'SmoothStreaming')
-                    and not self.get_param('allow_unplayable_formats', False)):
-                continue
-            urls.append(media_url)
-            ext = determine_ext(media_url)
-            if ext == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(
-                    media_url, video_id, 'mp4', 'm3u8_native',
-                    m3u8_id=format_id, fatal=False))
-            elif ext == 'f4m':
-                formats.extend(self._extract_f4m_formats(
-                    stream_url, video_id, f4m_id=format_id, fatal=False))
-            else:
-                formats.append({
-                    'url': media_url,
-                    'format_id': format_id,
-                    'quality': -10,
-                    'ext': ext,
-                })
-
-        subtitles = {}
-        for flag in mobile_item.get('flags'):
-            if flag == 'ClosedCaptions':
-                closed_captions = self._call_playlist_service(
-                    video_id, 'getClosedCaptionsDetailsByMediaId',
-                    False, referer) or []
-                for cc in closed_captions:
-                    cc_url = cc.get('webvttFileUrl')
-                    if not cc_url:
-                        continue
-                    lang = cc.get('languageCode') or self._search_regex(r'/([a-z]{2})\.vtt', cc_url, 'lang', default='en')
-                    subtitles.setdefault(lang, []).append({
-                        'url': cc_url,
-                    })
-                break
-
-        get_meta = lambda x: pc_item.get(x) or mobile_item.get(x)
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': get_meta('description'),
-            'formats': formats,
-            'duration': float_or_none(get_meta('durationInMilliseconds'), 1000),
-            'thumbnail': get_meta('previewImageUrl') or get_meta('thumbnailImageUrl'),
-            'subtitles': subtitles,
-        }
-
-
-class LimelightMediaIE(LimelightBaseIE):
-    IE_NAME = 'limelight'
-    _VALID_URL = r'''(?x)
-        (?:
-            limelight:media:|
-            https?://
-                (?:
-                    link\.videoplatform\.limelight\.com/media/|
-                    assets\.delvenetworks\.com/player/loader\.swf
-                )
-                \?.*?\bmediaId=
-        )
-        (?P<id>[a-z0-9]{32})
-    '''
-    _TESTS = [{
-        'url': 'http://link.videoplatform.limelight.com/media/?mediaId=3ffd040b522b4485b6d84effc750cd86',
-        'info_dict': {
-            'id': '3ffd040b522b4485b6d84effc750cd86',
-            'ext': 'mp4',
-            'title': 'HaP and the HB Prince Trailer',
-            'description': 'md5:8005b944181778e313d95c1237ddb640',
-            'thumbnail': r're:^https?://.*\.jpeg$',
-            'duration': 144.23,
-        },
-        'params': {
-            # m3u8 download
-            'skip_download': True,
-        },
-    }, {
-        # video with subtitles
-        'url': 'limelight:media:a3e00274d4564ec4a9b29b9466432335',
-        'md5': '2fa3bad9ac321e23860ca23bc2c69e3d',
-        'info_dict': {
-            'id': 'a3e00274d4564ec4a9b29b9466432335',
-            'ext': 'mp4',
-            'title': '3Play Media Overview Video',
-            'thumbnail': r're:^https?://.*\.jpeg$',
-            'duration': 78.101,
-            # TODO: extract all languages that were accessible via API
-            # 'subtitles': 'mincount:9',
-            'subtitles': 'mincount:1',
-        },
-    }, {
-        'url': 'https://assets.delvenetworks.com/player/loader.swf?mediaId=8018a574f08d416e95ceaccae4ba0452',
-        'only_matching': True,
-    }]
-    _PLAYLIST_SERVICE_PATH = 'media'
-
-    def _real_extract(self, url):
-        url, smuggled_data = unsmuggle_url(url, {})
-        video_id = self._match_id(url)
-        source_url = smuggled_data.get('source_url')
-        self._initialize_geo_bypass({
-            'countries': smuggled_data.get('geo_countries'),
-        })
-
-        pc, mobile = self._extract(
-            video_id, 'getPlaylistByMediaId',
-            'getMobilePlaylistByMediaId', source_url)
-
-        return self._extract_info(pc, mobile, 0, source_url)
-
-
-class LimelightChannelIE(LimelightBaseIE):
-    IE_NAME = 'limelight:channel'
-    _VALID_URL = r'''(?x)
-        (?:
-            limelight:channel:|
-            https?://
-                (?:
-                    link\.videoplatform\.limelight\.com/media/|
-                    assets\.delvenetworks\.com/player/loader\.swf
-                )
-                \?.*?\bchannelId=
-        )
-        (?P<id>[a-z0-9]{32})
-    '''
-    _TESTS = [{
-        'url': 'http://link.videoplatform.limelight.com/media/?channelId=ab6a524c379342f9b23642917020c082',
-        'info_dict': {
-            'id': 'ab6a524c379342f9b23642917020c082',
-            'title': 'Javascript Sample Code',
-            'description': 'Javascript Sample Code - http://www.delvenetworks.com/sample-code/playerCode-demo.html',
-        },
-        'playlist_mincount': 3,
-    }, {
-        'url': 'http://assets.delvenetworks.com/player/loader.swf?channelId=ab6a524c379342f9b23642917020c082',
-        'only_matching': True,
-    }]
-    _PLAYLIST_SERVICE_PATH = 'channel'
-
-    def _real_extract(self, url):
-        url, smuggled_data = unsmuggle_url(url, {})
-        channel_id = self._match_id(url)
-        source_url = smuggled_data.get('source_url')
-
-        pc, mobile = self._extract(
-            channel_id, 'getPlaylistByChannelId',
-            'getMobilePlaylistWithNItemsByChannelId?begin=0&count=-1',
-            source_url)
-
-        entries = [
-            self._extract_info(pc, mobile, i, source_url)
-            for i in range(len(pc['playlistItems']))]
-
-        return self.playlist_result(
-            entries, channel_id, pc.get('title'), mobile.get('description'))
-
-
-class LimelightChannelListIE(LimelightBaseIE):
-    IE_NAME = 'limelight:channel_list'
-    _VALID_URL = r'''(?x)
-        (?:
-            limelight:channel_list:|
-            https?://
-                (?:
-                    link\.videoplatform\.limelight\.com/media/|
-                    assets\.delvenetworks\.com/player/loader\.swf
-                )
-                \?.*?\bchannelListId=
-        )
-        (?P<id>[a-z0-9]{32})
-    '''
-    _TESTS = [{
-        'url': 'http://link.videoplatform.limelight.com/media/?channelListId=301b117890c4465c8179ede21fd92e2b',
-        'info_dict': {
-            'id': '301b117890c4465c8179ede21fd92e2b',
-            'title': 'Website - Hero Player',
-        },
-        'playlist_mincount': 2,
-    }, {
-        'url': 'https://assets.delvenetworks.com/player/loader.swf?channelListId=301b117890c4465c8179ede21fd92e2b',
-        'only_matching': True,
-    }]
-    _PLAYLIST_SERVICE_PATH = 'channel_list'
-
-    def _real_extract(self, url):
-        channel_list_id = self._match_id(url)
-
-        channel_list = self._call_playlist_service(
-            channel_list_id, 'getMobileChannelListById')
-
-        entries = [
-            self.url_result('limelight:channel:{}'.format(channel['id']), 'LimelightChannel')
-            for channel in channel_list['channelList']]
-
-        return self.playlist_result(
-            entries, channel_list_id, channel_list['title'])
diff --git a/yt_dlp/extractor/lrt.py b/yt_dlp/extractor/lrt.py
index caff9125e0..34c9ece2d1 100644
--- a/yt_dlp/extractor/lrt.py
+++ b/yt_dlp/extractor/lrt.py
@@ -134,7 +134,7 @@ class LRTRadioIE(LRTBaseIE):
     def _real_extract(self, url):
         video_id, path = self._match_valid_url(url).group('id', 'path')
         media = self._download_json(
-            'https://www.lrt.lt/radioteka/api/media', video_id,
+            'https://www.lrt.lt/rest-api/media', video_id,
             query={'url': f'/mediateka/irasas/{video_id}/{path}'})
 
         return {
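The new Mixlr extractor below probes its progressive stream with a HEAD request and sniffs the container from the response headers before committing to a format. A condensed sketch of that pattern under stated assumptions: `StreamProbeIE` and `_probe_stream_ext` are hypothetical names, not part of this PR.

```python
from yt_dlp.extractor.common import InfoExtractor
from yt_dlp.networking import HEADRequest
from yt_dlp.utils import urlhandle_detect_ext


class StreamProbeIE(InfoExtractor):  # illustrative only
    def _probe_stream_ext(self, format_url, video_id):
        # HEAD request: learn the container type without downloading the body
        urlh = self._request_webpage(
            HEADRequest(format_url), video_id, fatal=False, note='Checking stream')
        if not urlh or urlh.status != 200:
            return None
        ext = urlhandle_detect_ext(urlh)
        # Servers replying application/octet-stream give no usable hint,
        # so fall back to a sensible default for audio-only streams
        return 'mp3' if ext == 'octet-stream' else ext
```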
diff --git a/yt_dlp/extractor/mixlr.py b/yt_dlp/extractor/mixlr.py
new file mode 100644
index 0000000000..53f3ffe6f8
--- /dev/null
+++ b/yt_dlp/extractor/mixlr.py
@@ -0,0 +1,134 @@
+from .common import InfoExtractor
+from ..networking import HEADRequest
+from ..utils import int_or_none, parse_iso8601, url_or_none, urlhandle_detect_ext
+from ..utils.traversal import traverse_obj
+
+
+class MixlrIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?(?P<username>[\w-]+)\.mixlr\.com/events/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://suncity-104-9fm.mixlr.com/events/4387115',
+        'info_dict': {
+            'id': '4387115',
+            'ext': 'mp3',
+            'title': r're:SUNCITY 104.9FM\'s live audio \d{4}-\d{2}-\d{2} \d{2}:\d{2}',
+            'uploader': 'suncity-104-9fm',
+            'like_count': int,
+            'thumbnail': r're:https://imagecdn\.mixlr\.com/cdn-cgi/image/[^/?#]+/cd5b34d05fa2cee72d80477724a2f02e.png',
+            'timestamp': 1751943773,
+            'upload_date': '20250708',
+            'release_timestamp': 1751943764,
+            'release_date': '20250708',
+            'live_status': 'is_live',
+        },
+    }, {
+        'url': 'https://brcountdown.mixlr.com/events/4395480',
+        'info_dict': {
+            'id': '4395480',
+            'ext': 'aac',
+            'title': r're:Beats Revolution Countdown Episodio 461 \d{4}-\d{2}-\d{2} \d{2}:\d{2}',
+            'description': 'md5:5cacd089723f7add3f266bd588315bb3',
+            'uploader': 'brcountdown',
+            'like_count': int,
+            'thumbnail': r're:https://imagecdn\.mixlr\.com/cdn-cgi/image/[^/?#]+/c48727a59f690b87a55d47d123ba0d6d.jpg',
+            'timestamp': 1752354007,
+            'upload_date': '20250712',
+            'release_timestamp': 1752354000,
+            'release_date': '20250712',
+            'live_status': 'is_live',
+        },
+    }, {
+        'url': 'https://www.brcountdown.mixlr.com/events/4395480',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        username, event_id = self._match_valid_url(url).group('username', 'id')
+
+        broadcast_info = self._download_json(
+            f'https://api.mixlr.com/v3/channels/{username}/events/{event_id}', event_id)
+
+        formats = []
+        format_url = traverse_obj(
+            broadcast_info, ('included', 0, 'attributes', 'progressive_stream_url', {url_or_none}))
+        if format_url:
+            urlh = self._request_webpage(
+                HEADRequest(format_url), event_id, fatal=False, note='Checking stream')
+            if urlh and urlh.status == 200:
+                ext = urlhandle_detect_ext(urlh)
+                if ext == 'octet-stream':
+                    self.report_warning(
+                        'The server did not return a valid file extension for the stream URL. '
+                        'Assuming an mp3 stream; postprocessing may fail if this is incorrect')
+                    ext = 'mp3'
+                formats.append({
+                    'url': format_url,
+                    'ext': ext,
+                    'vcodec': 'none',
+                })
+
+        release_timestamp = traverse_obj(
+            broadcast_info, ('data', 'attributes', 'starts_at', {str}))
+        if not formats and release_timestamp:
+            self.raise_no_formats(f'This event will start at {release_timestamp}', expected=True)
+
+        return {
+            'id': event_id,
+            'uploader': username,
+            'formats': formats,
+            'release_timestamp': parse_iso8601(release_timestamp),
+            **traverse_obj(broadcast_info, ('included', 0, 'attributes', {
+                'title': ('title', {str}),
+                'timestamp': ('started_at', {parse_iso8601}),
+                'concurrent_view_count': ('concurrent_view_count', {int_or_none}),
+                'like_count': ('heart_count', {int_or_none}),
+                'is_live': ('live', {bool}),
+            })),
+            **traverse_obj(broadcast_info, ('data', 'attributes', {
+                'title': ('title', {str}),
+                'description': ('description', {str}),
+                'timestamp': ('started_at', {parse_iso8601}),
+                'concurrent_view_count': ('concurrent_view_count', {int_or_none}),
+                'like_count': ('heart_count', {int_or_none}),
+                'thumbnail': ('artwork_url', {url_or_none}),
+                'uploader_id': ('broadcaster_id', {str}),
+            })),
+        }
+
+
+class MixlrRecoringIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?(?P<username>[\w-]+)\.mixlr\.com/recordings/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://biblewayng.mixlr.com/recordings/2375193',
+        'info_dict': {
+            'id': '2375193',
+            'ext': 'mp3',
+            'title': "God's Jewels and Their Resting Place Bro. Adeniji",
+            'description': 'Preached February 21, 2024 in the evening',
+            'uploader_id': '8659190',
+            'duration': 10968,
+            'thumbnail': r're:https://imagecdn\.mixlr\.com/cdn-cgi/image/[^/?#]+/ceca120ef707f642abeea6e29cd74238.jpg',
+            'timestamp': 1708544542,
+            'upload_date': '20240221',
+        },
+    }]
+
+    def _real_extract(self, url):
+        username, recording_id = self._match_valid_url(url).group('username', 'id')
+
+        recording_info = self._download_json(
+            f'https://api.mixlr.com/v3/channels/{username}/recordings/{recording_id}', recording_id)
+
+        return {
+            'id': recording_id,
+            **traverse_obj(recording_info, ('data', 'attributes', {
+                'ext': ('file_format', {str}),
+                'url': ('url', {url_or_none}),
+                'title': ('title', {str}),
+                'description': ('description', {str}),
+                'timestamp': ('created_at', {parse_iso8601}),
+                'duration': ('duration', {int_or_none}),
+                'thumbnail': ('artwork_url', {url_or_none}),
+                'uploader_id': ('user_id', {str}),
+            })),
+        }
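Both new Mixlr classes map API payloads onto info-dict fields with `traverse_obj` dict templates. A toy demonstration of that pattern with a fabricated payload (field names chosen for illustration):

```python
from yt_dlp.utils import int_or_none, parse_iso8601, traverse_obj

api_response = {'data': {'attributes': {  # fabricated payload for illustration
    'title': 'Morning Show',
    'duration': '3600',
    'created_at': '2024-01-01T00:00:00Z',
}}}

# Each template value is a sub-path plus an optional coercion; None results are dropped
info = traverse_obj(api_response, ('data', 'attributes', {
    'title': ('title', {str}),
    'duration': ('duration', {int_or_none}),       # coerces '3600' -> 3600
    'timestamp': ('created_at', {parse_iso8601}),  # ISO 8601 -> epoch seconds
}))
print(info)  # {'title': 'Morning Show', 'duration': 3600, 'timestamp': 1704067200}
```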
diff --git a/yt_dlp/extractor/ninenow.py b/yt_dlp/extractor/ninenow.py
index 7b0cb77a74..2f3a4ed284 100644
--- a/yt_dlp/extractor/ninenow.py
+++ b/yt_dlp/extractor/ninenow.py
@@ -1,6 +1,3 @@
-import json
-import re
-
 from .brightcove import BrightcoveNewIE
 from .common import InfoExtractor
 from ..utils import (
@@ -11,7 +8,12 @@
     str_or_none,
     url_or_none,
 )
-from ..utils.traversal import require, traverse_obj, value
+from ..utils.traversal import (
+    get_first,
+    require,
+    traverse_obj,
+    value,
+)
 
 
 class NineNowIE(InfoExtractor):
@@ -101,20 +103,11 @@ class NineNowIE(InfoExtractor):
     }]
     BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4460760524001/default_default/index.html?videoId={}'
 
-    # XXX: For parsing next.js v15+ data; see also yt_dlp.extractor.francetv and yt_dlp.extractor.goplay
-    def _find_json(self, s):
-        return self._search_json(
-            r'\w+\s*:\s*', s, 'next js data', None, contains_pattern=r'\[(?s:.+)\]', default=None)
-
     def _real_extract(self, url):
         display_id, video_type = self._match_valid_url(url).group('id', 'type')
         webpage = self._download_webpage(url, display_id)
-        common_data = traverse_obj(
-            re.findall(r'<script[^>]*>\s*self\.__next_f\.push\(\s*(\[.+?\])\s*\);?\s*</script>', webpage),
-            (..., {json.loads}, ..., {self._find_json},
-             lambda _, v: v['payload'][video_type]['slug'] == display_id,
-             'payload', any, {require('video data')}))
+        common_data = get_first(self._search_nextjs_v13_data(webpage, display_id), ('payload', {dict}))
 
         if traverse_obj(common_data, (video_type, 'video', 'drm', {bool})):
             self.report_drm(display_id)
diff --git a/yt_dlp/extractor/noovo.py b/yt_dlp/extractor/noovo.py
deleted file mode 100644
index 772d4ed9e0..0000000000
--- a/yt_dlp/extractor/noovo.py
+++ /dev/null
@@ -1,100 +0,0 @@
-from .brightcove import BrightcoveNewIE
-from .common import InfoExtractor
-from ..utils import (
-    int_or_none,
-    js_to_json,
-    smuggle_url,
-    try_get,
-)
-
-
-class NoovoIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:[^/]+\.)?noovo\.ca/videos/(?P<id>[^/]+/[^/?#&]+)'
-    _TESTS = [{
-        # clip
-        'url': 'http://noovo.ca/videos/rpm-plus/chrysler-imperial',
-        'info_dict': {
-            'id': '5386045029001',
-            'ext': 'mp4',
-            'title': 'Chrysler Imperial',
-            'description': 'md5:de3c898d1eb810f3e6243e08c8b4a056',
-            'timestamp': 1491399228,
-            'upload_date': '20170405',
-            'uploader_id': '618566855001',
-            'series': 'RPM+',
-        },
-        'params': {
-            'skip_download': True,
-        },
-    }, {
-        # episode
-        'url': 'http://noovo.ca/videos/l-amour-est-dans-le-pre/episode-13-8',
-        'info_dict': {
-            'id': '5395865725001',
-            'title': 'Épisode 13 : Les retrouvailles',
-            'description': 'md5:888c3330f0c1b4476c5bc99a1c040473',
-            'ext': 'mp4',
-            'timestamp': 1492019320,
-            'upload_date': '20170412',
-            'uploader_id': '618566855001',
-            'series': "L'amour est dans le pré",
-            'season_number': 5,
-            'episode': 'Épisode 13',
-            'episode_number': 13,
-        },
-        'params': {
-            'skip_download': True,
-        },
-    }]
-    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/618566855001/default_default/index.html?videoId=%s'
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, video_id)
-
-        brightcove_id = self._search_regex(
-            r'data-video-id=["\'](\d+)', webpage, 'brightcove id')
-
-        data = self._parse_json(
-            self._search_regex(
-                r'(?s)dataLayer\.push\(\s*({.+?})\s*\);', webpage, 'data',
-                default='{}'),
-            video_id, transform_source=js_to_json, fatal=False)
-
-        title = try_get(
-            data, lambda x: x['video']['nom'],
-            str) or self._html_search_meta(
-            'dcterms.Title', webpage, 'title', fatal=True)
-
-        description = self._html_search_meta(
-            ('dcterms.Description', 'description'), webpage, 'description')
-
-        series = try_get(
-            data, lambda x: x['emission']['nom']) or self._search_regex(
-            r'<div[^>]+class="banner-card__subtitle h4"[^>]*>([^<]+)',
-            webpage, 'series', default=None)
-
-        season_el = try_get(data, lambda x: x['emission']['saison'], dict) or {}
-        season = try_get(season_el, lambda x: x['nom'], str)
-        season_number = int_or_none(try_get(season_el, lambda x: x['numero']))
-
-        episode_el = try_get(season_el, lambda x: x['episode'], dict) or {}
-        episode = try_get(episode_el, lambda x: x['nom'], str)
-        episode_number = int_or_none(try_get(episode_el, lambda x: x['numero']))
-
-        return {
-            '_type': 'url_transparent',
-            'ie_key': BrightcoveNewIE.ie_key(),
-            'url': smuggle_url(
-                self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
-                {'geo_countries': ['CA']}),
-            'id': brightcove_id,
-            'title': title,
-            'description': description,
-            'series': series,
-            'season': season,
-            'season_number': season_number,
-            'episode': episode,
-            'episode_number': episode_number,
-        }
episode_number, - } diff --git a/yt_dlp/extractor/rai.py b/yt_dlp/extractor/rai.py index c489dc7312..027f7a7b6f 100644 --- a/yt_dlp/extractor/rai.py +++ b/yt_dlp/extractor/rai.py @@ -765,7 +765,7 @@ class RaiCulturaIE(RaiNewsIE): # XXX: Do not subclass from concrete IE class RaiSudtirolIE(RaiBaseIE): - _VALID_URL = r'https?://raisudtirol\.rai\.it/.+media=(?P<id>\w+)' + _VALID_URL = r'https?://rai(?:bz|sudtirol)\.rai\.it/.+media=(?P<id>\w+)' _TESTS = [{ # mp4 file 'url': 'https://raisudtirol.rai.it/la/index.php?media=Ptv1619729460', @@ -791,6 +791,9 @@ class RaiSudtirolIE(RaiBaseIE): 'formats': 'count:6', }, 'params': {'skip_download': True}, + }, { + 'url': 'https://raibz.rai.it/de/index.php?media=Ptv1751660400', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/tfo.py b/yt_dlp/extractor/tfo.py index 0d1b252175..1884ab2e8e 100644 --- a/yt_dlp/extractor/tfo.py +++ b/yt_dlp/extractor/tfo.py @@ -6,6 +6,7 @@ class TFOIE(InfoExtractor): + _WORKING = False _GEO_COUNTRIES = ['CA'] _VALID_URL = r'https?://(?:www\.)?tfo\.org/(?:en|fr)/(?:[^/]+/){2}(?P<id>\d+)' _TEST = { diff --git a/yt_dlp/extractor/tv5unis.py b/yt_dlp/extractor/tv5unis.py index 88fd334822..fe7fd0325b 100644 --- a/yt_dlp/extractor/tv5unis.py +++ b/yt_dlp/extractor/tv5unis.py @@ -51,6 +51,7 @@ def _real_extract(self, url): class TV5UnisVideoIE(TV5UnisBaseIE): + _WORKING = False IE_NAME = 'tv5unis:video' _VALID_URL = r'https?://(?:www\.)?tv5unis\.ca/videos/[^/]+/(?P<id>\d+)' _TEST = { @@ -71,6 +72,7 @@ def _gql_args(groups): class TV5UnisIE(TV5UnisBaseIE): + _WORKING = False IE_NAME = 'tv5unis' _VALID_URL = r'https?://(?:www\.)?tv5unis\.ca/videos/(?P<id>[^/]+)(?:/saisons/(?P<season_number>\d+)/episodes/(?P<episode_number>\d+))?/?(?:[?#&]|$)' _TESTS = [{ diff --git a/yt_dlp/extractor/unitednations.py b/yt_dlp/extractor/unitednations.py new file mode 100644 index 0000000000..f9283fd6c1 --- /dev/null +++ b/yt_dlp/extractor/unitednations.py @@ -0,0 +1,32 @@ +from .common import InfoExtractor +from .kaltura import KalturaIE + + +class UnitedNationsWebTvIE(InfoExtractor): + _VALID_URL = r'https?://webtv\.un\.org/(?:ar|zh|en|fr|ru|es)/asset/\w+/(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://webtv.un.org/en/asset/k1o/k1o7stmi6p', + 'md5': 'b2f8b3030063298ae841b4b7ddc01477', + 'info_dict': { + 'id': '1_o7stmi6p', + 'ext': 'mp4', + 'title': 'António Guterres (Secretary-General) on Israel and Iran - Security Council, 9939th meeting', + 'thumbnail': 'http://cfvod.kaltura.com/p/2503451/sp/250345100/thumbnail/entry_id/1_o7stmi6p/version/100021', + 'uploader_id': 'evgeniia.alisova@un.org', + 'upload_date': '20250620', + 'timestamp': 1750430976, + 'duration': 234, + 'view_count': int, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + partner_id = self._html_search_regex( + r'partnerId:\s*(\d+)', webpage, 'partner_id') + entry_id = self._html_search_regex( + r'const\s+kentryID\s*=\s*["\'](\w+)["\']', webpage, 'kentry_id') + + return self.url_result(f'kaltura:{partner_id}:{entry_id}', KalturaIE) diff --git a/yt_dlp/extractor/unsupported.py b/yt_dlp/extractor/unsupported.py index 31393b02a4..05ae4dd18a 100644 --- a/yt_dlp/extractor/unsupported.py +++ b/yt_dlp/extractor/unsupported.py @@ -53,6 +53,10 @@ class KnownDRMIE(UnsupportedInfoExtractor): r'(?:beta\.)?crunchyroll\.com', r'viki\.com', r'deezer\.com', + r'b-ch\.com', + r'ctv\.ca', + r'noovo\.ca', + r'tsn\.ca', ) _TESTS = [{ @@ -168,6 +172,18 @@ class KnownDRMIE(UnsupportedInfoExtractor): }, { 'url': 
'http://www.deezer.com/playlist/176747451', 'only_matching': True, + }, { + 'url': 'https://www.b-ch.com/titles/8203/001', + 'only_matching': True, + }, { + 'url': 'https://www.ctv.ca/shows/masterchef-53506/the-audition-battles-s15e1', + 'only_matching': True, + }, { + 'url': 'https://www.noovo.ca/emissions/lamour-est-dans-le-pre/prets-pour-lamour-s10e1', + 'only_matching': True, + }, { + 'url': 'https://www.tsn.ca/video/relaxed-oilers-look-to-put-emotional-game-2-loss-in-the-rearview%7E3148747', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py index b268fad56d..7ffe89f227 100644 --- a/yt_dlp/extractor/vimeo.py +++ b/yt_dlp/extractor/vimeo.py @@ -21,6 +21,7 @@ js_to_json, jwt_decode_hs256, merge_dicts, + mimetype2ext, parse_filesize, parse_iso8601, parse_qs, @@ -28,9 +29,11 @@ smuggle_url, str_or_none, traverse_obj, + try_call, try_get, unified_timestamp, unsmuggle_url, + url_basename, url_or_none, urlencode_postdata, urlhandle_detect_ext, @@ -45,14 +48,57 @@ class VimeoBaseInfoExtractor(InfoExtractor): _REFERER_HINT = ( 'Cannot download embed-only video without embedding URL. Please call yt-dlp ' 'with the URL of the page that embeds this video.') - _IOS_CLIENT_AUTH = 'MTMxNzViY2Y0NDE0YTQ5YzhjZTc0YmU0NjVjNDQxYzNkYWVjOWRlOTpHKzRvMmgzVUh4UkxjdU5FRW80cDNDbDhDWGR5dVJLNUJZZ055dHBHTTB4V1VzaG41bEx1a2hiN0NWYWNUcldSSW53dzRUdFRYZlJEZmFoTTArOTBUZkJHS3R4V2llYU04Qnl1bERSWWxUdXRidjNqR2J4SHFpVmtFSUcyRktuQw==' - _IOS_CLIENT_HEADERS = { + + _DEFAULT_CLIENT = 'android' + _DEFAULT_AUTHED_CLIENT = 'web' + _CLIENT_HEADERS = { 'Accept': 'application/vnd.vimeo.*+json; version=3.4.10', 'Accept-Language': 'en', - 'User-Agent': 'Vimeo/11.10.0 (com.vimeo; build:250424.164813.0; iOS 18.4.1) Alamofire/5.9.0 VimeoNetworking/5.0.0', } - _IOS_OAUTH_CACHE_KEY = 'oauth-token-ios' - _ios_oauth_token = None + _CLIENT_CONFIGS = { + 'android': { + 'CACHE_KEY': 'oauth-token-android', + 'CACHE_ONLY': False, + 'VIEWER_JWT': False, + 'REQUIRES_AUTH': False, + 'AUTH': 'NzRmYTg5YjgxMWExY2JiNzUwZDg1MjhkMTYzZjQ4YWYyOGEyZGJlMTp4OGx2NFd3QnNvY1lkamI2UVZsdjdDYlNwSDUrdm50YzdNNThvWDcwN1JrenJGZC9tR1lReUNlRjRSVklZeWhYZVpRS0tBcU9YYzRoTGY2Z1dlVkJFYkdJc0dMRHpoZWFZbU0reDRqZ1dkZ1diZmdIdGUrNUM5RVBySlM0VG1qcw==', + 'USER_AGENT': 'com.vimeo.android.videoapp (OnePlus, ONEPLUS A6003, OnePlus, Android 14/34 Version 11.8.1) Kotlin VimeoNetworking/3.12.0', + 'VIDEOS_FIELDS': ( + 'uri', 'name', 'description', 'type', 'link', 'player_embed_url', 'duration', 'width', + 'language', 'height', 'embed', 'created_time', 'modified_time', 'release_time', 'content_rating', + 'content_rating_class', 'rating_mod_locked', 'license', 'privacy', 'pictures', 'tags', 'stats', + 'categories', 'uploader', 'metadata', 'user', 'files', 'download', 'app', 'play', 'status', + 'resource_key', 'badge', 'upload', 'transcode', 'is_playable', 'has_audio', + ), + }, + 'ios': { + 'CACHE_KEY': 'oauth-token-ios', + 'CACHE_ONLY': True, + 'VIEWER_JWT': False, + 'REQUIRES_AUTH': False, + 'AUTH': 'MTMxNzViY2Y0NDE0YTQ5YzhjZTc0YmU0NjVjNDQxYzNkYWVjOWRlOTpHKzRvMmgzVUh4UkxjdU5FRW80cDNDbDhDWGR5dVJLNUJZZ055dHBHTTB4V1VzaG41bEx1a2hiN0NWYWNUcldSSW53dzRUdFRYZlJEZmFoTTArOTBUZkJHS3R4V2llYU04Qnl1bERSWWxUdXRidjNqR2J4SHFpVmtFSUcyRktuQw==', + 'USER_AGENT': 'Vimeo/11.10.0 (com.vimeo; build:250424.164813.0; iOS 18.4.1) Alamofire/5.9.0 VimeoNetworking/5.0.0', + 'VIDEOS_FIELDS': ( + 'uri', 'name', 'description', 'type', 'link', 'player_embed_url', 'duration', + 'width', 'language', 'height', 'embed', 'created_time', 'modified_time', 
'release_time', + 'content_rating', 'content_rating_class', 'rating_mod_locked', 'license', 'config_url', + 'embed_player_config_url', 'privacy', 'pictures', 'tags', 'stats', 'categories', 'uploader', + 'metadata', 'user', 'files', 'download', 'app', 'play', 'status', 'resource_key', 'badge', + 'upload', 'transcode', 'is_playable', 'has_audio', + ), + }, + 'web': { + 'VIEWER_JWT': True, + 'REQUIRES_AUTH': True, + 'USER_AGENT': None, + 'VIDEOS_FIELDS': ( + 'config_url', 'created_time', 'description', 'license', + 'metadata.connections.comments.total', 'metadata.connections.likes.total', + 'release_time', 'stats.plays', + ), + }, + } + _oauth_tokens = {} _viewer_info = None @staticmethod @@ -80,7 +126,14 @@ def _fetch_viewer_info(self, display_id=None, fatal=True): return self._viewer_info + @property + def _is_logged_in(self): + return 'vimeo' in self._get_cookies('https://vimeo.com') + def _perform_login(self, username, password): + if self._is_logged_in: + return + viewer = self._fetch_viewer_info() data = { 'action': 'login', @@ -105,8 +158,8 @@ def _perform_login(self, username, password): raise ExtractorError('Unable to log in') def _real_initialize(self): - if self._LOGIN_REQUIRED and not self._get_cookies('https://vimeo.com').get('vuid'): - self._raise_login_required() + if self._LOGIN_REQUIRED and not self._is_logged_in: + self.raise_login_required() def _get_video_password(self): password = self.get_param('videopassword') @@ -277,52 +330,95 @@ def _parse_config(self, config, video_id): '_format_sort_fields': ('quality', 'res', 'fps', 'hdr:12', 'source'), } - def _fetch_oauth_token(self): - if not self._ios_oauth_token: - self._ios_oauth_token = self.cache.load(self._NETRC_MACHINE, self._IOS_OAUTH_CACHE_KEY) + def _fetch_oauth_token(self, client): + client_config = self._CLIENT_CONFIGS[client] - if not self._ios_oauth_token: - self._ios_oauth_token = self._download_json( + if client_config['VIEWER_JWT']: + return f'jwt {self._fetch_viewer_info()["jwt"]}' + + cache_key = client_config['CACHE_KEY'] + + if not self._oauth_tokens.get(cache_key): + self._oauth_tokens[cache_key] = self.cache.load(self._NETRC_MACHINE, cache_key) + + if not self._oauth_tokens.get(cache_key): + if client_config['CACHE_ONLY']: + raise ExtractorError( + f'The {client} client is unable to fetch new OAuth tokens ' + f'and is only intended for use with previously cached tokens', expected=True) + + self._oauth_tokens[cache_key] = self._download_json( 'https://api.vimeo.com/oauth/authorize/client', None, - 'Fetching OAuth token', 'Failed to fetch OAuth token', + f'Fetching {client} OAuth token', f'Failed to fetch {client} OAuth token', headers={ - 'Authorization': f'Basic {self._IOS_CLIENT_AUTH}', - **self._IOS_CLIENT_HEADERS, + 'Authorization': f'Basic {client_config["AUTH"]}', + 'User-Agent': client_config['USER_AGENT'], + **self._CLIENT_HEADERS, }, data=urlencode_postdata({ 'grant_type': 'client_credentials', - 'scope': 'private public create edit delete interact upload purchased stats', + 'scope': 'private public create edit delete interact upload purchased stats video_files', }, quote_via=urllib.parse.quote))['access_token'] - self.cache.store(self._NETRC_MACHINE, self._IOS_OAUTH_CACHE_KEY, self._ios_oauth_token) + self.cache.store(self._NETRC_MACHINE, cache_key, self._oauth_tokens[cache_key]) - return self._ios_oauth_token + return f'Bearer {self._oauth_tokens[cache_key]}' + + def _get_requested_client(self): + default_client = self._DEFAULT_AUTHED_CLIENT if self._is_logged_in else self._DEFAULT_CLIENT + + 
client = self._configuration_arg('client', [default_client], ie_key=VimeoIE)[0] + if client not in self._CLIENT_CONFIGS: + raise ExtractorError( + f'Unsupported API client "{client}" requested. ' + f'Supported clients are: {", ".join(self._CLIENT_CONFIGS)}', expected=True) + + return client + + def _call_videos_api(self, video_id, unlisted_hash=None, path=None, *, force_client=None, query=None, **kwargs): + client = force_client or self._get_requested_client() + + client_config = self._CLIENT_CONFIGS[client] + if client_config['REQUIRES_AUTH'] and not self._is_logged_in: + self.raise_login_required(f'The {client} client requires authentication') - def _call_videos_api(self, video_id, unlisted_hash=None, **kwargs): return self._download_json( - join_nonempty(f'https://api.vimeo.com/videos/{video_id}', unlisted_hash, delim=':'), - video_id, 'Downloading API JSON', headers={ - 'Authorization': f'Bearer {self._fetch_oauth_token()}', - **self._IOS_CLIENT_HEADERS, - }, query={ - 'fields': ','.join(( - 'config_url', 'embed_player_config_url', 'player_embed_url', 'download', 'play', - 'files', 'description', 'license', 'release_time', 'created_time', 'stats.plays', - 'metadata.connections.comments.total', 'metadata.connections.likes.total')), + join_nonempty( + 'https://api.vimeo.com/videos', + join_nonempty(video_id, unlisted_hash, delim=':'), + path, delim='/'), + video_id, f'Downloading {client} API JSON', f'Unable to download {client} API JSON', + headers=filter_dict({ + 'Authorization': self._fetch_oauth_token(client), + 'User-Agent': client_config['USER_AGENT'], + **self._CLIENT_HEADERS, + }), query={ + 'fields': ','.join(client_config['VIDEOS_FIELDS']), + **(query or {}), }, **kwargs) - def _extract_original_format(self, url, video_id, unlisted_hash=None, api_data=None): + def _extract_original_format(self, url, video_id, unlisted_hash=None): # Original/source formats are only available when logged in - if not self._get_cookies('https://vimeo.com/').get('vimeo'): - return + if not self._is_logged_in: + return None - query = {'action': 'load_download_config'} - if unlisted_hash: - query['unlisted_hash'] = unlisted_hash - download_data = self._download_json( - url, video_id, 'Loading download config JSON', fatal=False, - query=query, headers={'X-Requested-With': 'XMLHttpRequest'}, - expected_status=(403, 404)) or {} - source_file = download_data.get('source_file') - download_url = try_get(source_file, lambda x: x['download_url']) + policy = self._configuration_arg('original_format_policy', ['auto'], ie_key=VimeoIE)[0] + if policy == 'never': + return None + + try: + download_data = self._download_json( + url, video_id, 'Loading download config JSON', query=filter_dict({ + 'action': 'load_download_config', + 'unlisted_hash': unlisted_hash, + }), headers={ + 'Accept': 'application/json', + 'X-Requested-With': 'XMLHttpRequest', + }) + except ExtractorError as error: + self.write_debug(f'Unable to load download config JSON: {error.cause}') + download_data = None + + source_file = traverse_obj(download_data, ('source_file', {dict})) or {} + download_url = traverse_obj(source_file, ('download_url', {url_or_none})) if download_url and not source_file.get('is_cold') and not source_file.get('is_defrosting'): source_name = source_file.get('public_name', 'Original') if self._is_valid_url(download_url, video_id, f'{source_name} video'): @@ -340,8 +436,27 @@ def _extract_original_format(self, url, video_id, unlisted_hash=None, api_data=N 'quality': 1, } - original_response = api_data or 
self._call_videos_api( - video_id, unlisted_hash, fatal=False, expected_status=(403, 404)) + # Most web client API requests are subject to rate-limiting (429) when logged-in. + # Requesting only the 'privacy' field is NOT rate-limited, + # so first we should check if the video even has 'download' formats available + try: + privacy_info = self._call_videos_api( + video_id, unlisted_hash, force_client='web', query={'fields': 'privacy'}) + except ExtractorError as error: + self.write_debug(f'Unable to download privacy info: {error.cause}') + return None + + if not traverse_obj(privacy_info, ('privacy', 'download', {bool})): + msg = f'{video_id}: Vimeo says this video is not downloadable' + if policy != 'always': + self.write_debug( + f'{msg}, so yt-dlp is not attempting to extract the original/source format. ' + f'To try anyway, use --extractor-args "vimeo:original_format_policy=always"') + return None + self.write_debug(f'{msg}; attempting to extract original/source format anyway') + + original_response = self._call_videos_api( + video_id, unlisted_hash, force_client='web', query={'fields': 'download'}, fatal=False) for download_data in traverse_obj(original_response, ('download', ..., {dict})): download_url = download_data.get('link') if not download_url or download_data.get('quality') != 'source': continue @@ -919,25 +1034,125 @@ def _verify_player_video_password(self, url, video_id, headers): raise ExtractorError('Wrong video password', expected=True) return checked + def _get_subtitles(self, video_id, unlisted_hash): + subs = {} + text_tracks = self._call_videos_api( + video_id, unlisted_hash, path='texttracks', query={ + 'include_transcript': 'true', + 'fields': ','.join(( + 'active', 'display_language', 'id', 'language', 'link', 'name', 'type', 'uri', + )), + }, fatal=False) + for tt in traverse_obj(text_tracks, ('data', lambda _, v: url_or_none(v['link']))): + subs.setdefault(tt.get('language'), []).append({ + 'url': tt['link'], + 'ext': 'vtt', + 'name': tt.get('display_language'), + }) + return subs + + def _parse_api_response(self, video, video_id, unlisted_hash=None): + formats, subtitles = [], {} + seen_urls = set() + duration = traverse_obj(video, ('duration', {int_or_none})) + + for file in traverse_obj(video, ( + (('play', (None, 'progressive')), 'files', 'download'), lambda _, v: url_or_none(v['link']), + )): + format_url = file['link'] + if format_url in seen_urls: + continue + seen_urls.add(format_url) + quality = file.get('quality') + ext = determine_ext(format_url) + if quality == 'hls' or ext == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + format_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + elif quality == 'dash' or ext == 'mpd': + fmts, subs = self._extract_mpd_formats_and_subtitles( + format_url, video_id, mpd_id='dash', fatal=False) + for fmt in fmts: + fmt['format_id'] = join_nonempty( + *fmt['format_id'].split('-', 2)[:2], int_or_none(fmt.get('tbr'))) + else: + fmt = traverse_obj(file, { + 'ext': ('type', {mimetype2ext(default='mp4')}), + 'vcodec': ('codec', {str.lower}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + 'filesize': ('size', {int_or_none}), + 'fps': ('fps', {int_or_none}), + }) + fmt.update({ + 'url': format_url, + 'format_id': join_nonempty( + 'http', traverse_obj(file, 'public_name', 'rendition'), quality), + 'tbr': try_call(lambda: fmt['filesize'] * 8 / duration / 1024), + }) + formats.append(fmt) + continue + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + if 
traverse_obj(video, ('metadata', 'connections', 'texttracks', 'total', {int})): + self._merge_subtitles(self.extract_subtitles(video_id, unlisted_hash), target=subtitles) + + return { + **traverse_obj(video, { + 'title': ('name', {str}), + 'uploader': ('user', 'name', {str}), + 'uploader_id': ('user', 'link', {url_basename}), + 'uploader_url': ('user', 'link', {url_or_none}), + 'release_timestamp': ('live', 'scheduled_start_time', {int_or_none}), + 'thumbnails': ('pictures', 'sizes', lambda _, v: url_or_none(v['link']), { + 'url': 'link', + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + }), + 'id': video_id, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + 'live_status': { + 'streaming': 'is_live', + 'done': 'was_live', + }.get(traverse_obj(video, ('live', 'status', {str}))), + } + def _extract_from_api(self, video_id, unlisted_hash=None): for retry in (False, True): try: video = self._call_videos_api(video_id, unlisted_hash) break except ExtractorError as e: - if (not retry and isinstance(e.cause, HTTPError) and e.cause.status == 400 - and 'password' in traverse_obj( - self._webpage_read_content(e.cause.response, e.cause.response.url, video_id, fatal=False), - ({json.loads}, 'invalid_parameters', ..., 'field'), - )): + if not isinstance(e.cause, HTTPError): + raise + response = traverse_obj( + self._webpage_read_content(e.cause.response, e.cause.response.url, video_id, fatal=False), + ({json.loads}, {dict})) or {} + if ( + not retry and e.cause.status == 400 + and 'password' in traverse_obj(response, ('invalid_parameters', ..., 'field')) + ): self._verify_video_password(video_id) - continue - raise + elif e.cause.status == 404 and response.get('error_code') == 5460: + self.raise_login_required(join_nonempty( + traverse_obj(response, ('error', {str.strip})), + 'Authentication may be needed due to your location.', + 'If your IP address is located in Europe you could try using a VPN/proxy,', + f'or else u{self._login_hint()[1:]}', + delim=' '), method=None) + else: + raise + + if config_url := traverse_obj(video, ('config_url', {url_or_none})): + info = self._parse_config(self._download_json(config_url, video_id), video_id) + else: + info = self._parse_api_response(video, video_id, unlisted_hash) - info = self._parse_config(self._download_json( - video['config_url'], video_id), video_id) source_format = self._extract_original_format( - f'https://vimeo.com/{video_id}', video_id, unlisted_hash, api_data=video) + f'https://vimeo.com/{video_id}', video_id, unlisted_hash) if source_format: info['formats'].append(source_format) diff --git a/yt_dlp/extractor/youtube/_base.py b/yt_dlp/extractor/youtube/_base.py index 5aee89b917..0a9b510c7d 100644 --- a/yt_dlp/extractor/youtube/_base.py +++ b/yt_dlp/extractor/youtube/_base.py @@ -1,5 +1,6 @@ import calendar import copy +import dataclasses import datetime as dt import enum import functools @@ -38,6 +39,60 @@ class _PoTokenContext(enum.Enum): SUBS = 'subs' +class StreamingProtocol(enum.Enum): + HTTPS = 'https' + DASH = 'dash' + HLS = 'hls' + + +@dataclasses.dataclass +class BasePoTokenPolicy: + required: bool = False + # Try to fetch a PO Token even if it is not required. 
+ recommended: bool = False + not_required_for_premium: bool = False + + +@dataclasses.dataclass +class GvsPoTokenPolicy(BasePoTokenPolicy): + not_required_with_player_token: bool = False + + +@dataclasses.dataclass +class PlayerPoTokenPolicy(BasePoTokenPolicy): + pass + + +@dataclasses.dataclass +class SubsPoTokenPolicy(BasePoTokenPolicy): + pass + + +WEB_PO_TOKEN_POLICIES = { + 'GVS_PO_TOKEN_POLICY': { + StreamingProtocol.HTTPS: GvsPoTokenPolicy( + required=True, + recommended=True, + not_required_for_premium=True, + not_required_with_player_token=False, + ), + StreamingProtocol.DASH: GvsPoTokenPolicy( + required=True, + recommended=True, + not_required_for_premium=True, + not_required_with_player_token=False, + ), + StreamingProtocol.HLS: GvsPoTokenPolicy( + required=False, + recommended=True, + ), + }, + 'PLAYER_PO_TOKEN_POLICY': PlayerPoTokenPolicy(required=False), + # In rollout, currently detected via experiment + # Premium users DO require a PO Token for subtitles + 'SUBS_PO_TOKEN_POLICY': SubsPoTokenPolicy(required=False), +} + # any clients starting with _ cannot be explicitly requested by the user INNERTUBE_CLIENTS = { 'web': { @@ -48,8 +103,9 @@ class _PoTokenContext(enum.Enum): }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 1, - 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], 'SUPPORTS_COOKIES': True, + **WEB_PO_TOKEN_POLICIES, + 'PLAYER_PARAMS': '8AEB', }, # Safari UA returns pre-merged video+audio 144p/240p/360p/720p/1080p HLS formats 'web_safari': { @@ -61,8 +117,8 @@ class _PoTokenContext(enum.Enum): }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 1, - 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], 'SUPPORTS_COOKIES': True, + **WEB_PO_TOKEN_POLICIES, 'PLAYER_PARAMS': '8AEB', }, 'web_embedded': { @@ -84,7 +140,24 @@ class _PoTokenContext(enum.Enum): }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 67, - 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], + 'GVS_PO_TOKEN_POLICY': { + StreamingProtocol.HTTPS: GvsPoTokenPolicy( + required=True, + recommended=True, + not_required_for_premium=True, + not_required_with_player_token=False, + ), + StreamingProtocol.DASH: GvsPoTokenPolicy( + required=True, + recommended=True, + not_required_for_premium=True, + not_required_with_player_token=False, + ), + StreamingProtocol.HLS: GvsPoTokenPolicy( + required=False, + recommended=True, + ), + }, 'SUPPORTS_COOKIES': True, }, # This client now requires sign-in for every video @@ -96,7 +169,24 @@ class _PoTokenContext(enum.Enum): }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 62, - 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], + 'GVS_PO_TOKEN_POLICY': { + StreamingProtocol.HTTPS: GvsPoTokenPolicy( + required=True, + recommended=True, + not_required_for_premium=True, + not_required_with_player_token=False, + ), + StreamingProtocol.DASH: GvsPoTokenPolicy( + required=True, + recommended=True, + not_required_for_premium=True, + not_required_with_player_token=False, + ), + StreamingProtocol.HLS: GvsPoTokenPolicy( + required=False, + recommended=True, + ), + }, 'REQUIRE_AUTH': True, 'SUPPORTS_COOKIES': True, }, @@ -113,7 +203,24 @@ class _PoTokenContext(enum.Enum): }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 3, 'REQUIRE_JS_PLAYER': False, - 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], + 'GVS_PO_TOKEN_POLICY': { + StreamingProtocol.HTTPS: GvsPoTokenPolicy( + required=True, + recommended=True, + not_required_with_player_token=True, + ), + StreamingProtocol.DASH: GvsPoTokenPolicy( + required=True, + recommended=True, + not_required_with_player_token=True, + ), + StreamingProtocol.HLS: GvsPoTokenPolicy( + 
required=False, + recommended=True, + not_required_with_player_token=True, + ), + }, + 'PLAYER_PO_TOKEN_POLICY': PlayerPoTokenPolicy(required=False, recommended=True), }, # YouTube Kids videos aren't returned on this client for some reason 'android_vr': { @@ -147,7 +254,21 @@ class _PoTokenContext(enum.Enum): }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 5, - 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], + 'GVS_PO_TOKEN_POLICY': { + StreamingProtocol.HTTPS: GvsPoTokenPolicy( + required=True, + recommended=True, + not_required_with_player_token=True, + ), + # HLS Livestreams require POT 30 seconds in + # TODO: Rolling out + StreamingProtocol.HLS: GvsPoTokenPolicy( + required=False, + recommended=True, + not_required_with_player_token=True, + ), + }, + 'PLAYER_PO_TOKEN_POLICY': PlayerPoTokenPolicy(required=False, recommended=True), 'REQUIRE_JS_PLAYER': False, }, # mweb has 'ultralow' formats @@ -162,7 +283,24 @@ class _PoTokenContext(enum.Enum): }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 2, - 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], + 'GVS_PO_TOKEN_POLICY': { + StreamingProtocol.HTTPS: GvsPoTokenPolicy( + required=True, + recommended=True, + not_required_for_premium=True, + not_required_with_player_token=False, + ), + StreamingProtocol.DASH: GvsPoTokenPolicy( + required=True, + recommended=True, + not_required_for_premium=True, + not_required_with_player_token=False, + ), + StreamingProtocol.HLS: GvsPoTokenPolicy( + required=False, + recommended=True, + ), + }, 'SUPPORTS_COOKIES': True, }, 'tv': { @@ -226,7 +364,11 @@ def build_innertube_clients(): for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()): ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com') ytcfg.setdefault('REQUIRE_JS_PLAYER', True) - ytcfg.setdefault('PO_TOKEN_REQUIRED_CONTEXTS', []) + ytcfg.setdefault('GVS_PO_TOKEN_POLICY', {}) + for protocol in StreamingProtocol: + ytcfg['GVS_PO_TOKEN_POLICY'].setdefault(protocol, GvsPoTokenPolicy()) + ytcfg.setdefault('PLAYER_PO_TOKEN_POLICY', PlayerPoTokenPolicy()) + ytcfg.setdefault('SUBS_PO_TOKEN_POLICY', SubsPoTokenPolicy()) ytcfg.setdefault('REQUIRE_AUTH', False) ytcfg.setdefault('SUPPORTS_COOKIES', False) ytcfg.setdefault('PLAYER_PARAMS', None) diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index 208abee937..fc1f087ace 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -18,6 +18,9 @@ from ._base import ( INNERTUBE_CLIENTS, BadgeType, + GvsPoTokenPolicy, + PlayerPoTokenPolicy, + StreamingProtocol, YoutubeBaseInfoExtractor, _PoTokenContext, _split_innertube_client, @@ -71,9 +74,11 @@ from ...utils.networking import clean_headers, clean_proxies, select_proxy STREAMING_DATA_CLIENT_NAME = '__yt_dlp_client' -STREAMING_DATA_INITIAL_PO_TOKEN = '__yt_dlp_po_token' STREAMING_DATA_FETCH_SUBS_PO_TOKEN = '__yt_dlp_fetch_subs_po_token' +STREAMING_DATA_FETCH_GVS_PO_TOKEN = '__yt_dlp_fetch_gvs_po_token' +STREAMING_DATA_PLAYER_TOKEN_PROVIDED = '__yt_dlp_player_token_provided' STREAMING_DATA_INNERTUBE_CONTEXT = '__yt_dlp_innertube_context' +STREAMING_DATA_IS_PREMIUM_SUBSCRIBER = '__yt_dlp_is_premium_subscriber' PO_TOKEN_GUIDE_URL = 'https://github.com/yt-dlp/yt-dlp/wiki/PO-Token-Guide' @@ -253,6 +258,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'srt', 'vtt') _DEFAULT_CLIENTS = ('tv', 'ios', 'web') _DEFAULT_AUTHED_CLIENTS = ('tv', 'web') + # Premium does not require POT (except for subtitles) + _DEFAULT_PREMIUM_CLIENTS = ('tv', 'web') _GEO_BYPASS = 
False @@ -1833,7 +1840,8 @@ def refetch_manifest(format_id, delay): if time.time() <= start_time + delay: return - _, _, prs, player_url = self._download_player_responses(url, smuggled_data, video_id, webpage_url) + _, _, _, _, prs, player_url = self._initial_extract( + url, smuggled_data, webpage_url, 'web', video_id) video_details = traverse_obj(prs, (..., 'videoDetails'), expected_type=dict) microformats = traverse_obj( prs, (..., 'microformat', 'playerMicroformatRenderer'), @@ -2891,7 +2899,7 @@ def _get_config_po_token(self, client: str, context: _PoTokenContext): only_once=True) continue - def fetch_po_token(self, client='web', context=_PoTokenContext.GVS, ytcfg=None, visitor_data=None, + def fetch_po_token(self, client='web', context: _PoTokenContext = _PoTokenContext.GVS, ytcfg=None, visitor_data=None, data_sync_id=None, session_index=None, player_url=None, video_id=None, webpage=None, required=False, **kwargs): """ @@ -2976,7 +2984,6 @@ def _fetch_po_token(self, client, **kwargs): fetch_pot_policy == 'never' or ( fetch_pot_policy == 'auto' - and _PoTokenContext(context) not in self._get_default_ytcfg(client)['PO_TOKEN_REQUIRED_CONTEXTS'] and not kwargs.get('required', False) ) ): @@ -3035,19 +3042,19 @@ def _is_agegated(player_response): def _is_unplayable(player_response): return traverse_obj(player_response, ('playabilityStatus', 'status')) == 'UNPLAYABLE' - def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr, visitor_data, data_sync_id, po_token): + def _extract_player_response(self, client, video_id, webpage_ytcfg, player_ytcfg, player_url, initial_pr, visitor_data, data_sync_id, po_token): headers = self.generate_api_headers( ytcfg=player_ytcfg, default_client=client, visitor_data=visitor_data, - session_index=self._extract_session_index(master_ytcfg, player_ytcfg), + session_index=self._extract_session_index(webpage_ytcfg, player_ytcfg), delegated_session_id=( self._parse_data_sync_id(data_sync_id)[0] - or self._extract_delegated_session_id(master_ytcfg, initial_pr, player_ytcfg) + or self._extract_delegated_session_id(webpage_ytcfg, initial_pr, player_ytcfg) ), user_session_id=( self._parse_data_sync_id(data_sync_id)[1] - or self._extract_user_session_id(master_ytcfg, initial_pr, player_ytcfg) + or self._extract_user_session_id(webpage_ytcfg, initial_pr, player_ytcfg) ), ) @@ -3063,7 +3070,7 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, if po_token: yt_query['serviceIntegrityDimensions'] = {'poToken': po_token} - sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False) if player_url else None + sts = self._extract_signature_timestamp(video_id, player_url, webpage_ytcfg, fatal=False) if player_url else None yt_query.update(self._generate_player_context(sts)) return self._extract_response( item_id=video_id, ep='player', query=yt_query, @@ -3072,10 +3079,14 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, note='Downloading {} player API JSON'.format(client.replace('_', ' ').strip()), ) or None - def _get_requested_clients(self, url, smuggled_data): + def _get_requested_clients(self, url, smuggled_data, is_premium_subscriber): requested_clients = [] excluded_clients = [] - default_clients = self._DEFAULT_AUTHED_CLIENTS if self.is_authenticated else self._DEFAULT_CLIENTS + default_clients = ( + self._DEFAULT_PREMIUM_CLIENTS if is_premium_subscriber + else self._DEFAULT_AUTHED_CLIENTS if self.is_authenticated + else 
self._DEFAULT_CLIENTS + ) allowed_clients = sorted( (client for client in INNERTUBE_CLIENTS if client[:1] != '_'), key=lambda client: INNERTUBE_CLIENTS[client]['priority'], reverse=True) @@ -3117,11 +3128,12 @@ def _invalid_player_response(self, pr, video_id): if (pr_id := traverse_obj(pr, ('videoDetails', 'videoId'))) != video_id: return pr_id - def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, smuggled_data): + def _extract_player_responses(self, clients, video_id, webpage, webpage_client, webpage_ytcfg, is_premium_subscriber): initial_pr = None if webpage: initial_pr = self._search_json( - self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, 'initial player response', video_id, fatal=False) + self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, + f'{webpage_client} client initial player response', video_id, fatal=False) prs = [] deprioritized_prs = [] @@ -3152,11 +3164,11 @@ def append_client(*client_names): while clients: deprioritize_pr = False client, base_client, variant = _split_innertube_client(clients.pop()) - player_ytcfg = master_ytcfg if client == 'web' else {} - if 'configs' not in self._configuration_arg('player_skip') and client != 'web': + player_ytcfg = webpage_ytcfg if client == webpage_client else {} + if 'configs' not in self._configuration_arg('player_skip') and client != webpage_client: player_ytcfg = self._download_ytcfg(client, video_id) or player_ytcfg - player_url = player_url or self._extract_player_url(master_ytcfg, player_ytcfg, webpage=webpage) + player_url = player_url or self._extract_player_url(webpage_ytcfg, player_ytcfg, webpage=webpage) require_js_player = self._get_default_ytcfg(client).get('REQUIRE_JS_PLAYER') if 'js' in self._configuration_arg('player_skip'): require_js_player = False @@ -3166,10 +3178,12 @@ def append_client(*client_names): player_url = self._download_player_url(video_id) tried_iframe_fallback = True - pr = initial_pr if client == 'web' else None + pr = None + if client == webpage_client and 'player_response' not in self._configuration_arg('webpage_skip'): + pr = initial_pr - visitor_data = visitor_data or self._extract_visitor_data(master_ytcfg, initial_pr, player_ytcfg) - data_sync_id = data_sync_id or self._extract_data_sync_id(master_ytcfg, initial_pr, player_ytcfg) + visitor_data = visitor_data or self._extract_visitor_data(webpage_ytcfg, initial_pr, player_ytcfg) + data_sync_id = data_sync_id or self._extract_data_sync_id(webpage_ytcfg, initial_pr, player_ytcfg) fetch_po_token_args = { 'client': client, @@ -3178,53 +3192,26 @@ def append_client(*client_names): 'data_sync_id': data_sync_id if self.is_authenticated else None, 'player_url': player_url if require_js_player else None, 'webpage': webpage, - 'session_index': self._extract_session_index(master_ytcfg, player_ytcfg), + 'session_index': self._extract_session_index(webpage_ytcfg, player_ytcfg), 'ytcfg': player_ytcfg or self._get_default_ytcfg(client), } # Don't need a player PO token for WEB if using player response from webpage + player_pot_policy: PlayerPoTokenPolicy = self._get_default_ytcfg(client)['PLAYER_PO_TOKEN_POLICY'] player_po_token = None if pr else self.fetch_po_token( - context=_PoTokenContext.PLAYER, **fetch_po_token_args) + context=_PoTokenContext.PLAYER, **fetch_po_token_args, + required=player_pot_policy.required or player_pot_policy.recommended) - gvs_po_token = self.fetch_po_token( - context=_PoTokenContext.GVS, **fetch_po_token_args) + fetch_gvs_po_token_func = functools.partial( + self.fetch_po_token, context=_PoTokenContext.GVS, 
**fetch_po_token_args) fetch_subs_po_token_func = functools.partial( - self.fetch_po_token, - context=_PoTokenContext.SUBS, - **fetch_po_token_args, - ) - - required_pot_contexts = self._get_default_ytcfg(client)['PO_TOKEN_REQUIRED_CONTEXTS'] - - if ( - not player_po_token - and _PoTokenContext.PLAYER in required_pot_contexts - ): - # TODO: may need to skip player response request. Unsure yet.. - self.report_warning( - f'No Player PO Token provided for {client} client, ' - f'which may be required for working {client} formats. This client will be deprioritized' - f'You can manually pass a Player PO Token for this client with --extractor-args "youtube:po_token={client}.player+XXX". ' - f'For more information, refer to {PO_TOKEN_GUIDE_URL} .', only_once=True) - deprioritize_pr = True - - if ( - not gvs_po_token - and _PoTokenContext.GVS in required_pot_contexts - and 'missing_pot' in self._configuration_arg('formats') - ): - # note: warning with help message is provided later during format processing - self.report_warning( - f'No GVS PO Token provided for {client} client, ' - f'which may be required for working {client} formats. This client will be deprioritized', - only_once=True) - deprioritize_pr = True + self.fetch_po_token, context=_PoTokenContext.SUBS, **fetch_po_token_args) try: pr = pr or self._extract_player_response( client, video_id, - master_ytcfg=player_ytcfg or master_ytcfg, + webpage_ytcfg=player_ytcfg or webpage_ytcfg, player_ytcfg=player_ytcfg, player_url=player_url, initial_pr=initial_pr, @@ -3242,12 +3229,16 @@ def append_client(*client_names): innertube_context = traverse_obj(player_ytcfg or self._get_default_ytcfg(client), 'INNERTUBE_CONTEXT') sd = pr.setdefault('streamingData', {}) sd[STREAMING_DATA_CLIENT_NAME] = client - sd[STREAMING_DATA_INITIAL_PO_TOKEN] = gvs_po_token + sd[STREAMING_DATA_FETCH_GVS_PO_TOKEN] = fetch_gvs_po_token_func + sd[STREAMING_DATA_PLAYER_TOKEN_PROVIDED] = bool(player_po_token) sd[STREAMING_DATA_INNERTUBE_CONTEXT] = innertube_context sd[STREAMING_DATA_FETCH_SUBS_PO_TOKEN] = fetch_subs_po_token_func + sd[STREAMING_DATA_IS_PREMIUM_SUBSCRIBER] = is_premium_subscriber for f in traverse_obj(sd, (('formats', 'adaptiveFormats'), ..., {dict})): f[STREAMING_DATA_CLIENT_NAME] = client - f[STREAMING_DATA_INITIAL_PO_TOKEN] = gvs_po_token + f[STREAMING_DATA_FETCH_GVS_PO_TOKEN] = fetch_gvs_po_token_func + f[STREAMING_DATA_IS_PREMIUM_SUBSCRIBER] = is_premium_subscriber + f[STREAMING_DATA_PLAYER_TOKEN_PROVIDED] = bool(player_po_token) if deprioritize_pr: deprioritized_prs.append(pr) else: @@ -3357,6 +3348,15 @@ def build_fragments(f): }), } for range_start in range(0, f['filesize'], CHUNK_SIZE)) + def gvs_pot_required(policy, is_premium_subscriber, has_player_token): + return ( + policy.required + and not (policy.not_required_with_player_token and has_player_token) + and not (policy.not_required_for_premium and is_premium_subscriber)) + + # save pots per client to avoid fetching again + gvs_pots = {} + for fmt in streaming_formats: client_name = fmt[STREAMING_DATA_CLIENT_NAME] if fmt.get('targetDurationSec'): @@ -3416,7 +3416,7 @@ def build_fragments(f): encrypted_sig = try_get(sc, lambda x: x['s'][0]) if not all((sc, fmt_url, player_url, encrypted_sig)): msg = f'Some {client_name} client https formats have been skipped as they are missing a url. ' - if client_name == 'web': + if client_name in ('web', 'web_safari'): msg += 'YouTube is forcing SABR streaming for this client. 
' else: msg += ( @@ -3476,18 +3476,25 @@ def build_fragments(f): self.report_warning( 'Some formats are possibly damaged. They will be deprioritized', video_id, only_once=True) - po_token = fmt.get(STREAMING_DATA_INITIAL_PO_TOKEN) + fetch_po_token_func = fmt[STREAMING_DATA_FETCH_GVS_PO_TOKEN] + pot_policy: GvsPoTokenPolicy = self._get_default_ytcfg(client_name)['GVS_PO_TOKEN_POLICY'][StreamingProtocol.HTTPS] + + require_po_token = ( + itag not in ['18'] + and gvs_pot_required( + pot_policy, fmt[STREAMING_DATA_IS_PREMIUM_SUBSCRIBER], + fmt[STREAMING_DATA_PLAYER_TOKEN_PROVIDED])) + + po_token = ( + gvs_pots.get(client_name) + or fetch_po_token_func(required=require_po_token or pot_policy.recommended)) if po_token: fmt_url = update_url_query(fmt_url, {'pot': po_token}) + if client_name not in gvs_pots: + gvs_pots[client_name] = po_token - # Clients that require PO Token return videoplayback URLs that may return 403 - require_po_token = ( - not po_token - and _PoTokenContext.GVS in self._get_default_ytcfg(client_name)['PO_TOKEN_REQUIRED_CONTEXTS'] - and itag not in ['18']) # these formats do not require PO Token - - if require_po_token and 'missing_pot' not in self._configuration_arg('formats'): + if not po_token and require_po_token and 'missing_pot' not in self._configuration_arg('formats'): self._report_pot_format_skipped(video_id, client_name, 'https') continue @@ -3502,7 +3509,7 @@ def build_fragments(f): name, fmt.get('isDrc') and 'DRC', try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()), try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()), - is_damaged and 'DAMAGED', require_po_token and 'MISSING POT', + is_damaged and 'DAMAGED', require_po_token and not po_token and 'MISSING POT', (self.get_param('verbose') or all_formats) and short_client_name(client_name), delim=', '), # Format 22 is likely to be damaged. 
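The PO Token gating in this hunk reduces to the small `gvs_pot_required` policy helper added above. A minimal standalone sketch of how it interacts with the `GvsPoTokenPolicy` dataclass from `yt_dlp/extractor/youtube/_base.py` (illustrative only, not part of the patch; the sample policy values mirror the web HTTPS policy defined in `_base.py`):

# Sketch: GVS PO Token requirement resolution, assuming the dataclass fields
# and helper semantics introduced by this patch
from dataclasses import dataclass

@dataclass
class GvsPoTokenPolicy:
    required: bool = False
    recommended: bool = False  # try to fetch a PO Token even when not required
    not_required_for_premium: bool = False
    not_required_with_player_token: bool = False

def gvs_pot_required(policy, is_premium_subscriber, has_player_token):
    # A GVS PO Token is mandatory only when the policy requires one and
    # neither exemption (player token provided / Premium subscriber) applies
    return (
        policy.required
        and not (policy.not_required_with_player_token and has_player_token)
        and not (policy.not_required_for_premium and is_premium_subscriber))

web_https = GvsPoTokenPolicy(required=True, recommended=True, not_required_for_premium=True)
assert gvs_pot_required(web_https, is_premium_subscriber=False, has_player_token=False)
assert not gvs_pot_required(web_https, is_premium_subscriber=True, has_player_token=False)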
See https://github.com/yt-dlp/yt-dlp/issues/3372 @@ -3565,7 +3572,7 @@ def build_fragments(f): elif skip_bad_formats and live_status == 'is_live' and needs_live_processing != 'is_live': skip_manifests.add('dash') - def process_manifest_format(f, proto, client_name, itag, po_token): + def process_manifest_format(f, proto, client_name, itag, missing_pot): key = (proto, f.get('language')) if not all_formats and key in itags[itag]: return False @@ -3573,20 +3580,11 @@ def process_manifest_format(f, proto, client_name, itag, po_token): if f.get('source_preference') is None: f['source_preference'] = -1 - # Clients that require PO Token return videoplayback URLs that may return 403 - # hls does not currently require PO Token - if ( - not po_token - and _PoTokenContext.GVS in self._get_default_ytcfg(client_name)['PO_TOKEN_REQUIRED_CONTEXTS'] - and proto != 'hls' - ): - if 'missing_pot' not in self._configuration_arg('formats'): - self._report_pot_format_skipped(video_id, client_name, proto) - return False + if missing_pot: f['format_note'] = join_nonempty(f.get('format_note'), 'MISSING POT', delim=' ') f['source_preference'] -= 20 - # XXX: Check if IOS HLS formats are affected by player PO token enforcement; temporary + # XXX: Check if IOS HLS formats are affected by PO token enforcement; temporary # See https://github.com/yt-dlp/yt-dlp/issues/13511 if proto == 'hls' and client_name == 'ios': f['__needs_testing'] = True @@ -3625,39 +3623,62 @@ def process_manifest_format(f, proto, client_name, itag, po_token): subtitles = {} for sd in streaming_data: client_name = sd[STREAMING_DATA_CLIENT_NAME] - po_token = sd.get(STREAMING_DATA_INITIAL_PO_TOKEN) + fetch_pot_func = sd[STREAMING_DATA_FETCH_GVS_PO_TOKEN] + is_premium_subscriber = sd[STREAMING_DATA_IS_PREMIUM_SUBSCRIBER] + has_player_token = sd[STREAMING_DATA_PLAYER_TOKEN_PROVIDED] + hls_manifest_url = 'hls' not in skip_manifests and sd.get('hlsManifestUrl') if hls_manifest_url: + pot_policy: GvsPoTokenPolicy = self._get_default_ytcfg( + client_name)['GVS_PO_TOKEN_POLICY'][StreamingProtocol.HLS] + require_po_token = gvs_pot_required(pot_policy, is_premium_subscriber, has_player_token) + po_token = gvs_pots.get(client_name, fetch_pot_func(required=require_po_token or pot_policy.recommended)) if po_token: hls_manifest_url = hls_manifest_url.rstrip('/') + f'/pot/{po_token}' - fmts, subs = self._extract_m3u8_formats_and_subtitles( - hls_manifest_url, video_id, 'mp4', fatal=False, live=live_status == 'is_live') - for sub in traverse_obj(subs, (..., ..., {dict})): - # HLS subs (m3u8) do not need a PO token; save client name for debugging - sub[STREAMING_DATA_CLIENT_NAME] = client_name - subtitles = self._merge_subtitles(subs, subtitles) - for f in fmts: - if process_manifest_format(f, 'hls', client_name, self._search_regex( - r'/itag/(\d+)', f['url'], 'itag', default=None), po_token): - yield f + if client_name not in gvs_pots: + gvs_pots[client_name] = po_token + if require_po_token and not po_token and 'missing_pot' not in self._configuration_arg('formats'): + self._report_pot_format_skipped(video_id, client_name, 'hls') + else: + fmts, subs = self._extract_m3u8_formats_and_subtitles( + hls_manifest_url, video_id, 'mp4', fatal=False, live=live_status == 'is_live') + for sub in traverse_obj(subs, (..., ..., {dict})): + # TODO: If HLS video requires a PO Token, do the subs also require pot? 
+ # Save client name for debugging + sub[STREAMING_DATA_CLIENT_NAME] = client_name + subtitles = self._merge_subtitles(subs, subtitles) + for f in fmts: + if process_manifest_format(f, 'hls', client_name, self._search_regex( + r'/itag/(\d+)', f['url'], 'itag', default=None), require_po_token and not po_token): + yield f dash_manifest_url = 'dash' not in skip_manifests and sd.get('dashManifestUrl') if dash_manifest_url: + pot_policy: GvsPoTokenPolicy = self._get_default_ytcfg( + client_name)['GVS_PO_TOKEN_POLICY'][StreamingProtocol.DASH] + require_po_token = gvs_pot_required(pot_policy, is_premium_subscriber, has_player_token) + po_token = gvs_pots.get(client_name, fetch_pot_func(required=require_po_token or pot_policy.recommended)) if po_token: dash_manifest_url = dash_manifest_url.rstrip('/') + f'/pot/{po_token}' - formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False) - for sub in traverse_obj(subs, (..., ..., {dict})): - # TODO: Investigate if DASH subs ever need a PO token; save client name for debugging - sub[STREAMING_DATA_CLIENT_NAME] = client_name - subtitles = self._merge_subtitles(subs, subtitles) # Prioritize HLS subs over DASH - for f in formats: - if process_manifest_format(f, 'dash', client_name, f['format_id'], po_token): - f['filesize'] = int_or_none(self._search_regex( - r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None)) - if needs_live_processing: - f['is_from_start'] = True + if client_name not in gvs_pots: + gvs_pots[client_name] = po_token + if require_po_token and not po_token and 'missing_pot' not in self._configuration_arg('formats'): + self._report_pot_format_skipped(video_id, client_name, 'dash') + else: + formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False) + for sub in traverse_obj(subs, (..., ..., {dict})): + # TODO: If DASH video requires a PO Token, do the subs also require pot? 
+ # Save client name for debugging + sub[STREAMING_DATA_CLIENT_NAME] = client_name + subtitles = self._merge_subtitles(subs, subtitles) # Prioritize HLS subs over DASH + for f in formats: + if process_manifest_format(f, 'dash', client_name, f['format_id'], require_po_token and not po_token): + f['filesize'] = int_or_none(self._search_regex( + r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None)) + if needs_live_processing: + f['is_from_start'] = True - yield f + yield f yield subtitles def _extract_storyboard(self, player_responses, duration): @@ -3698,22 +3719,22 @@ def _extract_storyboard(self, player_responses, duration): } for j in range(math.ceil(fragment_count))], } - def _download_player_responses(self, url, smuggled_data, video_id, webpage_url): + def _download_initial_webpage(self, webpage_url, webpage_client, video_id): webpage = None - if 'webpage' not in self._configuration_arg('player_skip'): + if webpage_url and 'webpage' not in self._configuration_arg('player_skip'): query = {'bpctr': '9999999999', 'has_verified': '1'} - pp = self._configuration_arg('player_params', [None], casesense=True)[0] + pp = ( + self._configuration_arg('player_params', [None], casesense=True)[0] + or traverse_obj(INNERTUBE_CLIENTS, (webpage_client, 'PLAYER_PARAMS', {str})) + ) if pp: query['pp'] = pp - webpage = self._download_webpage_with_retries(webpage_url, video_id, query=query) - - master_ytcfg = self.extract_ytcfg(video_id, webpage) or self._get_default_ytcfg() - - player_responses, player_url = self._extract_player_responses( - self._get_requested_clients(url, smuggled_data), - video_id, webpage, master_ytcfg, smuggled_data) - - return webpage, master_ytcfg, player_responses, player_url + webpage = self._download_webpage_with_retries( + webpage_url, video_id, query=query, + headers=traverse_obj(self._get_default_ytcfg(webpage_client), { + 'User-Agent': ('INNERTUBE_CONTEXT', 'client', 'userAgent', {str}), + })) + return webpage def _list_formats(self, video_id, microformats, video_details, player_responses, player_url, duration=None): live_broadcast_details = traverse_obj(microformats, (..., 'liveBroadcastDetails')) @@ -3738,14 +3759,60 @@ def _list_formats(self, video_id, microformats, video_details, player_responses, return live_broadcast_details, live_status, streaming_data, formats, subtitles + def _download_initial_data(self, video_id, webpage, webpage_client, webpage_ytcfg): + initial_data = None + if webpage and 'initial_data' not in self._configuration_arg('webpage_skip'): + initial_data = self.extract_yt_initial_data(video_id, webpage, fatal=False) + if not traverse_obj(initial_data, 'contents'): + self.report_warning('Incomplete data received in embedded initial data; re-fetching using API.') + initial_data = None + if not initial_data and 'initial_data' not in self._configuration_arg('player_skip'): + query = {'videoId': video_id} + query.update(self._get_checkok_params()) + initial_data = self._extract_response( + item_id=video_id, ep='next', fatal=False, + ytcfg=webpage_ytcfg, query=query, check_get_keys='contents', + note='Downloading initial data API JSON', default_client=webpage_client) + return initial_data + + def _is_premium_subscriber(self, initial_data): + if not self.is_authenticated or not initial_data: + return False + + tlr = traverse_obj( + initial_data, ('topbar', 'desktopTopbarRenderer', 'logo', 'topbarLogoRenderer')) + return ( + traverse_obj(tlr, ('iconImage', 'iconType')) == 'YOUTUBE_PREMIUM_LOGO' + or 'premium' in 
(self._get_text(tlr, 'tooltipText') or '').lower() + ) + + def _initial_extract(self, url, smuggled_data, webpage_url, webpage_client, video_id): + # This function is also used by live-from-start refresh + webpage = self._download_initial_webpage(webpage_url, webpage_client, video_id) + webpage_ytcfg = self.extract_ytcfg(video_id, webpage) or self._get_default_ytcfg(webpage_client) + + initial_data = self._download_initial_data(video_id, webpage, webpage_client, webpage_ytcfg) + + is_premium_subscriber = self._is_premium_subscriber(initial_data) + if is_premium_subscriber: + self.write_debug('Detected YouTube Premium subscription') + + player_responses, player_url = self._extract_player_responses( + self._get_requested_clients(url, smuggled_data, is_premium_subscriber), + video_id, webpage, webpage_client, webpage_ytcfg, is_premium_subscriber) + + return webpage, webpage_ytcfg, initial_data, is_premium_subscriber, player_responses, player_url + def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) video_id = self._match_id(url) base_url = self.http_scheme() + '//www.youtube.com/' webpage_url = base_url + 'watch?v=' + video_id + webpage_client = 'web' - webpage, master_ytcfg, player_responses, player_url = self._download_player_responses(url, smuggled_data, video_id, webpage_url) + webpage, webpage_ytcfg, initial_data, is_premium_subscriber, player_responses, player_url = self._initial_extract( + url, smuggled_data, webpage_url, webpage_client, video_id) playability_statuses = traverse_obj( player_responses, (..., 'playabilityStatus'), expected_type=dict) @@ -4020,7 +4087,7 @@ def process_language(container, base_url, lang_code, sub_name, client_name, quer pctr = pr['captions']['playerCaptionsTracklistRenderer'] client_name = pr['streamingData'][STREAMING_DATA_CLIENT_NAME] innertube_client_name = pr['streamingData'][STREAMING_DATA_INNERTUBE_CONTEXT]['client']['clientName'] - required_contexts = self._get_default_ytcfg(client_name)['PO_TOKEN_REQUIRED_CONTEXTS'] + pot_policy: GvsPoTokenPolicy = self._get_default_ytcfg(client_name)['SUBS_PO_TOKEN_POLICY'] fetch_subs_po_token_func = pr['streamingData'][STREAMING_DATA_FETCH_SUBS_PO_TOKEN] pot_params = {} @@ -4033,11 +4100,11 @@ def process_language(container, base_url, lang_code, sub_name, client_name, quer requires_pot = ( # We can detect the experiment for now any(e in traverse_obj(qs, ('exp', ...)) for e in ('xpe', 'xpv')) - or _PoTokenContext.SUBS in required_contexts) + or (pot_policy.required and not (pot_policy.not_required_for_premium and is_premium_subscriber))) if not already_fetched_pot: already_fetched_pot = True - if subs_po_token := fetch_subs_po_token_func(required=requires_pot): + if subs_po_token := fetch_subs_po_token_func(required=requires_pot or pot_policy.recommended): pot_params.update({ 'pot': subs_po_token, 'potc': '1', @@ -4140,21 +4207,6 @@ def process_language(container, base_url, lang_code, sub_name, client_name, quer 'release_year': int_or_none(release_year), }) - initial_data = None - if webpage: - initial_data = self.extract_yt_initial_data(video_id, webpage, fatal=False) - if not traverse_obj(initial_data, 'contents'): - self.report_warning('Incomplete data received in embedded initial data; re-fetching using API.') - initial_data = None - if not initial_data and 'initial_data' not in self._configuration_arg('player_skip'): - query = {'videoId': video_id} - query.update(self._get_checkok_params()) - initial_data = self._extract_response( - item_id=video_id, ep='next', fatal=False, - 
ytcfg=master_ytcfg, query=query, check_get_keys='contents', - headers=self.generate_api_headers(ytcfg=master_ytcfg), - note='Downloading initial data API JSON') - COMMENTS_SECTION_IDS = ('comment-item-section', 'engagement-panel-comments-section') info['comment_count'] = traverse_obj(initial_data, ( 'contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', ..., 'itemSectionRenderer', @@ -4353,7 +4405,7 @@ def process_language(container, base_url, lang_code, sub_name, client_name, quer self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED) or get_first(microformats, 'isUnlisted', expected_type=bool)))) - info['__post_extractor'] = self.extract_comments(master_ytcfg, video_id, contents, webpage) + info['__post_extractor'] = self.extract_comments(webpage_ytcfg, video_id, contents, webpage) self.mark_watched(video_id, player_responses)
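For reference, the Premium detection that now selects `_DEFAULT_PREMIUM_CLIENTS` reduces to inspecting the topbar logo renderer of the initial data. A minimal standalone sketch (illustrative only; the real `_is_premium_subscriber()` additionally requires `self.is_authenticated` and resolves tooltip text runs via `self._get_text()`, and the sample dict below is fabricated):

def is_premium_subscriber(initial_data):
    # Mirrors the topbar-logo check from _is_premium_subscriber() above
    tlr = ((initial_data or {}).get('topbar') or {}).get('desktopTopbarRenderer') or {}
    tlr = (tlr.get('logo') or {}).get('topbarLogoRenderer') or {}
    icon_type = (tlr.get('iconImage') or {}).get('iconType')
    tooltip = tlr.get('tooltipText') or ''  # plain-string stand-in for YouTube's text runs
    return icon_type == 'YOUTUBE_PREMIUM_LOGO' or 'premium' in str(tooltip).lower()

sample = {'topbar': {'desktopTopbarRenderer': {'logo': {'topbarLogoRenderer': {
    'iconImage': {'iconType': 'YOUTUBE_PREMIUM_LOGO'}}}}}}
assert is_premium_subscriber(sample)
assert not is_premium_subscriber({})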