From c23d837b6524d1e7a4595948871ba1708cba4dfa Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 7 Jul 2025 15:25:34 -0500 Subject: [PATCH 01/11] [ie/youtube:tab] Fix subscriptions feed extraction (#13665) Adds support for LOCKUP_CONTENT_TYPE_VIDEO view models Closes #13658 Authored by: bashonly --- yt_dlp/extractor/youtube/_tab.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/youtube/_tab.py b/yt_dlp/extractor/youtube/_tab.py index c018ee8cfb..226e5ede3b 100644 --- a/yt_dlp/extractor/youtube/_tab.py +++ b/yt_dlp/extractor/youtube/_tab.py @@ -317,17 +317,31 @@ def _extract_lockup_view_model(self, view_model): content_id = view_model.get('contentId') if not content_id: return + content_type = view_model.get('contentType') - if content_type not in ('LOCKUP_CONTENT_TYPE_PLAYLIST', 'LOCKUP_CONTENT_TYPE_PODCAST'): + if content_type == 'LOCKUP_CONTENT_TYPE_VIDEO': + ie = YoutubeIE + url = f'https://www.youtube.com/watch?v={content_id}' + thumb_keys = (None,) + elif content_type in ('LOCKUP_CONTENT_TYPE_PLAYLIST', 'LOCKUP_CONTENT_TYPE_PODCAST'): + ie = YoutubeTabIE + url = f'https://www.youtube.com/playlist?list={content_id}' + thumb_keys = ('collectionThumbnailViewModel', 'primaryThumbnail') + else: self.report_warning( - f'Unsupported lockup view model content type "{content_type}"{bug_reports_message()}', only_once=True) + f'Unsupported lockup view model content type "{content_type}"{bug_reports_message()}', + only_once=True) return + return self.url_result( - f'https://www.youtube.com/playlist?list={content_id}', ie=YoutubeTabIE, video_id=content_id, + url, ie, content_id, title=traverse_obj(view_model, ( 'metadata', 'lockupMetadataViewModel', 'title', 'content', {str})), thumbnails=self._extract_thumbnails(view_model, ( - 'contentImage', 'collectionThumbnailViewModel', 'primaryThumbnail', 'thumbnailViewModel', 'image'), final_key='sources')) + 'contentImage', *thumb_keys, 'thumbnailViewModel', 'image'), final_key='sources'), + duration=traverse_obj(view_model, ( + 'contentImage', 'thumbnailViewModel', 'overlays', ..., 'thumbnailOverlayBadgeViewModel', + 'thumbnailBadges', ..., 'thumbnailBadgeViewModel', 'text', {parse_duration}, any))) def _rich_entries(self, rich_grid_renderer): if lockup_view_model := traverse_obj(rich_grid_renderer, ('content', 'lockupViewModel', {dict})): From 884f35d54a64f1e6e7be49459842f573fc3a2701 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 7 Jul 2025 22:54:27 -0500 Subject: [PATCH 02/11] [ie/BiliBiliBangumi] Fix geo-block detection (#13667) Closes #13634 Authored by: bashonly --- yt_dlp/extractor/bilibili.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 0f5c2c97e4..0c6535fc72 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -900,7 +900,9 @@ def _real_extract(self, url): headers=headers)) geo_blocked = traverse_obj(play_info, ( - 'raw', 'data', 'plugins', lambda _, v: v['name'] == 'AreaLimitPanel', 'config', 'is_block', {bool}, any)) + ('result', ('raw', 'data')), 'plugins', + lambda _, v: v['name'] == 'AreaLimitPanel', + 'config', 'is_block', {bool}, any)) premium_only = play_info.get('code') == -10403 video_info = traverse_obj(play_info, (('result', ('raw', 'data')), 'video_info', {dict}, any)) or {} @@ -914,7 +916,7 @@ def _real_extract(self, url): if traverse_obj(play_info, (( ('result', 
'play_check', 'play_detail'),  # 'PLAY_PREVIEW' vs 'PLAY_WHOLE'
-            ('raw', 'data', 'play_video_type'),  # 'preview' vs 'whole'
+            (('result', ('raw', 'data')), 'play_video_type'),  # 'preview' vs 'whole' vs 'none'
         ), any, {lambda x: x in ('PLAY_PREVIEW', 'preview')})):
             self.report_warning(
                 'Only preview format is available, '

From 7c49a937887756efcfa162abdcf17e48c244cb0c Mon Sep 17 00:00:00 2001
From: garret1317
Date: Tue, 8 Jul 2025 04:55:19 +0100
Subject: [PATCH 03/11] [ie/NhkRadiru] Fix metadata extraction (#12708)

Authored by: garret1317
---
 yt_dlp/extractor/nhk.py | 299 ++++++++++++++++++++++++++--------------
 1 file changed, 194 insertions(+), 105 deletions(-)

diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py
index 0d5e5b0e7e..14fbd6ce82 100644
--- a/yt_dlp/extractor/nhk.py
+++ b/yt_dlp/extractor/nhk.py
@@ -8,6 +8,8 @@
     get_element_by_class,
     int_or_none,
     join_nonempty,
+    make_archive_id,
+    orderedSet,
     parse_duration,
     remove_end,
     traverse_obj,
@@ -16,6 +18,7 @@
     unified_timestamp,
     url_or_none,
     urljoin,
+    variadic,
 )
 
 
@@ -591,102 +594,179 @@ class NhkRadiruIE(InfoExtractor):
     IE_DESC = 'NHK らじる (Radiru/Rajiru)'
     _VALID_URL = r'https?://www\.nhk\.or\.jp/radio/(?:player/ondemand|ondemand/detail)\.html\?p=(?P<site>[\da-zA-Z]+)_(?P<corner>[\da-zA-Z]+)(?:_(?P<headline>[\da-zA-Z]+))?'
     _TESTS = [{
-        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_4003239',
-        'skip': 'Episode expired on 2024-06-09',
+        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=LG96ZW5KZ4_01_4251382',
+        'skip': 'Episode expires on 2025-07-14',
         'info_dict': {
-            'title': 'ジャズ・トゥナイト ジャズ「Night and Day」特集',
-            'id': '0449_01_4003239',
+            'title': 'クラシックの庭\u3000特集「ドボルザークを聴く」(1)交響曲を中心に',
+            'id': 'LG96ZW5KZ4_01_4251382',
             'ext': 'm4a',
-            'uploader': 'NHK FM 東京',
-            'description': 'md5:ad05f3c3f3f6e99b2e69f9b5e49551dc',
-            'series': 'ジャズ・トゥナイト',
-            'channel': 'NHK FM 東京',
-            'thumbnail': 'https://www.nhk.or.jp/prog/img/449/g449.jpg',
-            'upload_date': '20240601',
-            'series_id': '0449_01',
-            'release_date': '20240601',
-            'timestamp': 1717257600,
-            'release_timestamp': 1717250400,
+            'description': 'md5:652d3c38a25b77959c716421eba1617a',
+            'uploader': 'NHK FM・東京',
+            'channel': 'NHK FM・東京',
+            'duration': 6597.0,
+            'thumbnail': 'https://www.nhk.jp/static/assets/images/radioseries/rs/LG96ZW5KZ4/LG96ZW5KZ4-eyecatch_a67c6e949325016c0724f2ed3eec8a2f.jpg',
+            'categories': ['音楽', 'クラシック・オペラ'],
+            'cast': ['田添菜穂子'],
+            'series': 'クラシックの庭',
+            'series_id': 'LG96ZW5KZ4',
+            'episode': '特集「ドボルザークを聴く」(1)交響曲を中心に',
+            'episode_id': 'QP1Q2ZXZY3',
+            'timestamp': 1751871000,
+            'upload_date': '20250707',
+            'release_timestamp': 1751864403,
+            'release_date': '20250707',
         },
     }, {
         # playlist, airs every weekday so it should _hopefully_ be okay forever
-        'url': 'https://www.nhk.or.jp/radio/ondemand/detail.html?p=0458_01',
+        'url': 'https://www.nhk.or.jp/radio/ondemand/detail.html?p=Z9L1V2M24L_01',
         'info_dict': {
-            'id': '0458_01',
+            'id': 'Z9L1V2M24L_01',
             'title': 'ベストオブクラシック',
             'description': '世界中の上質な演奏会をじっくり堪能する本格派クラシック番組。',
-            'thumbnail': 'https://www.nhk.or.jp/prog/img/458/g458.jpg',
-            'series_id': '0458_01',
+            'thumbnail': 'https://www.nhk.jp/static/assets/images/radioseries/rs/Z9L1V2M24L/Z9L1V2M24L-eyecatch_83ed28b4782907998875965fee60a351.jpg',
+            'series_id': 'Z9L1V2M24L_01',
             'uploader': 'NHK FM',
             'channel': 'NHK FM',
             'series': 'ベストオブクラシック',
         },
         'playlist_mincount': 3,
-    }, {
-        # one with letters in the id
-        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F683_01_3910688',
-        'note': 'Expires on 2025-03-31',
-        'info_dict': {
-            'id':
'F683_01_3910688', - 'ext': 'm4a', - 'title': '夏目漱石「文鳥」第1回', - 'series': '【らじる文庫】夏目漱石「文鳥」(全4回)', - 'series_id': 'F683_01', - 'description': '朗読:浅井理アナウンサー', - 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F683/img/roudoku_05_rod_640.jpg', - 'upload_date': '20240106', - 'release_date': '20240106', - 'uploader': 'NHK R1', - 'release_timestamp': 1704511800, - 'channel': 'NHK R1', - 'timestamp': 1704512700, - }, - 'expected_warnings': ['Unable to download JSON metadata', - 'Failed to get extended metadata. API returned Error 1: Invalid parameters'], }, { # news - 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F261_01_4012173', + 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=18439M2W42_02_4251212', + 'skip': 'Expires on 2025-07-15', 'info_dict': { - 'id': 'F261_01_4012173', + 'id': '18439M2W42_02_4251212', 'ext': 'm4a', - 'channel': 'NHKラジオ第1', + 'title': 'マイあさ! 午前5時のNHKニュース 2025年7月8日', 'uploader': 'NHKラジオ第1', + 'channel': 'NHKラジオ第1', + 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/18439M2W42/img/series_945_thumbnail.jpg', 'series': 'NHKラジオニュース', - 'title': '午前0時のNHKニュース', - 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F261/img/RADIONEWS_640.jpg', - 'release_timestamp': 1718290800, - 'release_date': '20240613', - 'timestamp': 1718291400, - 'upload_date': '20240613', + 'timestamp': 1751919420, + 'upload_date': '20250707', + 'release_timestamp': 1751918400, + 'release_date': '20250707', }, }, { # fallback when extended metadata fails - 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=2834_01_4009298', - 'skip': 'Expires on 2024-06-07', + 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=J8792PY43V_20_4253945', + 'skip': 'Expires on 2025-09-01', 'info_dict': { - 'id': '2834_01_4009298', - 'title': 'まち☆キラ!開成町特集', + 'id': 'J8792PY43V_20_4253945', 'ext': 'm4a', - 'release_date': '20240531', - 'upload_date': '20240531', - 'series': 'はま☆キラ!', - 'thumbnail': 'https://www.nhk.or.jp/prog/img/2834/g2834.jpg', - 'channel': 'NHK R1,FM', - 'description': '', - 'timestamp': 1717123800, - 'uploader': 'NHK R1,FM', - 'release_timestamp': 1717120800, - 'series_id': '2834_01', + 'title': '「後絶たない筋肉増強剤の使用」ワールドリポート', + 'description': '大濱 敦(ソウル支局)', + 'uploader': 'NHK R1', + 'channel': 'NHK R1', + 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/J8792PY43V/img/corner/box_31_thumbnail.jpg', + 'series': 'マイあさ! ワールドリポート', + 'series_id': 'J8792PY43V_20', + 'timestamp': 1751837100, + 'upload_date': '20250706', + 'release_timestamp': 1751835600, + 'release_date': '20250706', + }, - 'expected_warnings': ['Failed to get extended metadata. 
API returned empty list.'], + 'expected_warnings': ['Failed to download extended metadata: HTTP Error 404: Not Found'], }] _API_URL_TMPL = None + # The `_format_*` and `_make_*` functions are ported from: https://www.nhk.or.jp/radio/assets/js/timetable_detail_new.js + + def _format_act_list(self, act_list): + role_groups = {} + for act in traverse_obj(act_list, (..., {dict})): + role = act.get('role') + if role not in role_groups: + role_groups[role] = [] + role_groups[role].append(act) + + formatted_roles = [] + for role, acts in role_groups.items(): + for i, act in enumerate(acts): + res = f'【{role}】' if i == 0 and role is not None else '' + if title := act.get('title'): + res += f'{title}…' + formatted_roles.append(join_nonempty(res, act.get('name'), delim='')) + return join_nonempty(*formatted_roles, delim=',') + + def _make_artists(self, track, key): + artists = [] + for artist in traverse_obj(track, (key, ..., {dict})): + if res := join_nonempty(*traverse_obj(artist, (( + ('role', filter, {'{}…'.format}), + ('part', filter, {'({})'.format}), + ('name', filter), + ), {str})), delim=''): + artists.append(res) + + return '、'.join(artists) or None + + def _make_duration(self, track, key): + d = traverse_obj(track, (key, {parse_duration})) + if d is None: + return None + hours, remainder = divmod(d, 3600) + minutes, seconds = divmod(remainder, 60) + res = '(' + if hours > 0: + res += f'{int(hours)}時間' + if minutes > 0: + res += f'{int(minutes)}分' + res += f'{int(seconds):02}秒)' + return res + + def _format_music_list(self, music_list): + tracks = [] + for track in traverse_obj(music_list, (..., {dict})): + track_details = traverse_obj(track, (( + ('name', filter, {'「{}」'.format}), + ('lyricist', filter, {'{}:作詞'.format}), + ('composer', filter, {'{}:作曲'.format}), + ('arranger', filter, {'{}:編曲'.format}), + ), {str})) + + track_details.append(self._make_artists(track, 'byArtist')) + track_details.append(self._make_duration(track, 'duration')) + + if label := join_nonempty('label', 'code', delim=' ', from_dict=track): + track_details.append(f'<{label}>') + if location := traverse_obj(track, ('location', {str})): + track_details.append(f'~{location}~') + tracks.append(join_nonempty(*track_details, delim='\n')) + return '\n\n'.join(tracks) + + def _format_description(self, response): + detailed_description = traverse_obj(response, ('detailedDescription', {dict})) or {} + return join_nonempty( + join_nonempty('epg80', 'epg200', delim='\n\n', from_dict=detailed_description), + traverse_obj(response, ('misc', 'actList', {self._format_act_list})), + traverse_obj(response, ('misc', 'musicList', {self._format_music_list})), + delim='\n\n') + + def _get_thumbnails(self, data, keys, name=None, preference=-1): + thumbnails = [] + for size, thumb in traverse_obj(data, ( + *variadic(keys, (str, bytes, dict, set)), {dict.items}, + lambda _, v: v[0] != 'copyright' and url_or_none(v[1]['url']), + )): + thumbnails.append({ + 'url': thumb['url'], + 'width': int_or_none(thumb.get('width')), + 'height': int_or_none(thumb.get('height')), + 'preference': preference, + 'id': join_nonempty(name, size), + }) + preference -= 1 + return thumbnails + def _extract_extended_metadata(self, episode_id, aa_vinfo): service, _, area = traverse_obj(aa_vinfo, (2, {str}, {lambda x: (x or '').partition(',')})) + date_id = aa_vinfo[3] + detail_url = try_call( - lambda: self._API_URL_TMPL.format(area=area, service=service, dateid=aa_vinfo[3])) + lambda: self._API_URL_TMPL.format(broadcastEventId=join_nonempty(service, area, 
date_id))) if not detail_url: return {} @@ -699,36 +779,37 @@ def _extract_extended_metadata(self, episode_id, aa_vinfo): if error := traverse_obj(response, ('error', {dict})): self.report_warning( 'Failed to get extended metadata. API returned ' - f'Error {join_nonempty("code", "message", from_dict=error, delim=": ")}') + f'Error {join_nonempty("statuscode", "message", from_dict=error, delim=": ")}') return {} - full_meta = traverse_obj(response, ('list', service, 0, {dict})) - if not full_meta: - self.report_warning('Failed to get extended metadata. API returned empty list.') - return {} + station = traverse_obj(response, ('publishedOn', 'broadcastDisplayName', {str})) - station = ' '.join(traverse_obj(full_meta, (('service', 'area'), 'name', {str}))) or None - thumbnails = [{ - 'id': str(id_), - 'preference': 1 if id_.startswith('thumbnail') else -2 if id_.startswith('logo') else -1, - **traverse_obj(thumb, { - 'url': 'url', - 'width': ('width', {int_or_none}), - 'height': ('height', {int_or_none}), - }), - } for id_, thumb in traverse_obj(full_meta, ('images', {dict.items}, lambda _, v: v[1]['url']))] + thumbnails = [] + thumbnails.extend(self._get_thumbnails(response, ('about', 'eyecatch'))) + for num, dct in enumerate(traverse_obj(response, ('about', 'eyecatchList', ...))): + thumbnails.extend(self._get_thumbnails(dct, None, join_nonempty('list', num), -2)) + thumbnails.extend( + self._get_thumbnails(response, ('about', 'partOfSeries', 'eyecatch'), 'series', -3)) return filter_dict({ + 'description': self._format_description(response), + 'cast': traverse_obj(response, ('misc', 'actList', ..., 'name', {str})), + 'thumbnails': thumbnails, + **traverse_obj(response, { + 'title': ('name', {str}), + 'timestamp': ('endDate', {unified_timestamp}), + 'release_timestamp': ('startDate', {unified_timestamp}), + 'duration': ('duration', {parse_duration}), + }), + **traverse_obj(response, ('identifierGroup', { + 'series': ('radioSeriesName', {str}), + 'series_id': ('radioSeriesId', {str}), + 'episode': ('radioEpisodeName', {str}), + 'episode_id': ('radioEpisodeId', {str}), + 'categories': ('genre', ..., ['name1', 'name2'], {str}, all, {orderedSet}), + })), 'channel': station, 'uploader': station, - 'description': join_nonempty( - 'subtitle', 'content', 'act', 'music', delim='\n\n', from_dict=full_meta), - 'thumbnails': thumbnails, - **traverse_obj(full_meta, { - 'title': ('title', {str}), - 'timestamp': ('end_time', {unified_timestamp}), - 'release_timestamp': ('start_time', {unified_timestamp}), - }), }) def _extract_episode_info(self, episode, programme_id, series_meta): @@ -782,7 +863,9 @@ def _real_extract(self, url): site_id, corner_id, headline_id = self._match_valid_url(url).group('site', 'corner', 'headline') programme_id = f'{site_id}_{corner_id}' - if site_id == 'F261': # XXX: News programmes use old API (for now?) 
+ # XXX: News programmes use the old API + # Can't move this to NhkRadioNewsPageIE because news items still use the normal URL format + if site_id == '18439M2W42': meta = self._download_json( 'https://www.nhk.or.jp/s-media/news/news-site/list/v1/all.json', programme_id)['main'] series_meta = traverse_obj(meta, { @@ -843,8 +926,8 @@ class NhkRadioNewsPageIE(InfoExtractor): 'url': 'https://www.nhk.or.jp/radionews/', 'playlist_mincount': 5, 'info_dict': { - 'id': 'F261_01', - 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F261/img/RADIONEWS_640.jpg', + 'id': '18439M2W42_01', + 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/18439M2W42/img/series_945_thumbnail.jpg', 'description': 'md5:bf2c5b397e44bc7eb26de98d8f15d79d', 'channel': 'NHKラジオ第1', 'uploader': 'NHKラジオ第1', @@ -853,7 +936,7 @@ class NhkRadioNewsPageIE(InfoExtractor): }] def _real_extract(self, url): - return self.url_result('https://www.nhk.or.jp/radio/ondemand/detail.html?p=F261_01', NhkRadiruIE) + return self.url_result('https://www.nhk.or.jp/radio/ondemand/detail.html?p=18439M2W42_01', NhkRadiruIE) class NhkRadiruLiveIE(InfoExtractor): @@ -863,11 +946,12 @@ class NhkRadiruLiveIE(InfoExtractor): # radio 1, no area specified 'url': 'https://www.nhk.or.jp/radio/player/?ch=r1', 'info_dict': { - 'id': 'r1-tokyo', - 'title': 're:^NHKネットラジオ第1 東京.+$', + 'id': 'bs-r1-130', + 'title': 're:^NHKラジオ第1・東京.+$', 'ext': 'm4a', - 'thumbnail': 'https://www.nhk.or.jp/common/img/media/r1-200x200.png', + 'thumbnail': 'https://www.nhk.jp/assets/images/broadcastservice/bs/r1/r1-logo.svg', 'live_status': 'is_live', + '_old_archive_ids': ['nhkradirulive r1-tokyo'], }, }, { # radio 2, area specified @@ -875,26 +959,28 @@ class NhkRadiruLiveIE(InfoExtractor): 'url': 'https://www.nhk.or.jp/radio/player/?ch=r2', 'params': {'extractor_args': {'nhkradirulive': {'area': ['fukuoka']}}}, 'info_dict': { - 'id': 'r2-fukuoka', - 'title': 're:^NHKネットラジオ第2 福岡.+$', + 'id': 'bs-r2-400', + 'title': 're:^NHKラジオ第2.+$', 'ext': 'm4a', - 'thumbnail': 'https://www.nhk.or.jp/common/img/media/r2-200x200.png', + 'thumbnail': 'https://www.nhk.jp/assets/images/broadcastservice/bs/r2/r2-logo.svg', 'live_status': 'is_live', + '_old_archive_ids': ['nhkradirulive r2-fukuoka'], }, }, { # fm, area specified 'url': 'https://www.nhk.or.jp/radio/player/?ch=fm', 'params': {'extractor_args': {'nhkradirulive': {'area': ['sapporo']}}}, 'info_dict': { - 'id': 'fm-sapporo', - 'title': 're:^NHKネットラジオFM 札幌.+$', + 'id': 'bs-r3-010', + 'title': 're:^NHK FM・札幌.+$', 'ext': 'm4a', - 'thumbnail': 'https://www.nhk.or.jp/common/img/media/fm-200x200.png', + 'thumbnail': 'https://www.nhk.jp/assets/images/broadcastservice/bs/r3/r3-logo.svg', 'live_status': 'is_live', + '_old_archive_ids': ['nhkradirulive fm-sapporo'], }, }] - _NOA_STATION_IDS = {'r1': 'n1', 'r2': 'n2', 'fm': 'n3'} + _NOA_STATION_IDS = {'r1': 'r1', 'r2': 'r2', 'fm': 'r3'} def _real_extract(self, url): station = self._match_id(url) @@ -911,12 +997,15 @@ def _real_extract(self, url): noa_info = self._download_json( f'https:{config.find(".//url_program_noa").text}'.format(area=data.find('areakey').text), station, note=f'Downloading {area} station metadata', fatal=False) - present_info = traverse_obj(noa_info, ('nowonair_list', self._NOA_STATION_IDS.get(station), 'present')) + broadcast_service = traverse_obj(noa_info, (self._NOA_STATION_IDS.get(station), 'publishedOn')) return { - 'title': ' '.join(traverse_obj(present_info, (('service', 'area'), 'name', {str}))), - 'id': join_nonempty(station, area), - 'thumbnails': 
traverse_obj(present_info, ('service', 'images', ..., { + **traverse_obj(broadcast_service, { + 'title': ('broadcastDisplayName', {str}), + 'id': ('id', {str}), + }), + '_old_archive_ids': [make_archive_id(self, join_nonempty(station, area))], + 'thumbnails': traverse_obj(broadcast_service, ('logo', ..., { 'url': 'url', 'width': ('width', {int_or_none}), 'height': ('height', {int_or_none}), From 99093e96fd6a26dea9d6e4bd1e4b16283b6ad1ee Mon Sep 17 00:00:00 2001 From: barsnick Date: Tue, 8 Jul 2025 06:18:15 +0200 Subject: [PATCH 04/11] [devscripts] Fix filename/directory Bash completions (#13620) Closes #13619 Authored by: barsnick --- devscripts/bash-completion.in | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/devscripts/bash-completion.in b/devscripts/bash-completion.in index 21f52798ed..bb66c20956 100644 --- a/devscripts/bash-completion.in +++ b/devscripts/bash-completion.in @@ -10,9 +10,13 @@ __yt_dlp() diropts="--cache-dir" if [[ ${prev} =~ ${fileopts} ]]; then + local IFS=$'\n' + type compopt &>/dev/null && compopt -o filenames COMPREPLY=( $(compgen -f -- ${cur}) ) return 0 elif [[ ${prev} =~ ${diropts} ]]; then + local IFS=$'\n' + type compopt &>/dev/null && compopt -o dirnames COMPREPLY=( $(compgen -d -- ${cur}) ) return 0 fi From fd36b8f31bafbd8096bdb92a446a0c9c6081209c Mon Sep 17 00:00:00 2001 From: InvalidUsernameException Date: Tue, 8 Jul 2025 06:19:03 +0200 Subject: [PATCH 05/11] [test:download] Support `playlist_maxcount` (#13433) Authored by: InvalidUsernameException --- test/test_download.py | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/test/test_download.py b/test/test_download.py index 3f36869d9d..c7842735c2 100755 --- a/test/test_download.py +++ b/test/test_download.py @@ -14,6 +14,7 @@ from test.helper import ( assertGreaterEqual, + assertLessEqual, expect_info_dict, expect_warnings, get_params, @@ -121,10 +122,13 @@ def print_skipping(reason): params = get_params(test_case.get('params', {})) params['outtmpl'] = tname + '_' + params['outtmpl'] if is_playlist and 'playlist' not in test_case: - params.setdefault('extract_flat', 'in_playlist') - params.setdefault('playlistend', test_case.get( - 'playlist_mincount', test_case.get('playlist_count', -2) + 1)) + params.setdefault('playlistend', max( + test_case.get('playlist_mincount', -1), + test_case.get('playlist_count', -2) + 1, + test_case.get('playlist_maxcount', -2) + 1)) params.setdefault('skip_download', True) + if 'playlist_duration_sum' not in test_case: + params.setdefault('extract_flat', 'in_playlist') ydl = YoutubeDL(params, auto_init=False) ydl.add_default_info_extractors() @@ -159,6 +163,7 @@ def try_rm_tcs_files(tcs=None): try_rm(os.path.splitext(tc_filename)[0] + '.info.json') try_rm_tcs_files() try: + test_url = test_case['url'] try_num = 1 while True: try: @@ -166,7 +171,7 @@ def try_rm_tcs_files(tcs=None): # for outside error handling, and returns the exit code # instead of the result dict. 
res_dict = ydl.extract_info( - test_case['url'], + test_url, force_generic_extractor=params.get('force_generic_extractor', False)) except (DownloadError, ExtractorError) as err: # Check if the exception is not a network related one @@ -194,23 +199,23 @@ def try_rm_tcs_files(tcs=None): self.assertTrue('entries' in res_dict) expect_info_dict(self, res_dict, test_case.get('info_dict', {})) + num_entries = len(res_dict.get('entries', [])) if 'playlist_mincount' in test_case: + mincount = test_case['playlist_mincount'] assertGreaterEqual( - self, - len(res_dict['entries']), - test_case['playlist_mincount'], - 'Expected at least %d in playlist %s, but got only %d' % ( - test_case['playlist_mincount'], test_case['url'], - len(res_dict['entries']))) + self, num_entries, mincount, + f'Expected at least {mincount} entries in playlist {test_url}, but got only {num_entries}') if 'playlist_count' in test_case: + count = test_case['playlist_count'] + got = num_entries if num_entries <= count else 'more' self.assertEqual( - len(res_dict['entries']), - test_case['playlist_count'], - 'Expected %d entries in playlist %s, but got %d.' % ( - test_case['playlist_count'], - test_case['url'], - len(res_dict['entries']), - )) + num_entries, count, + f'Expected exactly {count} entries in playlist {test_url}, but got {got}') + if 'playlist_maxcount' in test_case: + maxcount = test_case['playlist_maxcount'] + assertLessEqual( + self, num_entries, maxcount, + f'Expected at most {maxcount} entries in playlist {test_url}, but got more') if 'playlist_duration_sum' in test_case: got_duration = sum(e['duration'] for e in res_dict['entries']) self.assertEqual( From aa9f1f4d577e99897ac16cd19d4e217d688ea75d Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Wed, 9 Jul 2025 18:29:54 +1200 Subject: [PATCH 06/11] [ie/youtube] Log bad playability statuses of player responses (#13647) Authored by: coletdjnz --- yt_dlp/extractor/youtube/_video.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index 8fa3b0a347..208abee937 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -3273,6 +3273,10 @@ def append_client(*client_names): # web_creator may work around age-verification for all videos but requires PO token append_client('tv_embedded', 'web_creator') + status = traverse_obj(pr, ('playabilityStatus', 'status', {str})) + if status not in ('OK', 'LIVE_STREAM_OFFLINE', 'AGE_CHECK_REQUIRED', 'AGE_VERIFICATION_REQUIRED'): + self.write_debug(f'{video_id}: {client} player response playability status: {status}') + prs.extend(deprioritized_prs) if skipped_clients: From 805519bfaa7cb5443912dfe45ac774834ba65a16 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Wed, 9 Jul 2025 15:45:47 -0500 Subject: [PATCH 07/11] [jsinterp] Fix undefined variable name caching (#13677) Fix b342d27f3f82d913976509ddf5bff539ad8567ec Authored by: bashonly --- test/test_jsinterp.py | 5 +++++ test/test_youtube_signature.py | 4 ++++ yt_dlp/jsinterp.py | 5 +++-- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index a1088cea49..43b1d0fdee 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -536,6 +536,11 @@ def test_nested_function_scoping(self): } ''', 31) + def test_undefined_varnames(self): + jsi = JSInterpreter('function f(){ var a; return [a, b]; }') + self._test(jsi, [JS_Undefined, JS_Undefined]) + 
+        self.assertEqual(jsi._undefined_varnames, {'b'})
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py
index 98607df55e..4562467534 100644
--- a/test/test_youtube_signature.py
+++ b/test/test_youtube_signature.py
@@ -373,6 +373,10 @@
         'https://www.youtube.com/s/player/e12fbea4/player_ias_tce.vflset/en_US/base.js',
         'kM5r52fugSZRAKHfo3', 'XkeRfXIPOkSwfg',
     ),
+    (
+        'https://www.youtube.com/s/player/ef259203/player_ias_tce.vflset/en_US/base.js',
+        'rPqBC01nJpqhhi2iA2U', 'hY7dbiKFT51UIA',
+    ),
 ]
 
 
diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py
index f06d96832f..460bc2c03e 100644
--- a/yt_dlp/jsinterp.py
+++ b/yt_dlp/jsinterp.py
@@ -677,8 +677,9 @@ def dict_item(key, val):
                 # Set value as JS_Undefined or its pre-existing value
                 local_vars.set_local(var, ret)
             else:
-                ret = local_vars.get(var, JS_Undefined)
-                if ret is JS_Undefined:
+                ret = local_vars.get(var, NO_DEFAULT)
+                if ret is NO_DEFAULT:
+                    ret = JS_Undefined
                     self._undefined_varnames.add(var)
         return ret, should_return
 
From 0b359b184dee0c7052be482857bf562de67e4928 Mon Sep 17 00:00:00 2001
From: bashonly <88596187+bashonly@users.noreply.github.com>
Date: Wed, 9 Jul 2025 16:58:19 -0500
Subject: [PATCH 08/11] [ie/9gag] Support browser impersonation (#13678)

Closes #10837

Authored by: bashonly
---
 yt_dlp/extractor/ninegag.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/yt_dlp/extractor/ninegag.py b/yt_dlp/extractor/ninegag.py
index 2979f3a50e..1b88e9c544 100644
--- a/yt_dlp/extractor/ninegag.py
+++ b/yt_dlp/extractor/ninegag.py
@@ -1,6 +1,5 @@
 from .common import InfoExtractor
 from ..utils import (
-    ExtractorError,
     determine_ext,
     int_or_none,
     traverse_obj,
@@ -61,10 +60,10 @@ def _real_extract(self, url):
         post = self._download_json(
             'https://9gag.com/v1/post', post_id, query={
                 'id': post_id,
-            })['data']['post']
+            }, impersonate=True)['data']['post']
 
         if post.get('type') != 'Animated':
-            raise ExtractorError(
+            self.raise_no_formats(
                 'The given url does not contain a video',
                 expected=True)
 
From 7b4c96e0898db048259ef5fdf12ed14e3605dce3 Mon Sep 17 00:00:00 2001
From: Nikolay Fedorov <40500428+swayll@users.noreply.github.com>
Date: Thu, 10 Jul 2025 01:16:33 +0300
Subject: [PATCH 09/11] [ie/mir24.tv] Add extractor (#13651)

Closes #13365

Authored by: swayll
---
 yt_dlp/extractor/_extractors.py |  1 +
 yt_dlp/extractor/mir24tv.py     | 37 +++++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+)
 create mode 100644 yt_dlp/extractor/mir24tv.py

diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index ada12b3a8a..84da570b0a 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -1147,6 +1147,7 @@
     MindsIE,
 )
 from .minoto import MinotoIE
+from .mir24tv import Mir24TvIE
 from .mirrativ import (
     MirrativIE,
     MirrativUserIE,
 )
diff --git a/yt_dlp/extractor/mir24tv.py b/yt_dlp/extractor/mir24tv.py
new file mode 100644
index 0000000000..5832901bf1
--- /dev/null
+++ b/yt_dlp/extractor/mir24tv.py
@@ -0,0 +1,37 @@
+from .common import InfoExtractor
+from ..utils import parse_qs, url_or_none
+from ..utils.traversal import require, traverse_obj
+
+
+class Mir24TvIE(InfoExtractor):
+    IE_NAME = 'mir24.tv'
+    _VALID_URL = r'https?://(?:www\.)?mir24\.tv/news/(?P<id>[0-9]+)/[^/?#]+'
+    _TESTS = [{
+        'url': 'https://mir24.tv/news/16635210/dni-kultury-rossii-otkrylis-v-uzbekistane.-na-prazdnichnom-koncerte-vystupili-zvezdy-rossijskoj-estrada',
+        'info_dict': {
+            'id': '16635210',
+            'title': 'Дни культуры России открылись в Узбекистане. На праздничном концерте выступили звезды российской эстрады',
+            'ext': 'mp4',
+            'thumbnail': r're:https://images\.mir24\.tv/.+\.jpg',
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id, impersonate=True)
+
+        iframe_url = self._search_regex(
+            r'<iframe\b[^>]+\bsrc=["\'](https?://mir24\.tv/players/[^"\']+)',
+            webpage, 'iframe URL')
+
+        m3u8_url = traverse_obj(iframe_url, (
+            {parse_qs}, 'source', -1, {self._proto_relative_url}, {url_or_none}, {require('m3u8 URL')}))
+        formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4', m3u8_id='hls')
+
+        return {
+            'id': video_id,
+            'title': self._og_search_title(webpage, default=None) or self._html_extract_title(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage, default=None),
+            'formats': formats,
+            'subtitles': subtitles,
+        }

From 2aaf1aa71d174700859c9ec1a81109b78e34961c Mon Sep 17 00:00:00 2001
From: doe1080 <98906116+doe1080@users.noreply.github.com>
Date: Thu, 10 Jul 2025 07:21:47 +0900
Subject: [PATCH 10/11] [ie/newspicks] Fix extractor (#13612)

Closes #10472

Authored by: doe1080
---
 yt_dlp/extractor/newspicks.py | 93 +++++++++++++++++++++--------------
 1 file changed, 56 insertions(+), 37 deletions(-)

diff --git a/yt_dlp/extractor/newspicks.py b/yt_dlp/extractor/newspicks.py
index 4a1cb0a735..5f19eed984 100644
--- a/yt_dlp/extractor/newspicks.py
+++ b/yt_dlp/extractor/newspicks.py
@@ -1,53 +1,72 @@
-import re
-
 from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (
+    clean_html,
+    parse_iso8601,
+    parse_qs,
+    url_or_none,
+)
+from ..utils.traversal import require, traverse_obj
 
 
 class NewsPicksIE(InfoExtractor):
-    _VALID_URL = r'https?://newspicks\.com/movie-series/(?P<channel_id>\d+)\?movieId=(?P<id>\d+)'
-
+    _VALID_URL = r'https?://newspicks\.com/movie-series/(?P<id>[^?/#]+)'
    _TESTS = [{
-        'url': 'https://newspicks.com/movie-series/11?movieId=1813',
+        'url': 'https://newspicks.com/movie-series/11/?movieId=1813',
         'info_dict': {
             'id': '1813',
-            'title': '日本の課題を破壊せよ【ゲスト:成田悠輔】',
-            'description': 'md5:09397aad46d6ded6487ff13f138acadf',
-            'channel': 'HORIE ONE',
-            'channel_id': '11',
-            'release_date': '20220117',
-            'thumbnail': r're:https://.+jpg',
             'ext': 'mp4',
+            'title': '日本の課題を破壊せよ【ゲスト:成田悠輔】',
+            'cast': 'count:4',
+            'description': 'md5:09397aad46d6ded6487ff13f138acadf',
+            'duration': 2940,
+            'release_date': '20220117',
+            'release_timestamp': 1642424400,
+            'series': 'HORIE ONE',
+            'series_id': '11',
+            'thumbnail': r're:https?://resources\.newspicks\.com/.+\.(?:jpe?g|png)',
+            'timestamp': 1642424420,
+            'upload_date': '20220117',
         },
+    }, {
+        'url': 'https://newspicks.com/movie-series/158/?movieId=3932',
+        'info_dict': {
+            'id': '3932',
+            'ext': 'mp4',
+            'title': '【検証】専門家は、KADOKAWAをどう見るか',
+            'cast': 'count:3',
+            'description': 'md5:2c2d4bf77484a4333ec995d676f9a91d',
+            'duration': 1320,
+            'release_date': '20240622',
+            'release_timestamp': 1719088080,
+            'series': 'NPレポート',
+            'series_id': '158',
+            'thumbnail': r're:https?://resources\.newspicks\.com/.+\.(?:jpe?g|png)',
+            'timestamp': 1719086400,
+            'upload_date': '20240622',
+        },
     }]
 
     def _real_extract(self, url):
-        video_id, channel_id = self._match_valid_url(url).group('id', 'channel_id')
+        series_id = self._match_id(url)
+        video_id = traverse_obj(parse_qs(url), ('movieId', -1, {str}, {require('movie ID')}))
         webpage = self._download_webpage(url, video_id)
-        entries = self._parse_html5_media_entries(
-            url, webpage.replace('movie-for-pc', 'movie'), video_id, 'hls')
-        if not entries:
-            raise ExtractorError('No HTML5 media elements found')
-        info = entries[0]
-
-        title = self._html_search_meta('og:title', webpage, fatal=False)
-        description = self._html_search_meta(
-            ('og:description', 'twitter:title'), webpage, fatal=False)
-        channel = self._html_search_regex(
-            r'value="11".+?<div\s+class="title">(.+?)</div', webpage, 'channel name', fatal=False)
-        release_date = self._search_regex(
-            r'<span\s+class="on-air-date">\s*(\d+)年(\d+)月(\d+)日\s*</span>',
-            webpage, 'release date', fatal=False, group=(1, 2, 3))
-
-        info.update({
+        return {
             'id': video_id,
-            'title': title,
-            'description': description,
-            'channel': channel,
-            'channel_id': channel_id,
-            'release_date': ('%04d%02d%02d' % tuple(map(int, release_date))) if release_date else None,
-        })
-        return info
+            'formats': formats,
+            'series': traverse_obj(fragment, ('series', 'title', {str})),
+            'series_id': series_id,
+            'subtitles': subtitles,
+            **traverse_obj(fragment, ('movie', {
+                'title': ('title', {str}),
+                'cast': ('relatedUsers', ..., 'displayName', {str}, filter, all, filter),
+                'description': ('explanation', {clean_html}),
+                'release_timestamp': ('onAirStartDate', {parse_iso8601}),
+                'thumbnail': (('image', 'coverImageUrl'), {url_or_none}, any),
+                'timestamp': ('published', {parse_iso8601}),
+            })),
+        }

From 5b57b72c1a7c6bd249ffcebdf5630761ec664c10 Mon Sep 17 00:00:00 2001
From: coletdjnz
Date: Fri, 11 Jul 2025 18:54:01 +1200
Subject: [PATCH 11/11] [ie/youtube] Do not require PO Token for premium
 accounts (#13640)

Authored by: coletdjnz
---
 README.md                          |   1 +
 yt_dlp/extractor/youtube/_base.py  | 157 +++++++++++++-
 yt_dlp/extractor/youtube/_video.py | 318 +++++++++++++++++------------
 3 files changed, 335 insertions(+), 141 deletions(-)

diff --git a/README.md b/README.md
index e476c0084b..c1a9356923 100644
--- a/README.md
+++ b/README.md
@@ -1799,6 +1799,7 @@ #### youtube
 * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively
 * `player_client`: Clients to extract video data from. The currently available clients are `web`, `web_safari`, `web_embedded`, `web_music`, `web_creator`, `mweb`, `ios`, `android`, `android_vr`, `tv`, `tv_simply` and `tv_embedded`. By default, `tv,ios,web` is used, or `tv,web` is used when authenticating with cookies. The `web_music` client is added for `music.youtube.com` URLs when logged-in cookies are used. The `web_embedded` client is added for age-restricted videos but only works if the video is embeddable. The `tv_embedded` and `web_creator` clients are added for age-restricted videos if account age-verification is required. Some clients, such as `web` and `web_music`, require a `po_token` for their formats to be downloadable. Some clients, such as `web_creator`, will only work with authentication. Not all clients support authentication via cookies. You can use `default` for the default clients, or you can use `all` for all clients (not recommended). You can prefix a client with `-` to exclude it, e.g. `youtube:player_client=default,-ios`
 * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player), `initial_data` (skip initial data/next ep request). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause issues such as missing formats or metadata. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) and [#12826](https://github.com/yt-dlp/yt-dlp/issues/12826) for more details
See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) and [#12826](https://github.com/yt-dlp/yt-dlp/issues/12826) for more details +* `webpage_skip`: Skip extraction of embedded webpage data. One or both of `player_response`, `initial_data`. These options are for testing purposes and don't skip any network requests * `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp. * `player_js_variant`: The player javascript variant to use for signature and nsig deciphering. The known variants are: `main`, `tce`, `tv`, `tv_es6`, `phone`, `tablet`. Only `main` is recommended as a possible workaround; the others are for debugging purposes. The default is to use what is prescribed by the site, and can be selected with `actual` * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) diff --git a/yt_dlp/extractor/youtube/_base.py b/yt_dlp/extractor/youtube/_base.py index 5aee89b917..7d9cbf8ee4 100644 --- a/yt_dlp/extractor/youtube/_base.py +++ b/yt_dlp/extractor/youtube/_base.py @@ -1,5 +1,6 @@ import calendar import copy +import dataclasses import datetime as dt import enum import functools @@ -38,6 +39,60 @@ class _PoTokenContext(enum.Enum): SUBS = 'subs' +class StreamingProtocol(enum.Enum): + HTTPS = 'https' + DASH = 'dash' + HLS = 'hls' + + +@dataclasses.dataclass +class BasePoTokenPolicy: + required: bool = False + # Try to fetch a PO Token even if it is not required. + recommended: bool = False + not_required_for_premium: bool = False + + +@dataclasses.dataclass +class GvsPoTokenPolicy(BasePoTokenPolicy): + not_required_with_player_token: bool = False + + +@dataclasses.dataclass +class PlayerPoTokenPolicy(BasePoTokenPolicy): + pass + + +@dataclasses.dataclass +class SubsPoTokenPolicy(BasePoTokenPolicy): + pass + + +WEB_PO_TOKEN_POLICIES = { + 'GVS_PO_TOKEN_POLICY': { + StreamingProtocol.HTTPS: GvsPoTokenPolicy( + required=True, + recommended=True, + not_required_for_premium=True, + not_required_with_player_token=False, + ), + StreamingProtocol.DASH: GvsPoTokenPolicy( + required=True, + recommended=True, + not_required_for_premium=True, + not_required_with_player_token=False, + ), + StreamingProtocol.HLS: GvsPoTokenPolicy( + required=False, + recommended=True, + ), + }, + 'PLAYER_PO_TOKEN_POLICY': PlayerPoTokenPolicy(required=False), + # In rollout, currently detected via experiment + # Premium users DO require a PO Token for subtitles + 'SUBS_PO_TOKEN_POLICY': SubsPoTokenPolicy(required=False), +} + # any clients starting with _ cannot be explicitly requested by the user INNERTUBE_CLIENTS = { 'web': { @@ -48,8 +103,8 @@ class _PoTokenContext(enum.Enum): }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 1, - 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], 'SUPPORTS_COOKIES': True, + **WEB_PO_TOKEN_POLICIES, }, # Safari UA returns pre-merged video+audio 144p/240p/360p/720p/1080p HLS formats 'web_safari': { @@ -61,8 +116,8 @@ class _PoTokenContext(enum.Enum): }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 1, - 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], 'SUPPORTS_COOKIES': True, + **WEB_PO_TOKEN_POLICIES, 'PLAYER_PARAMS': '8AEB', }, 'web_embedded': { @@ -84,7 +139,24 @@ class _PoTokenContext(enum.Enum): }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 67, - 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], + 'GVS_PO_TOKEN_POLICY': { + StreamingProtocol.HTTPS: GvsPoTokenPolicy( + required=True, + recommended=True, + not_required_for_premium=True, + not_required_with_player_token=False, + ), + StreamingProtocol.DASH: 
GvsPoTokenPolicy( + required=True, + recommended=True, + not_required_for_premium=True, + not_required_with_player_token=False, + ), + StreamingProtocol.HLS: GvsPoTokenPolicy( + required=False, + recommended=True, + ), + }, 'SUPPORTS_COOKIES': True, }, # This client now requires sign-in for every video @@ -96,7 +168,24 @@ class _PoTokenContext(enum.Enum): }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 62, - 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], + 'GVS_PO_TOKEN_POLICY': { + StreamingProtocol.HTTPS: GvsPoTokenPolicy( + required=True, + recommended=True, + not_required_for_premium=True, + not_required_with_player_token=False, + ), + StreamingProtocol.DASH: GvsPoTokenPolicy( + required=True, + recommended=True, + not_required_for_premium=True, + not_required_with_player_token=False, + ), + StreamingProtocol.HLS: GvsPoTokenPolicy( + required=False, + recommended=True, + ), + }, 'REQUIRE_AUTH': True, 'SUPPORTS_COOKIES': True, }, @@ -113,7 +202,24 @@ class _PoTokenContext(enum.Enum): }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 3, 'REQUIRE_JS_PLAYER': False, - 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], + 'GVS_PO_TOKEN_POLICY': { + StreamingProtocol.HTTPS: GvsPoTokenPolicy( + required=True, + recommended=True, + not_required_with_player_token=True, + ), + StreamingProtocol.DASH: GvsPoTokenPolicy( + required=True, + recommended=True, + not_required_with_player_token=True, + ), + StreamingProtocol.HLS: GvsPoTokenPolicy( + required=False, + recommended=True, + not_required_with_player_token=True, + ), + }, + 'PLAYER_PO_TOKEN_POLICY': PlayerPoTokenPolicy(required=False, recommended=True), }, # YouTube Kids videos aren't returned on this client for some reason 'android_vr': { @@ -147,7 +253,21 @@ class _PoTokenContext(enum.Enum): }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 5, - 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], + 'GVS_PO_TOKEN_POLICY': { + StreamingProtocol.HTTPS: GvsPoTokenPolicy( + required=True, + recommended=True, + not_required_with_player_token=True, + ), + # HLS Livestreams require POT 30 seconds in + # TODO: Rolling out + StreamingProtocol.HLS: GvsPoTokenPolicy( + required=False, + recommended=True, + not_required_with_player_token=True, + ), + }, + 'PLAYER_PO_TOKEN_POLICY': PlayerPoTokenPolicy(required=False, recommended=True), 'REQUIRE_JS_PLAYER': False, }, # mweb has 'ultralow' formats @@ -162,7 +282,24 @@ class _PoTokenContext(enum.Enum): }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 2, - 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], + 'GVS_PO_TOKEN_POLICY': { + StreamingProtocol.HTTPS: GvsPoTokenPolicy( + required=True, + recommended=True, + not_required_for_premium=True, + not_required_with_player_token=False, + ), + StreamingProtocol.DASH: GvsPoTokenPolicy( + required=True, + recommended=True, + not_required_for_premium=True, + not_required_with_player_token=False, + ), + StreamingProtocol.HLS: GvsPoTokenPolicy( + required=False, + recommended=True, + ), + }, 'SUPPORTS_COOKIES': True, }, 'tv': { @@ -226,7 +363,11 @@ def build_innertube_clients(): for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()): ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com') ytcfg.setdefault('REQUIRE_JS_PLAYER', True) - ytcfg.setdefault('PO_TOKEN_REQUIRED_CONTEXTS', []) + ytcfg.setdefault('GVS_PO_TOKEN_POLICY', {}) + for protocol in StreamingProtocol: + ytcfg['GVS_PO_TOKEN_POLICY'].setdefault(protocol, GvsPoTokenPolicy()) + ytcfg.setdefault('PLAYER_PO_TOKEN_POLICY', PlayerPoTokenPolicy()) + ytcfg.setdefault('SUBS_PO_TOKEN_POLICY', SubsPoTokenPolicy()) ytcfg.setdefault('REQUIRE_AUTH', 
False) ytcfg.setdefault('SUPPORTS_COOKIES', False) ytcfg.setdefault('PLAYER_PARAMS', None) diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index 208abee937..fc1f087ace 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -18,6 +18,9 @@ from ._base import ( INNERTUBE_CLIENTS, BadgeType, + GvsPoTokenPolicy, + PlayerPoTokenPolicy, + StreamingProtocol, YoutubeBaseInfoExtractor, _PoTokenContext, _split_innertube_client, @@ -71,9 +74,11 @@ from ...utils.networking import clean_headers, clean_proxies, select_proxy STREAMING_DATA_CLIENT_NAME = '__yt_dlp_client' -STREAMING_DATA_INITIAL_PO_TOKEN = '__yt_dlp_po_token' STREAMING_DATA_FETCH_SUBS_PO_TOKEN = '__yt_dlp_fetch_subs_po_token' +STREAMING_DATA_FETCH_GVS_PO_TOKEN = '__yt_dlp_fetch_gvs_po_token' +STREAMING_DATA_PLAYER_TOKEN_PROVIDED = '__yt_dlp_player_token_provided' STREAMING_DATA_INNERTUBE_CONTEXT = '__yt_dlp_innertube_context' +STREAMING_DATA_IS_PREMIUM_SUBSCRIBER = '__yt_dlp_is_premium_subscriber' PO_TOKEN_GUIDE_URL = 'https://github.com/yt-dlp/yt-dlp/wiki/PO-Token-Guide' @@ -253,6 +258,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'srt', 'vtt') _DEFAULT_CLIENTS = ('tv', 'ios', 'web') _DEFAULT_AUTHED_CLIENTS = ('tv', 'web') + # Premium does not require POT (except for subtitles) + _DEFAULT_PREMIUM_CLIENTS = ('tv', 'web') _GEO_BYPASS = False @@ -1833,7 +1840,8 @@ def refetch_manifest(format_id, delay): if time.time() <= start_time + delay: return - _, _, prs, player_url = self._download_player_responses(url, smuggled_data, video_id, webpage_url) + _, _, _, _, prs, player_url = self._initial_extract( + url, smuggled_data, webpage_url, 'web', video_id) video_details = traverse_obj(prs, (..., 'videoDetails'), expected_type=dict) microformats = traverse_obj( prs, (..., 'microformat', 'playerMicroformatRenderer'), @@ -2891,7 +2899,7 @@ def _get_config_po_token(self, client: str, context: _PoTokenContext): only_once=True) continue - def fetch_po_token(self, client='web', context=_PoTokenContext.GVS, ytcfg=None, visitor_data=None, + def fetch_po_token(self, client='web', context: _PoTokenContext = _PoTokenContext.GVS, ytcfg=None, visitor_data=None, data_sync_id=None, session_index=None, player_url=None, video_id=None, webpage=None, required=False, **kwargs): """ @@ -2976,7 +2984,6 @@ def _fetch_po_token(self, client, **kwargs): fetch_pot_policy == 'never' or ( fetch_pot_policy == 'auto' - and _PoTokenContext(context) not in self._get_default_ytcfg(client)['PO_TOKEN_REQUIRED_CONTEXTS'] and not kwargs.get('required', False) ) ): @@ -3035,19 +3042,19 @@ def _is_agegated(player_response): def _is_unplayable(player_response): return traverse_obj(player_response, ('playabilityStatus', 'status')) == 'UNPLAYABLE' - def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr, visitor_data, data_sync_id, po_token): + def _extract_player_response(self, client, video_id, webpage_ytcfg, player_ytcfg, player_url, initial_pr, visitor_data, data_sync_id, po_token): headers = self.generate_api_headers( ytcfg=player_ytcfg, default_client=client, visitor_data=visitor_data, - session_index=self._extract_session_index(master_ytcfg, player_ytcfg), + session_index=self._extract_session_index(webpage_ytcfg, player_ytcfg), delegated_session_id=( self._parse_data_sync_id(data_sync_id)[0] - or self._extract_delegated_session_id(master_ytcfg, initial_pr, player_ytcfg) + or 
self._extract_delegated_session_id(webpage_ytcfg, initial_pr, player_ytcfg) ), user_session_id=( self._parse_data_sync_id(data_sync_id)[1] - or self._extract_user_session_id(master_ytcfg, initial_pr, player_ytcfg) + or self._extract_user_session_id(webpage_ytcfg, initial_pr, player_ytcfg) ), ) @@ -3063,7 +3070,7 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, if po_token: yt_query['serviceIntegrityDimensions'] = {'poToken': po_token} - sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False) if player_url else None + sts = self._extract_signature_timestamp(video_id, player_url, webpage_ytcfg, fatal=False) if player_url else None yt_query.update(self._generate_player_context(sts)) return self._extract_response( item_id=video_id, ep='player', query=yt_query, @@ -3072,10 +3079,14 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, note='Downloading {} player API JSON'.format(client.replace('_', ' ').strip()), ) or None - def _get_requested_clients(self, url, smuggled_data): + def _get_requested_clients(self, url, smuggled_data, is_premium_subscriber): requested_clients = [] excluded_clients = [] - default_clients = self._DEFAULT_AUTHED_CLIENTS if self.is_authenticated else self._DEFAULT_CLIENTS + default_clients = ( + self._DEFAULT_PREMIUM_CLIENTS if is_premium_subscriber + else self._DEFAULT_AUTHED_CLIENTS if self.is_authenticated + else self._DEFAULT_CLIENTS + ) allowed_clients = sorted( (client for client in INNERTUBE_CLIENTS if client[:1] != '_'), key=lambda client: INNERTUBE_CLIENTS[client]['priority'], reverse=True) @@ -3117,11 +3128,12 @@ def _invalid_player_response(self, pr, video_id): if (pr_id := traverse_obj(pr, ('videoDetails', 'videoId'))) != video_id: return pr_id - def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, smuggled_data): + def _extract_player_responses(self, clients, video_id, webpage, webpage_client, webpage_ytcfg, is_premium_subscriber): initial_pr = None if webpage: initial_pr = self._search_json( - self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, 'initial player response', video_id, fatal=False) + self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, + f'{webpage_client} client initial player response', video_id, fatal=False) prs = [] deprioritized_prs = [] @@ -3152,11 +3164,11 @@ def append_client(*client_names): while clients: deprioritize_pr = False client, base_client, variant = _split_innertube_client(clients.pop()) - player_ytcfg = master_ytcfg if client == 'web' else {} - if 'configs' not in self._configuration_arg('player_skip') and client != 'web': + player_ytcfg = webpage_ytcfg if client == webpage_client else {} + if 'configs' not in self._configuration_arg('player_skip') and client != webpage_client: player_ytcfg = self._download_ytcfg(client, video_id) or player_ytcfg - player_url = player_url or self._extract_player_url(master_ytcfg, player_ytcfg, webpage=webpage) + player_url = player_url or self._extract_player_url(webpage_ytcfg, player_ytcfg, webpage=webpage) require_js_player = self._get_default_ytcfg(client).get('REQUIRE_JS_PLAYER') if 'js' in self._configuration_arg('player_skip'): require_js_player = False @@ -3166,10 +3178,12 @@ def append_client(*client_names): player_url = self._download_player_url(video_id) tried_iframe_fallback = True - pr = initial_pr if client == 'web' else None + pr = None + if client == webpage_client and 'player_response' not in self._configuration_arg('webpage_skip'): + pr = initial_pr - visitor_data 
= visitor_data or self._extract_visitor_data(master_ytcfg, initial_pr, player_ytcfg) - data_sync_id = data_sync_id or self._extract_data_sync_id(master_ytcfg, initial_pr, player_ytcfg) + visitor_data = visitor_data or self._extract_visitor_data(webpage_ytcfg, initial_pr, player_ytcfg) + data_sync_id = data_sync_id or self._extract_data_sync_id(webpage_ytcfg, initial_pr, player_ytcfg) fetch_po_token_args = { 'client': client, @@ -3178,53 +3192,26 @@ def append_client(*client_names): 'data_sync_id': data_sync_id if self.is_authenticated else None, 'player_url': player_url if require_js_player else None, 'webpage': webpage, - 'session_index': self._extract_session_index(master_ytcfg, player_ytcfg), + 'session_index': self._extract_session_index(webpage_ytcfg, player_ytcfg), 'ytcfg': player_ytcfg or self._get_default_ytcfg(client), } # Don't need a player PO token for WEB if using player response from webpage + player_pot_policy: PlayerPoTokenPolicy = self._get_default_ytcfg(client)['PLAYER_PO_TOKEN_POLICY'] player_po_token = None if pr else self.fetch_po_token( - context=_PoTokenContext.PLAYER, **fetch_po_token_args) + context=_PoTokenContext.PLAYER, **fetch_po_token_args, + required=player_pot_policy.required or player_pot_policy.recommended) - gvs_po_token = self.fetch_po_token( - context=_PoTokenContext.GVS, **fetch_po_token_args) + fetch_gvs_po_token_func = functools.partial( + self.fetch_po_token, context=_PoTokenContext.GVS, **fetch_po_token_args) fetch_subs_po_token_func = functools.partial( - self.fetch_po_token, - context=_PoTokenContext.SUBS, - **fetch_po_token_args, - ) - - required_pot_contexts = self._get_default_ytcfg(client)['PO_TOKEN_REQUIRED_CONTEXTS'] - - if ( - not player_po_token - and _PoTokenContext.PLAYER in required_pot_contexts - ): - # TODO: may need to skip player response request. Unsure yet.. - self.report_warning( - f'No Player PO Token provided for {client} client, ' - f'which may be required for working {client} formats. This client will be deprioritized' - f'You can manually pass a Player PO Token for this client with --extractor-args "youtube:po_token={client}.player+XXX". ' - f'For more information, refer to {PO_TOKEN_GUIDE_URL} .', only_once=True) - deprioritize_pr = True - - if ( - not gvs_po_token - and _PoTokenContext.GVS in required_pot_contexts - and 'missing_pot' in self._configuration_arg('formats') - ): - # note: warning with help message is provided later during format processing - self.report_warning( - f'No GVS PO Token provided for {client} client, ' - f'which may be required for working {client} formats. 
This client will be deprioritized',
-                    only_once=True)
-                deprioritize_pr = True
+                self.fetch_po_token, context=_PoTokenContext.SUBS, **fetch_po_token_args)
 
             try:
                 pr = pr or self._extract_player_response(
                     client, video_id,
-                    master_ytcfg=player_ytcfg or master_ytcfg,
+                    webpage_ytcfg=player_ytcfg or webpage_ytcfg,
                     player_ytcfg=player_ytcfg,
                     player_url=player_url,
                     initial_pr=initial_pr,
@@ -3242,12 +3229,16 @@ def append_client(*client_names):
                 innertube_context = traverse_obj(player_ytcfg or self._get_default_ytcfg(client), 'INNERTUBE_CONTEXT')
                 sd = pr.setdefault('streamingData', {})
                 sd[STREAMING_DATA_CLIENT_NAME] = client
-                sd[STREAMING_DATA_INITIAL_PO_TOKEN] = gvs_po_token
+                sd[STREAMING_DATA_FETCH_GVS_PO_TOKEN] = fetch_gvs_po_token_func
+                sd[STREAMING_DATA_PLAYER_TOKEN_PROVIDED] = bool(player_po_token)
                 sd[STREAMING_DATA_INNERTUBE_CONTEXT] = innertube_context
                 sd[STREAMING_DATA_FETCH_SUBS_PO_TOKEN] = fetch_subs_po_token_func
+                sd[STREAMING_DATA_IS_PREMIUM_SUBSCRIBER] = is_premium_subscriber
                 for f in traverse_obj(sd, (('formats', 'adaptiveFormats'), ..., {dict})):
                     f[STREAMING_DATA_CLIENT_NAME] = client
-                    f[STREAMING_DATA_INITIAL_PO_TOKEN] = gvs_po_token
+                    f[STREAMING_DATA_FETCH_GVS_PO_TOKEN] = fetch_gvs_po_token_func
+                    f[STREAMING_DATA_IS_PREMIUM_SUBSCRIBER] = is_premium_subscriber
+                    f[STREAMING_DATA_PLAYER_TOKEN_PROVIDED] = bool(player_po_token)
                 if deprioritize_pr:
                     deprioritized_prs.append(pr)
                 else:
@@ -3357,6 +3348,15 @@ def build_fragments(f):
                     }),
                 } for range_start in range(0, f['filesize'], CHUNK_SIZE))
 
+        def gvs_pot_required(policy, is_premium_subscriber, has_player_token):
+            return (
+                policy.required
+                and not (policy.not_required_with_player_token and has_player_token)
+                and not (policy.not_required_for_premium and is_premium_subscriber))
+
+        # save pots per client to avoid fetching again
+        gvs_pots = {}
+
         for fmt in streaming_formats:
             client_name = fmt[STREAMING_DATA_CLIENT_NAME]
             if fmt.get('targetDurationSec'):
@@ -3416,7 +3416,7 @@ def build_fragments(f):
                 encrypted_sig = try_get(sc, lambda x: x['s'][0])
                 if not all((sc, fmt_url, player_url, encrypted_sig)):
                     msg = f'Some {client_name} client https formats have been skipped as they are missing a url. '
-                    if client_name == 'web':
+                    if client_name in ('web', 'web_safari'):
                         msg += 'YouTube is forcing SABR streaming for this client. '
                     else:
                         msg += (
@@ -3476,18 +3476,25 @@ def build_fragments(f):
                 self.report_warning(
                     'Some formats are possibly damaged. They will be deprioritized', video_id, only_once=True)
 
-            po_token = fmt.get(STREAMING_DATA_INITIAL_PO_TOKEN)
+            fetch_po_token_func = fmt[STREAMING_DATA_FETCH_GVS_PO_TOKEN]
+            pot_policy: GvsPoTokenPolicy = self._get_default_ytcfg(client_name)['GVS_PO_TOKEN_POLICY'][StreamingProtocol.HTTPS]
+
+            require_po_token = (
+                itag not in ['18']
+                and gvs_pot_required(
+                    pot_policy, fmt[STREAMING_DATA_IS_PREMIUM_SUBSCRIBER],
+                    fmt[STREAMING_DATA_PLAYER_TOKEN_PROVIDED]))
+
+            po_token = (
+                gvs_pots.get(client_name)
+                or fetch_po_token_func(required=require_po_token or pot_policy.recommended))
 
             if po_token:
                 fmt_url = update_url_query(fmt_url, {'pot': po_token})
+                if client_name not in gvs_pots:
+                    gvs_pots[client_name] = po_token
 
-            # Clients that require PO Token return videoplayback URLs that may return 403
-            require_po_token = (
-                not po_token
-                and _PoTokenContext.GVS in self._get_default_ytcfg(client_name)['PO_TOKEN_REQUIRED_CONTEXTS']
-                and itag not in ['18'])  # these formats do not require PO Token
-
-            if require_po_token and 'missing_pot' not in self._configuration_arg('formats'):
+            if not po_token and require_po_token and 'missing_pot' not in self._configuration_arg('formats'):
                 self._report_pot_format_skipped(video_id, client_name, 'https')
                 continue
 
@@ -3502,7 +3509,7 @@ def build_fragments(f):
                     name, fmt.get('isDrc') and 'DRC',
                     try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()),
                     try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()),
-                    is_damaged and 'DAMAGED', require_po_token and 'MISSING POT',
+                    is_damaged and 'DAMAGED', require_po_token and not po_token and 'MISSING POT',
                     (self.get_param('verbose') or all_formats) and short_client_name(client_name),
                     delim=', '),
                 # Format 22 is likely to be damaged. See https://github.com/yt-dlp/yt-dlp/issues/3372
@@ -3565,7 +3572,7 @@ def build_fragments(f):
         elif skip_bad_formats and live_status == 'is_live' and needs_live_processing != 'is_live':
             skip_manifests.add('dash')
 
-        def process_manifest_format(f, proto, client_name, itag, po_token):
+        def process_manifest_format(f, proto, client_name, itag, missing_pot):
             key = (proto, f.get('language'))
             if not all_formats and key in itags[itag]:
                 return False
@@ -3573,20 +3580,11 @@ def process_manifest_format(f, proto, client_name, itag, po_token):
             if f.get('source_preference') is None:
                 f['source_preference'] = -1
 
-            # Clients that require PO Token return videoplayback URLs that may return 403
-            # hls does not currently require PO Token
-            if (
-                not po_token
-                and _PoTokenContext.GVS in self._get_default_ytcfg(client_name)['PO_TOKEN_REQUIRED_CONTEXTS']
-                and proto != 'hls'
-            ):
-                if 'missing_pot' not in self._configuration_arg('formats'):
-                    self._report_pot_format_skipped(video_id, client_name, proto)
-                    return False
+            if missing_pot:
                 f['format_note'] = join_nonempty(f.get('format_note'), 'MISSING POT', delim=' ')
                 f['source_preference'] -= 20
 
-            # XXX: Check if IOS HLS formats are affected by player PO token enforcement; temporary
+            # XXX: Check if IOS HLS formats are affected by PO token enforcement; temporary
             # See https://github.com/yt-dlp/yt-dlp/issues/13511
             if proto == 'hls' and client_name == 'ios':
                 f['__needs_testing'] = True
@@ -3625,39 +3623,62 @@ def process_manifest_format(f, proto, client_name, itag, po_token):
         subtitles = {}
         for sd in streaming_data:
             client_name = sd[STREAMING_DATA_CLIENT_NAME]
-            po_token = sd.get(STREAMING_DATA_INITIAL_PO_TOKEN)
+            fetch_pot_func = sd[STREAMING_DATA_FETCH_GVS_PO_TOKEN]
+            is_premium_subscriber = sd[STREAMING_DATA_IS_PREMIUM_SUBSCRIBER]
+            has_player_token = sd[STREAMING_DATA_PLAYER_TOKEN_PROVIDED]
+
             hls_manifest_url = 'hls' not in skip_manifests and sd.get('hlsManifestUrl')
             if hls_manifest_url:
+                pot_policy: GvsPoTokenPolicy = self._get_default_ytcfg(
+                    client_name)['GVS_PO_TOKEN_POLICY'][StreamingProtocol.HLS]
+                require_po_token = gvs_pot_required(pot_policy, is_premium_subscriber, has_player_token)
+                po_token = gvs_pots.get(client_name, fetch_pot_func(required=require_po_token or pot_policy.recommended))
                 if po_token:
                     hls_manifest_url = hls_manifest_url.rstrip('/') + f'/pot/{po_token}'
-                fmts, subs = self._extract_m3u8_formats_and_subtitles(
-                    hls_manifest_url, video_id, 'mp4', fatal=False, live=live_status == 'is_live')
-                for sub in traverse_obj(subs, (..., ..., {dict})):
-                    # HLS subs (m3u8) do not need a PO token; save client name for debugging
-                    sub[STREAMING_DATA_CLIENT_NAME] = client_name
-                subtitles = self._merge_subtitles(subs, subtitles)
-                for f in fmts:
-                    if process_manifest_format(f, 'hls', client_name, self._search_regex(
-                            r'/itag/(\d+)', f['url'], 'itag', default=None), po_token):
-                        yield f
+                    if client_name not in gvs_pots:
+                        gvs_pots[client_name] = po_token
+                if require_po_token and not po_token and 'missing_pot' not in self._configuration_arg('formats'):
+                    self._report_pot_format_skipped(video_id, client_name, 'hls')
+                else:
+                    fmts, subs = self._extract_m3u8_formats_and_subtitles(
+                        hls_manifest_url, video_id, 'mp4', fatal=False, live=live_status == 'is_live')
+                    for sub in traverse_obj(subs, (..., ..., {dict})):
+                        # TODO: If HLS video requires a PO Token, do the subs also require pot?
+                        # Save client name for debugging
+                        sub[STREAMING_DATA_CLIENT_NAME] = client_name
+                    subtitles = self._merge_subtitles(subs, subtitles)
+                    for f in fmts:
+                        if process_manifest_format(f, 'hls', client_name, self._search_regex(
+                                r'/itag/(\d+)', f['url'], 'itag', default=None), require_po_token and not po_token):
+                            yield f
 
             dash_manifest_url = 'dash' not in skip_manifests and sd.get('dashManifestUrl')
             if dash_manifest_url:
+                pot_policy: GvsPoTokenPolicy = self._get_default_ytcfg(
+                    client_name)['GVS_PO_TOKEN_POLICY'][StreamingProtocol.DASH]
+                require_po_token = gvs_pot_required(pot_policy, is_premium_subscriber, has_player_token)
+                po_token = gvs_pots.get(client_name, fetch_pot_func(required=require_po_token or pot_policy.recommended))
                 if po_token:
                     dash_manifest_url = dash_manifest_url.rstrip('/') + f'/pot/{po_token}'
-                formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False)
-                for sub in traverse_obj(subs, (..., ..., {dict})):
-                    # TODO: Investigate if DASH subs ever need a PO token; save client name for debugging
-                    sub[STREAMING_DATA_CLIENT_NAME] = client_name
-                subtitles = self._merge_subtitles(subs, subtitles)  # Prioritize HLS subs over DASH
-                for f in formats:
-                    if process_manifest_format(f, 'dash', client_name, f['format_id'], po_token):
-                        f['filesize'] = int_or_none(self._search_regex(
-                            r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None))
-                        if needs_live_processing:
-                            f['is_from_start'] = True
+                    if client_name not in gvs_pots:
+                        gvs_pots[client_name] = po_token
+                if require_po_token and not po_token and 'missing_pot' not in self._configuration_arg('formats'):
+                    self._report_pot_format_skipped(video_id, client_name, 'dash')
+                else:
+                    formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False)
+                    for sub in traverse_obj(subs, (..., ..., {dict})):
+                        # TODO: If DASH video requires a PO Token, do the subs also require pot?
+                        # Save client name for debugging
+                        sub[STREAMING_DATA_CLIENT_NAME] = client_name
+                    subtitles = self._merge_subtitles(subs, subtitles)  # Prioritize HLS subs over DASH
+                    for f in formats:
+                        if process_manifest_format(f, 'dash', client_name, f['format_id'], require_po_token and not po_token):
+                            f['filesize'] = int_or_none(self._search_regex(
+                                r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None))
+                            if needs_live_processing:
+                                f['is_from_start'] = True
 
-                        yield f
+                            yield f
 
         yield subtitles
 
     def _extract_storyboard(self, player_responses, duration):
@@ -3698,22 +3719,22 @@ def _extract_storyboard(self, player_responses, duration):
                 } for j in range(math.ceil(fragment_count))],
             }
 
-    def _download_player_responses(self, url, smuggled_data, video_id, webpage_url):
+    def _download_initial_webpage(self, webpage_url, webpage_client, video_id):
         webpage = None
-        if 'webpage' not in self._configuration_arg('player_skip'):
+        if webpage_url and 'webpage' not in self._configuration_arg('player_skip'):
             query = {'bpctr': '9999999999', 'has_verified': '1'}
-            pp = self._configuration_arg('player_params', [None], casesense=True)[0]
+            pp = (
+                self._configuration_arg('player_params', [None], casesense=True)[0]
+                or traverse_obj(INNERTUBE_CLIENTS, (webpage_client, 'PLAYER_PARAMS', {str}))
+            )
             if pp:
                 query['pp'] = pp
-            webpage = self._download_webpage_with_retries(webpage_url, video_id, query=query)
-
-        master_ytcfg = self.extract_ytcfg(video_id, webpage) or self._get_default_ytcfg()
-
-        player_responses, player_url = self._extract_player_responses(
-            self._get_requested_clients(url, smuggled_data),
-            video_id, webpage, master_ytcfg, smuggled_data)
-
-        return webpage, master_ytcfg, player_responses, player_url
+            webpage = self._download_webpage_with_retries(
+                webpage_url, video_id, query=query,
+                headers=traverse_obj(self._get_default_ytcfg(webpage_client), {
+                    'User-Agent': ('INNERTUBE_CONTEXT', 'client', 'userAgent', {str}),
+                }))
+        return webpage
 
     def _list_formats(self, video_id, microformats, video_details, player_responses, player_url, duration=None):
         live_broadcast_details = traverse_obj(microformats, (..., 'liveBroadcastDetails'))
@@ -3738,14 +3759,60 @@ def _list_formats(self, video_id, microformats, video_details, player_responses,
 
         return live_broadcast_details, live_status, streaming_data, formats, subtitles
 
+    def _download_initial_data(self, video_id, webpage, webpage_client, webpage_ytcfg):
+        initial_data = None
+        if webpage and 'initial_data' not in self._configuration_arg('webpage_skip'):
+            initial_data = self.extract_yt_initial_data(video_id, webpage, fatal=False)
+            if not traverse_obj(initial_data, 'contents'):
+                self.report_warning('Incomplete data received in embedded initial data; re-fetching using API.')
+                initial_data = None
+        if not initial_data and 'initial_data' not in self._configuration_arg('player_skip'):
+            query = {'videoId': video_id}
+            query.update(self._get_checkok_params())
+            initial_data = self._extract_response(
+                item_id=video_id, ep='next', fatal=False,
+                ytcfg=webpage_ytcfg, query=query, check_get_keys='contents',
+                note='Downloading initial data API JSON', default_client=webpage_client)
+        return initial_data
+
+    def _is_premium_subscriber(self, initial_data):
+        if not self.is_authenticated or not initial_data:
+            return False
+
+        tlr = traverse_obj(
+            initial_data, ('topbar', 'desktopTopbarRenderer', 'logo', 'topbarLogoRenderer'))
+        return (
+            traverse_obj(tlr, ('iconImage', 'iconType')) == 'YOUTUBE_PREMIUM_LOGO'
+            or 'premium' in (self._get_text(tlr, 'tooltipText') or '').lower()
+        )
+
+    def _initial_extract(self, url, smuggled_data, webpage_url, webpage_client, video_id):
+        # This function is also used by live-from-start refresh
+        webpage = self._download_initial_webpage(webpage_url, webpage_client, video_id)
+        webpage_ytcfg = self.extract_ytcfg(video_id, webpage) or self._get_default_ytcfg(webpage_client)
+
+        initial_data = self._download_initial_data(video_id, webpage, webpage_client, webpage_ytcfg)
+
+        is_premium_subscriber = self._is_premium_subscriber(initial_data)
+        if is_premium_subscriber:
+            self.write_debug('Detected YouTube Premium subscription')
+
+        player_responses, player_url = self._extract_player_responses(
+            self._get_requested_clients(url, smuggled_data, is_premium_subscriber),
+            video_id, webpage, webpage_client, webpage_ytcfg, is_premium_subscriber)
+
+        return webpage, webpage_ytcfg, initial_data, is_premium_subscriber, player_responses, player_url
+
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
         video_id = self._match_id(url)
 
         base_url = self.http_scheme() + '//www.youtube.com/'
         webpage_url = base_url + 'watch?v=' + video_id
+        webpage_client = 'web'
 
-        webpage, master_ytcfg, player_responses, player_url = self._download_player_responses(url, smuggled_data, video_id, webpage_url)
+        webpage, webpage_ytcfg, initial_data, is_premium_subscriber, player_responses, player_url = self._initial_extract(
+            url, smuggled_data, webpage_url, webpage_client, video_id)
 
         playability_statuses = traverse_obj(
             player_responses, (..., 'playabilityStatus'), expected_type=dict)
@@ -4020,7 +4087,7 @@ def process_language(container, base_url, lang_code, sub_name, client_name, quer
             pctr = pr['captions']['playerCaptionsTracklistRenderer']
             client_name = pr['streamingData'][STREAMING_DATA_CLIENT_NAME]
             innertube_client_name = pr['streamingData'][STREAMING_DATA_INNERTUBE_CONTEXT]['client']['clientName']
-            required_contexts = self._get_default_ytcfg(client_name)['PO_TOKEN_REQUIRED_CONTEXTS']
+            pot_policy: GvsPoTokenPolicy = self._get_default_ytcfg(client_name)['SUBS_PO_TOKEN_POLICY']
             fetch_subs_po_token_func = pr['streamingData'][STREAMING_DATA_FETCH_SUBS_PO_TOKEN]
 
             pot_params = {}
@@ -4033,11 +4100,11 @@ def process_language(container, base_url, lang_code, sub_name, client_name, quer
             requires_pot = (
                 # We can detect the experiment for now
                 any(e in traverse_obj(qs, ('exp', ...)) for e in ('xpe', 'xpv'))
-                or _PoTokenContext.SUBS in required_contexts)
+                or (pot_policy.required and not (pot_policy.not_required_for_premium and is_premium_subscriber)))
 
             if not already_fetched_pot:
                 already_fetched_pot = True
-                if subs_po_token := fetch_subs_po_token_func(required=requires_pot):
+                if subs_po_token := fetch_subs_po_token_func(required=requires_pot or pot_policy.recommended):
                     pot_params.update({
                         'pot': subs_po_token,
                         'potc': '1',
@@ -4140,21 +4207,6 @@ def process_language(container, base_url, lang_code, sub_name, client_name, quer
                 'release_year': int_or_none(release_year),
             })
 
-        initial_data = None
-        if webpage:
-            initial_data = self.extract_yt_initial_data(video_id, webpage, fatal=False)
-            if not traverse_obj(initial_data, 'contents'):
-                self.report_warning('Incomplete data received in embedded initial data; re-fetching using API.')
-                initial_data = None
-        if not initial_data and 'initial_data' not in self._configuration_arg('player_skip'):
-            query = {'videoId': video_id}
-            query.update(self._get_checkok_params())
-            initial_data = self._extract_response(
-                item_id=video_id, ep='next', fatal=False,
-                ytcfg=master_ytcfg, query=query, check_get_keys='contents',
-                headers=self.generate_api_headers(ytcfg=master_ytcfg),
-                note='Downloading initial data API JSON')
-
         COMMENTS_SECTION_IDS = ('comment-item-section', 'engagement-panel-comments-section')
         info['comment_count'] = traverse_obj(initial_data, (
             'contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', ..., 'itemSectionRenderer',
@@ -4353,7 +4405,7 @@ def process_language(container, base_url, lang_code, sub_name, client_name, quer
                     self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED)
                     or get_first(microformats, 'isUnlisted', expected_type=bool))))
 
-        info['__post_extractor'] = self.extract_comments(webpage_ytcfg, video_id, contents, webpage)
+        info['__post_extractor'] = self.extract_comments(webpage_ytcfg, video_id, contents, webpage)
 
         self.mark_watched(video_id, player_responses)
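A note for readers following the PO Token changes in this patch: the old per-client set PO_TOKEN_REQUIRED_CONTEXTS is replaced by per-protocol policy objects, and the new gvs_pot_required() helper decides whether a GVS PO Token is mandatory by checking the policy's exemptions (a player-provided token, or an authenticated Premium subscription). Below is a minimal, self-contained sketch of that decision logic; the GvsPoTokenPolicy stand-in is hypothetical and only mirrors the attribute names the diff reads, not yt-dlp's real class.

    from dataclasses import dataclass

    @dataclass
    class GvsPoTokenPolicy:
        # Illustrative stand-in, not yt-dlp's actual class definition
        required: bool = False
        recommended: bool = False
        not_required_with_player_token: bool = False
        not_required_for_premium: bool = False

    def gvs_pot_required(policy, is_premium_subscriber, has_player_token):
        # Same boolean logic as the helper added in the diff: the token is
        # required only if the policy says so and no exemption applies
        return (
            policy.required
            and not (policy.not_required_with_player_token and has_player_token)
            and not (policy.not_required_for_premium and is_premium_subscriber))

    policy = GvsPoTokenPolicy(required=True, not_required_with_player_token=True)
    assert gvs_pot_required(policy, is_premium_subscriber=False, has_player_token=False)
    assert not gvs_pot_required(policy, is_premium_subscriber=False, has_player_token=True)

Note that the fetchers are still invoked with required=require_po_token or pot_policy.recommended, so a merely recommended token is fetched opportunistically, while only a required-and-missing token leads to formats being skipped or tagged MISSING POT.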
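The https, HLS and DASH branches all consult gvs_pots, a per-client cache filled after the first successful fetch, so a client's GVS PO Token is obtained once and reused for every protocol. A reduced sketch of that reuse pattern, assuming a client-bound fetcher fetch_pot (hypothetical name):

    gvs_pots = {}  # client name -> previously fetched GVS PO Token

    def get_gvs_pot(client_name, fetch_pot, required):
        # Return the cached token for this client if available; otherwise
        # fetch one (the fetcher may return None) and cache only successes
        if client_name in gvs_pots:
            return gvs_pots[client_name]
        po_token = fetch_pot(required=required)
        if po_token:
            gvs_pots[client_name] = po_token
        return po_token

    calls = 0

    def fetch_pot(required):
        global calls
        calls += 1
        return 'example-token'

    assert get_gvs_pot('web', fetch_pot, required=True) == 'example-token'
    assert get_gvs_pot('web', fetch_pot, required=True) == 'example-token'
    assert calls == 1  # the second lookup is served from the cache

The sketch uses an explicit membership test rather than dict.get(key, default) because Python evaluates the default argument eagerly; this keeps the cache hit visibly short-circuiting the fetch.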
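Likewise, _is_premium_subscriber() infers Premium status from the topbar logo in the initial data rather than from a dedicated API field. The sketch below reproduces the check against a hand-built initial_data fragment; the nested shape follows the traverse_obj path in the diff, get_path is a tiny illustrative stand-in for traverse_obj, and the plain tooltipText/simpleText lookup is an assumption (the real self._get_text handles several text-renderer shapes):

    def get_path(obj, *path):
        # Minimal stand-in for yt-dlp's traverse_obj, enough for this example
        for key in path:
            if not isinstance(obj, dict):
                return None
            obj = obj.get(key)
        return obj

    def is_premium_subscriber(initial_data, is_authenticated=True):
        if not is_authenticated or not initial_data:
            return False
        tlr = get_path(initial_data, 'topbar', 'desktopTopbarRenderer', 'logo', 'topbarLogoRenderer')
        tooltip = get_path(tlr, 'tooltipText', 'simpleText') or ''  # assumed shape
        return (
            get_path(tlr, 'iconImage', 'iconType') == 'YOUTUBE_PREMIUM_LOGO'
            or 'premium' in tooltip.lower())

    sample = {'topbar': {'desktopTopbarRenderer': {'logo': {'topbarLogoRenderer': {
        'iconImage': {'iconType': 'YOUTUBE_PREMIUM_LOGO'}}}}}}
    assert is_premium_subscriber(sample) is True
    assert is_premium_subscriber({}) is False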