diff --git a/README.md b/README.md index c0329f539..0cc2cd7b2 100644 --- a/README.md +++ b/README.md @@ -1770,7 +1770,7 @@ #### youtube * `lang`: Prefer translated metadata (`title`, `description` etc) of this language code (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively * `player_client`: Clients to extract video data from. The currently available clients are `web`, `web_safari`, `web_embedded`, `web_music`, `web_creator`, `mweb`, `ios`, `android`, `android_vr`, `tv` and `tv_embedded`. By default, `tv,ios,web` is used, or `tv,web` is used when authenticating with cookies. The `web_music` client is added for `music.youtube.com` URLs when logged-in cookies are used. The `tv_embedded` and `web_creator` clients are added for age-restricted videos if account age-verification is required. Some clients, such as `web` and `web_music`, require a `po_token` for their formats to be downloadable. Some clients, such as `web_creator`, will only work with authentication. Not all clients support authentication via cookies. You can use `default` for the default clients, or you can use `all` for all clients (not recommended). You can prefix a client with `-` to exclude it, e.g. `youtube:player_client=default,-ios` -* `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details +* `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player), `initial_data` (skip initial data/next ep request). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause issues such as missing formats or metadata. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) and [#12826](https://github.com/yt-dlp/yt-dlp/issues/12826) for more details * `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp. * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) * `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all` diff --git a/test/helper.py b/test/helper.py index 4169af799..e4cb478e2 100644 --- a/test/helper.py +++ b/test/helper.py @@ -136,7 +136,7 @@ def _iter_differences(got, expected, field): return if op == 'startswith': - if not val.startswith(got): + if not got.startswith(val): yield field, f'should start with {val!r}, got {got!r}' return diff --git a/test/test_networking.py b/test/test_networking.py index 3ab60fe83..2f441fced 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -39,6 +39,7 @@ from yt_dlp.dependencies import brotli, curl_cffi, requests, urllib3 from yt_dlp.networking import ( HEADRequest, + PATCHRequest, PUTRequest, Request, RequestDirector, @@ -1856,6 +1857,7 @@ def test_method(self): def test_request_helpers(self): assert HEADRequest('http://example.com').method == 'HEAD' + assert PATCHRequest('http://example.com').method == 'PATCH' assert PUTRequest('http://example.com').method == 'PUT' def test_headers(self): diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index e8e0fdcdd..618e300f1 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1783,7 +1783,6 @@ from .rtve import ( RTVEALaCartaIE, RTVEAudioIE, - RTVEInfantilIE, RTVELiveIE, RTVETelevisionIE, ) @@ -2237,7 +2236,11 @@ TVPlayIE, ) from .tvplayer import TVPlayerIE -from .tvw import TvwIE, TvwNewsIE +from .tvw import ( + TvwIE, + TvwNewsIE, + TvwTvChannelsIE, +) from .tweakers import TweakersIE from .twentymin import TwentyMinutenIE from .twentythreevideo import TwentyThreeVideoIE diff --git a/yt_dlp/extractor/abematv.py b/yt_dlp/extractor/abematv.py index 8c7131b10..8f2fc4c80 100644 --- a/yt_dlp/extractor/abematv.py +++ b/yt_dlp/extractor/abematv.py @@ -21,6 +21,7 @@ int_or_none, time_seconds, traverse_obj, + update_url, update_url_query, ) @@ -417,6 +418,10 @@ def _real_extract(self, url): 'is_live': is_live, 'availability': availability, }) + + if thumbnail := update_url(self._og_search_thumbnail(webpage, default=''), query=None): + info['thumbnails'] = [{'url': thumbnail}] + return info diff --git a/yt_dlp/extractor/atresplayer.py b/yt_dlp/extractor/atresplayer.py index 0fe95bec5..1258a5704 100644 --- a/yt_dlp/extractor/atresplayer.py +++ b/yt_dlp/extractor/atresplayer.py @@ -1,64 +1,105 @@ +import urllib.parse + from .common import InfoExtractor from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, + parse_age_limit, + url_or_none, urlencode_postdata, ) +from ..utils.traversal import traverse_obj class AtresPlayerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/[^/]+/[^/]+/[^/]+/[^/]+/(?P.+?)_(?P[0-9a-f]{24})' + _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/(?:[^/?#]+/){4}(?P.+?)_(?P[0-9a-f]{24})' _NETRC_MACHINE = 'atresplayer' - _TESTS = [ - { - 'url': 'https://www.atresplayer.com/antena3/series/pequenas-coincidencias/temporada-1/capitulo-7-asuntos-pendientes_5d4aa2c57ed1a88fc715a615/', - 'info_dict': { - 'id': '5d4aa2c57ed1a88fc715a615', - 'ext': 'mp4', - 'title': 'Capítulo 7: Asuntos pendientes', - 'description': 'md5:7634cdcb4d50d5381bedf93efb537fbc', - 'duration': 3413, - }, - 'skip': 'This video is only available for registered users', + _TESTS = [{ + 'url': 'https://www.atresplayer.com/lasexta/programas/el-objetivo/clips/mbappe-describe-como-entrenador-a-carlo-ancelotti-sabe-cuando-tiene-que-ser-padre-jefe-amigo-entrenador_67f2dfb2fb6ab0e4c7203849/', + 'info_dict': { + 'ext': 'mp4', + 'id': '67f2dfb2fb6ab0e4c7203849', + 'display_id': 'md5:c203f8d4e425ed115ba56a1c6e4b3e6c', + 'title': 'Mbappé describe como entrenador a Carlo Ancelotti: "Sabe cuándo tiene que ser padre, jefe, amigo, entrenador..."', + 'channel': 'laSexta', + 'duration': 31, + 'thumbnail': 'https://imagenes.atresplayer.com/atp/clipping/cmsimages02/2025/04/06/B02DBE1E-D59B-4683-8404-1A9595D15269/1920x1080.jpg', + 'tags': ['Entrevista informativa', 'Actualidad', 'Debate informativo', 'Política', 'Economía', 'Sociedad', 'Cara a cara', 'Análisis', 'Más periodismo'], + 'series': 'El Objetivo', + 'season': 'Temporada 12', + 'timestamp': 1743970079, + 'upload_date': '20250406', }, - { - 'url': 'https://www.atresplayer.com/lasexta/programas/el-club-de-la-comedia/temporada-4/capitulo-10-especial-solidario-nochebuena_5ad08edf986b2855ed47adc4/', - 'only_matching': True, + }, { + 'url': 'https://www.atresplayer.com/antena3/programas/el-hormiguero/clips/revive-la-entrevista-completa-a-miguel-bose-en-el-hormiguero_67f836baa4a5b0e4147ca59a/', + 'info_dict': { + 'ext': 'mp4', + 'id': '67f836baa4a5b0e4147ca59a', + 'display_id': 'revive-la-entrevista-completa-a-miguel-bose-en-el-hormiguero', + 'title': 'Revive la entrevista completa a Miguel Bosé en El Hormiguero', + 'description': 'md5:c6d2b591408d45a7bc2986dfb938eb72', + 'channel': 'Antena 3', + 'duration': 2556, + 'thumbnail': 'https://imagenes.atresplayer.com/atp/clipping/cmsimages02/2025/04/10/9076395F-F1FD-48BE-9F18-540DBA10EBAD/1920x1080.jpg', + 'tags': ['Entrevista', 'Variedades', 'Humor', 'Entretenimiento', 'Te sigo', 'Buen rollo', 'Cara a cara'], + 'series': 'El Hormiguero ', + 'season': 'Temporada 14', + 'timestamp': 1744320111, + 'upload_date': '20250410', }, - { - 'url': 'https://www.atresplayer.com/antena3/series/el-secreto-de-puente-viejo/el-chico-de-los-tres-lunares/capitulo-977-29-12-14_5ad51046986b2886722ccdea/', - 'only_matching': True, + }, { + 'url': 'https://www.atresplayer.com/flooxer/series/biara-proyecto-lazarus/temporada-1/capitulo-3-supervivientes_67a6038b64ceca00070f4f69/', + 'info_dict': { + 'ext': 'mp4', + 'id': '67a6038b64ceca00070f4f69', + 'display_id': 'capitulo-3-supervivientes', + 'title': 'Capítulo 3: Supervivientes', + 'description': 'md5:65b231f20302f776c2b0dd24594599a1', + 'channel': 'Flooxer', + 'duration': 1196, + 'thumbnail': 'https://imagenes.atresplayer.com/atp/clipping/cmsimages01/2025/02/14/17CF90D3-FE67-40C5-A941-7825B3E13992/1920x1080.jpg', + 'tags': ['Juvenil', 'Terror', 'Piel de gallina', 'Te sigo', 'Un break', 'Del tirón'], + 'series': 'BIARA: Proyecto Lázarus', + 'season': 'Temporada 1', + 'season_number': 1, + 'episode': 'Episode 3', + 'episode_number': 3, + 'timestamp': 1743095191, + 'upload_date': '20250327', }, - ] + }, { + 'url': 'https://www.atresplayer.com/lasexta/programas/el-club-de-la-comedia/temporada-4/capitulo-10-especial-solidario-nochebuena_5ad08edf986b2855ed47adc4/', + 'only_matching': True, + }, { + 'url': 'https://www.atresplayer.com/antena3/series/el-secreto-de-puente-viejo/el-chico-de-los-tres-lunares/capitulo-977-29-12-14_5ad51046986b2886722ccdea/', + 'only_matching': True, + }] _API_BASE = 'https://api.atresplayer.com/' def _perform_login(self, username, password): - self._request_webpage( - self._API_BASE + 'login', None, 'Downloading login page') - try: - target_url = self._download_json( - 'https://account.atresmedia.com/api/login', None, - 'Logging in', headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - }, data=urlencode_postdata({ + self._download_webpage( + 'https://account.atresplayer.com/auth/v1/login', None, + 'Logging in', 'Failed to log in', data=urlencode_postdata({ 'username': username, 'password': password, - }))['targetUrl'] + })) except ExtractorError as e: if isinstance(e.cause, HTTPError) and e.cause.status == 400: raise ExtractorError('Invalid username and/or password', expected=True) raise - self._request_webpage(target_url, None, 'Following Target URL') - def _real_extract(self, url): display_id, video_id = self._match_valid_url(url).groups() + metadata_url = self._download_json( + self._API_BASE + 'client/v1/url', video_id, 'Downloading API endpoint data', + query={'href': urllib.parse.urlparse(url).path})['href'] + metadata = self._download_json(metadata_url, video_id) + try: - episode = self._download_json( - self._API_BASE + 'client/v1/player/episode/' + video_id, video_id) + video_data = self._download_json(metadata['urlVideo'], video_id, 'Downloading video data') except ExtractorError as e: if isinstance(e.cause, HTTPError) and e.cause.status == 403: error = self._parse_json(e.cause.response.read(), None) @@ -67,37 +108,45 @@ def _real_extract(self, url): raise ExtractorError(error['error_description'], expected=True) raise - title = episode['titulo'] - formats = [] subtitles = {} - for source in episode.get('sources', []): - src = source.get('src') - if not src: - continue + for source in traverse_obj(video_data, ('sources', lambda _, v: url_or_none(v['src']))): + src_url = source['src'] src_type = source.get('type') - if src_type == 'application/vnd.apple.mpegurl': - formats, subtitles = self._extract_m3u8_formats( - src, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False) - elif src_type == 'application/dash+xml': - formats, subtitles = self._extract_mpd_formats( - src, video_id, mpd_id='dash', fatal=False) - - heartbeat = episode.get('heartbeat') or {} - omniture = episode.get('omniture') or {} - get_meta = lambda x: heartbeat.get(x) or omniture.get(x) + if src_type in ('application/vnd.apple.mpegurl', 'application/hls+legacy', 'application/hls+hevc'): + fmts, subs = self._extract_m3u8_formats_and_subtitles( + src_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + elif src_type in ('application/dash+xml', 'application/dash+hevc'): + fmts, subs = self._extract_mpd_formats_and_subtitles( + src_url, video_id, mpd_id='dash', fatal=False) + else: + continue + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) return { 'display_id': display_id, 'id': video_id, - 'title': title, - 'description': episode.get('descripcion'), - 'thumbnail': episode.get('imgPoster'), - 'duration': int_or_none(episode.get('duration')), 'formats': formats, - 'channel': get_meta('channel'), - 'season': get_meta('season'), - 'episode_number': int_or_none(get_meta('episodeNumber')), 'subtitles': subtitles, + **traverse_obj(video_data, { + 'title': ('titulo', {str}), + 'description': ('descripcion', {str}), + 'duration': ('duration', {int_or_none}), + 'thumbnail': ('imgPoster', {url_or_none}, {lambda v: f'{v}1920x1080.jpg'}), + 'age_limit': ('ageRating', {parse_age_limit}), + }), + **traverse_obj(metadata, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'duration': ('duration', {int_or_none}), + 'tags': ('tags', ..., 'title', {str}), + 'age_limit': ('ageRating', {parse_age_limit}), + 'series': ('format', 'title', {str}), + 'season': ('currentSeason', 'title', {str}), + 'season_number': ('currentSeason', 'seasonNumber', {int_or_none}), + 'episode_number': ('numberOfEpisode', {int_or_none}), + 'timestamp': ('publicationDate', {int_or_none(scale=1000)}), + 'channel': ('channel', 'title', {str}), + }), } diff --git a/yt_dlp/extractor/cda.py b/yt_dlp/extractor/cda.py index 96f25c22a..aa39bf382 100644 --- a/yt_dlp/extractor/cda.py +++ b/yt_dlp/extractor/cda.py @@ -353,7 +353,7 @@ def extract_format(page, version): class CDAFolderIE(InfoExtractor): _MAX_PAGE_SIZE = 36 - _VALID_URL = r'https?://(?:www\.)?cda\.pl/(?P\w+)/folder/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?cda\.pl/(?P[\w-]+)/folder/(?P\d+)' _TESTS = [ { 'url': 'https://www.cda.pl/domino264/folder/31188385', @@ -378,6 +378,9 @@ class CDAFolderIE(InfoExtractor): 'title': 'TESTY KOSMETYKÓW', }, 'playlist_mincount': 139, + }, { + 'url': 'https://www.cda.pl/FILMY-SERIALE-ANIME-KRESKOWKI-BAJKI/folder/18493422', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/linkedin.py b/yt_dlp/extractor/linkedin.py index c8c8ae52a..d25f0fe6a 100644 --- a/yt_dlp/extractor/linkedin.py +++ b/yt_dlp/extractor/linkedin.py @@ -82,7 +82,10 @@ def _get_video_id(self, video_data, course_slug, video_slug): class LinkedInIE(LinkedInBaseIE): - _VALID_URL = r'https?://(?:www\.)?linkedin\.com/posts/[^/?#]+-(?P\d+)-\w{4}/?(?:[?#]|$)' + _VALID_URL = [ + r'https?://(?:www\.)?linkedin\.com/posts/[^/?#]+-(?P\d+)-\w{4}/?(?:[?#]|$)', + r'https?://(?:www\.)?linkedin\.com/feed/update/urn:li:activity:(?P\d+)', + ] _TESTS = [{ 'url': 'https://www.linkedin.com/posts/mishalkhawaja_sendinblueviews-toronto-digitalmarketing-ugcPost-6850898786781339649-mM20', 'info_dict': { @@ -106,6 +109,9 @@ class LinkedInIE(LinkedInBaseIE): 'like_count': int, 'subtitles': 'mincount:1', }, + }, { + 'url': 'https://www.linkedin.com/feed/update/urn:li:activity:7016901149999955968/?utm_source=share&utm_medium=member_desktop', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/loco.py b/yt_dlp/extractor/loco.py index a648f7e13..6c9a25567 100644 --- a/yt_dlp/extractor/loco.py +++ b/yt_dlp/extractor/loco.py @@ -1,5 +1,9 @@ +import json +import random +import time + from .common import InfoExtractor -from ..utils import int_or_none, url_or_none +from ..utils import int_or_none, jwt_decode_hs256, try_call, url_or_none from ..utils.traversal import require, traverse_obj @@ -55,13 +59,81 @@ class LocoIE(InfoExtractor): 'upload_date': '20250226', 'modified_date': '20250226', }, + }, { + # Requires video authorization + 'url': 'https://loco.com/stream/ac854641-ae0f-497c-a8ea-4195f6d8cc53', + 'md5': '0513edf85c1e65c9521f555f665387d5', + 'info_dict': { + 'id': 'ac854641-ae0f-497c-a8ea-4195f6d8cc53', + 'ext': 'mp4', + 'title': 'DUAS CONTAS DESAFIANTE, RUSH TOP 1 NO BRASIL!', + 'description': 'md5:aa77818edd6fe00dd4b6be75cba5f826', + 'uploader_id': '7Y9JNAZC3Q', + 'channel': 'ayellol', + 'channel_follower_count': int, + 'comment_count': int, + 'view_count': int, + 'concurrent_view_count': int, + 'like_count': int, + 'duration': 1229, + 'thumbnail': 'https://static.ivory.getloconow.com/default_thumb/f5aa678b-6d04-45d9-a89a-859af0a8028f.jpg', + 'tags': ['Gameplay', 'Carry'], + 'series': 'League of Legends', + 'timestamp': 1741182253, + 'upload_date': '20250305', + 'modified_timestamp': 1741182419, + 'modified_date': '20250305', + }, }] + # From _app.js + _CLIENT_ID = 'TlwKp1zmF6eKFpcisn3FyR18WkhcPkZtzwPVEEC3' + _CLIENT_SECRET = 'Kp7tYlUN7LXvtcSpwYvIitgYcLparbtsQSe5AdyyCdiEJBP53Vt9J8eB4AsLdChIpcO2BM19RA3HsGtqDJFjWmwoonvMSG3ZQmnS8x1YIM8yl82xMXZGbE3NKiqmgBVU' + + def _is_jwt_expired(self, token): + return jwt_decode_hs256(token)['exp'] - time.time() < 300 + + def _get_access_token(self, video_id): + access_token = try_call(lambda: self._get_cookies('https://loco.com')['access_token'].value) + if access_token and not self._is_jwt_expired(access_token): + return access_token + access_token = traverse_obj(self._download_json( + 'https://api.getloconow.com/v3/user/device_profile/', video_id, + 'Downloading access token', fatal=False, data=json.dumps({ + 'platform': 7, + 'client_id': self._CLIENT_ID, + 'client_secret': self._CLIENT_SECRET, + 'model': 'Mozilla', + 'os_name': 'Win32', + 'os_ver': '5.0 (Windows)', + 'app_ver': '5.0 (Windows)', + }).encode(), headers={ + 'Content-Type': 'application/json;charset=utf-8', + 'DEVICE-ID': ''.join(random.choices('0123456789abcdef', k=32)) + 'live', + 'X-APP-LANG': 'en', + 'X-APP-LOCALE': 'en-US', + 'X-CLIENT-ID': self._CLIENT_ID, + 'X-CLIENT-SECRET': self._CLIENT_SECRET, + 'X-PLATFORM': '7', + }), 'access_token') + if access_token and not self._is_jwt_expired(access_token): + self._set_cookie('.loco.com', 'access_token', access_token) + return access_token + def _real_extract(self, url): video_type, video_id = self._match_valid_url(url).group('type', 'id') webpage = self._download_webpage(url, video_id) stream = traverse_obj(self._search_nextjs_data(webpage, video_id), ( - 'props', 'pageProps', ('liveStreamData', 'stream'), {dict}, any, {require('stream info')})) + 'props', 'pageProps', ('liveStreamData', 'stream', 'liveStream'), {dict}, any, {require('stream info')})) + + if access_token := self._get_access_token(video_id): + self._request_webpage( + 'https://drm.loco.com/v1/streams/playback/', video_id, + 'Downloading video authorization', fatal=False, headers={ + 'authorization': access_token, + }, query={ + 'stream_uid': stream['uid'], + }) return { 'formats': self._extract_m3u8_formats(stream['conf']['hls'], video_id), diff --git a/yt_dlp/extractor/manyvids.py b/yt_dlp/extractor/manyvids.py index 8caa8f87f..1356169bf 100644 --- a/yt_dlp/extractor/manyvids.py +++ b/yt_dlp/extractor/manyvids.py @@ -1,31 +1,38 @@ -import re - from .common import InfoExtractor from ..utils import ( + clean_html, determine_ext, - extract_attributes, int_or_none, - str_to_int, + join_nonempty, + parse_count, + parse_duration, + parse_iso8601, url_or_none, - urlencode_postdata, ) +from ..utils.traversal import traverse_obj class ManyVidsIE(InfoExtractor): - _WORKING = False _VALID_URL = r'(?i)https?://(?:www\.)?manyvids\.com/video/(?P\d+)' _TESTS = [{ # preview video - 'url': 'https://www.manyvids.com/Video/133957/everthing-about-me/', - 'md5': '03f11bb21c52dd12a05be21a5c7dcc97', + 'url': 'https://www.manyvids.com/Video/530341/mv-tips-tricks', + 'md5': '738dc723f7735ee9602f7ea352a6d058', 'info_dict': { - 'id': '133957', + 'id': '530341-preview', 'ext': 'mp4', - 'title': 'everthing about me (Preview)', - 'uploader': 'ellyxxix', + 'title': 'MV Tips & Tricks (Preview)', + 'description': r're:I will take you on a tour around .{1313}$', + 'thumbnail': r're:https://cdn5\.manyvids\.com/php_uploads/video_images/DestinyDiaz/.+\.jpg', + 'uploader': 'DestinyDiaz', 'view_count': int, 'like_count': int, + 'release_timestamp': 1508419904, + 'tags': ['AdultSchool', 'BBW', 'SFW', 'TeacherFetish'], + 'release_date': '20171019', + 'duration': 3167.0, }, + 'expected_warnings': ['Only extracting preview'], }, { # full video 'url': 'https://www.manyvids.com/Video/935718/MY-FACE-REVEAL/', @@ -34,129 +41,68 @@ class ManyVidsIE(InfoExtractor): 'id': '935718', 'ext': 'mp4', 'title': 'MY FACE REVEAL', - 'description': 'md5:ec5901d41808b3746fed90face161612', + 'description': r're:Today is the day!! I am finally taking off my mask .{445}$', + 'thumbnail': r're:https://ods\.manyvids\.com/1001061960/3aa5397f2a723ec4597e344df66ab845/screenshots/.+\.jpg', 'uploader': 'Sarah Calanthe', 'view_count': int, 'like_count': int, + 'release_date': '20181110', + 'tags': ['EyeContact', 'Interviews', 'MaskFetish', 'MouthFetish', 'Redhead'], + 'release_timestamp': 1541851200, + 'duration': 224.0, }, }] + _API_BASE = 'https://www.manyvids.com/bff/store/video' def _real_extract(self, url): video_id = self._match_id(url) + video_data = self._download_json(f'{self._API_BASE}/{video_id}/private', video_id)['data'] + formats, preview_only = [], True - real_url = f'https://www.manyvids.com/video/{video_id}/gtm.js' - try: - webpage = self._download_webpage(real_url, video_id) - except Exception: - # probably useless fallback - webpage = self._download_webpage(url, video_id) - - info = self._search_regex( - r'''(]*\bid\s*=\s*(['"])pageMetaDetails\2[^>]*>)''', - webpage, 'meta details', default='') - info = extract_attributes(info) - - player = self._search_regex( - r'''(]*\bid\s*=\s*(['"])rmpPlayerStream\2[^>]*>)''', - webpage, 'player details', default='') - player = extract_attributes(player) - - video_urls_and_ids = ( - (info.get('data-meta-video'), 'video'), - (player.get('data-video-transcoded'), 'transcoded'), - (player.get('data-video-filepath'), 'filepath'), - (self._og_search_video_url(webpage, secure=False, default=None), 'og_video'), - ) - - def txt_or_none(s, default=None): - return (s.strip() or default) if isinstance(s, str) else default - - uploader = txt_or_none(info.get('data-meta-author')) - - def mung_title(s): - if uploader: - s = re.sub(rf'^\s*{re.escape(uploader)}\s+[|-]', '', s) - return txt_or_none(s) - - title = ( - mung_title(info.get('data-meta-title')) - or self._html_search_regex( - (r']+class=["\']item-title[^>]+>([^<]+)', - r']+class=["\']h2 m-0["\'][^>]*>([^<]+)'), - webpage, 'title', default=None) - or self._html_search_meta( - 'twitter:title', webpage, 'title', fatal=True)) - - title = re.sub(r'\s*[|-]\s+ManyVids\s*$', '', title) or title - - if any(p in webpage for p in ('preview_videos', '_preview.mp4')): - title += ' (Preview)' - - mv_token = self._search_regex( - r'data-mvtoken=(["\'])(?P(?:(?!\1).)+)\1', webpage, - 'mv token', default=None, group='value') - - if mv_token: - # Sets some cookies - self._download_webpage( - 'https://www.manyvids.com/includes/ajax_repository/you_had_me_at_hello.php', - video_id, note='Setting format cookies', fatal=False, - data=urlencode_postdata({ - 'mvtoken': mv_token, - 'vid': video_id, - }), headers={ - 'Referer': url, - 'X-Requested-With': 'XMLHttpRequest', - }) - - formats = [] - for v_url, fmt in video_urls_and_ids: - v_url = url_or_none(v_url) - if not v_url: + for format_id, path in [ + ('preview', ['teaser', 'filepath']), + ('transcoded', ['transcodedFilepath']), + ('filepath', ['filepath']), + ]: + format_url = traverse_obj(video_data, (*path, {url_or_none})) + if not format_url: continue - if determine_ext(v_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - v_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls')) + if determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats(format_url, video_id, 'mp4', m3u8_id=format_id)) else: formats.append({ - 'url': v_url, - 'format_id': fmt, + 'url': format_url, + 'format_id': format_id, + 'preference': -10 if format_id == 'preview' else None, + 'quality': 10 if format_id == 'filepath' else None, + 'height': int_or_none( + self._search_regex(r'_(\d{2,3}[02468])_', format_url, 'height', default=None)), }) + if format_id != 'preview': + preview_only = False - self._remove_duplicate_formats(formats) + metadata = traverse_obj( + self._download_json(f'{self._API_BASE}/{video_id}', video_id, fatal=False), 'data') + title = traverse_obj(metadata, ('title', {clean_html})) - for f in formats: - if f.get('height') is None: - f['height'] = int_or_none( - self._search_regex(r'_(\d{2,3}[02468])_', f['url'], 'video height', default=None)) - if '/preview/' in f['url']: - f['format_id'] = '_'.join(filter(None, (f.get('format_id'), 'preview'))) - f['preference'] = -10 - if 'transcoded' in f['format_id']: - f['preference'] = f.get('preference', -1) - 1 - - def get_likes(): - likes = self._search_regex( - rf'''(]*\bdata-id\s*=\s*(['"]){video_id}\2[^>]*>)''', - webpage, 'likes', default='') - likes = extract_attributes(likes) - return int_or_none(likes.get('data-likes')) - - def get_views(): - return str_to_int(self._html_search_regex( - r'''(?s)]*\bclass\s*=["']views-wrapper\b[^>]+>.+?]+>\s*(\d[\d,.]*)\s*''', - webpage, 'view count', default=None)) + if preview_only: + title = join_nonempty(title, '(Preview)', delim=' ') + video_id += '-preview' + self.report_warning( + f'Only extracting preview. Video may be paid or subscription only. {self._login_hint()}') return { 'id': video_id, 'title': title, 'formats': formats, - 'description': txt_or_none(info.get('data-meta-description')), - 'uploader': txt_or_none(info.get('data-meta-author')), - 'thumbnail': ( - url_or_none(info.get('data-meta-image')) - or url_or_none(player.get('data-video-screenshot'))), - 'view_count': get_views(), - 'like_count': get_likes(), + **traverse_obj(metadata, { + 'description': ('description', {clean_html}), + 'uploader': ('model', 'displayName', {clean_html}), + 'thumbnail': (('screenshot', 'thumbnail'), {url_or_none}, any), + 'view_count': ('views', {parse_count}), + 'like_count': ('likes', {parse_count}), + 'release_timestamp': ('launchDate', {parse_iso8601}), + 'duration': ('videoDuration', {parse_duration}), + 'tags': ('tagList', ..., 'label', {str}, filter, all, filter), + }), } diff --git a/yt_dlp/extractor/panopto.py b/yt_dlp/extractor/panopto.py index 91f105519..9f307a53e 100644 --- a/yt_dlp/extractor/panopto.py +++ b/yt_dlp/extractor/panopto.py @@ -14,8 +14,9 @@ int_or_none, parse_qs, srt_subtitles_timecode, - traverse_obj, + url_or_none, ) +from ..utils.traversal import traverse_obj class PanoptoBaseIE(InfoExtractor): @@ -345,21 +346,16 @@ def _extract_streams_formats_and_subtitles(self, video_id, streams, **fmt_kwargs subtitles = {} for stream in streams or []: stream_formats = [] - http_stream_url = stream.get('StreamHttpUrl') - stream_url = stream.get('StreamUrl') - - if http_stream_url: - stream_formats.append({'url': http_stream_url}) - - if stream_url: + for stream_url in set(traverse_obj(stream, (('StreamHttpUrl', 'StreamUrl'), {url_or_none}))): media_type = stream.get('ViewerMediaFileTypeName') if media_type in ('hls', ): - m3u8_formats, stream_subtitles = self._extract_m3u8_formats_and_subtitles(stream_url, video_id) - stream_formats.extend(m3u8_formats) - subtitles = self._merge_subtitles(subtitles, stream_subtitles) + fmts, subs = self._extract_m3u8_formats_and_subtitles(stream_url, video_id, m3u8_id='hls', fatal=False) + stream_formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) else: stream_formats.append({ 'url': stream_url, + 'ext': media_type, }) for fmt in stream_formats: fmt.update({ diff --git a/yt_dlp/extractor/rtve.py b/yt_dlp/extractor/rtve.py index 7e0b666ab..2812d9305 100644 --- a/yt_dlp/extractor/rtve.py +++ b/yt_dlp/extractor/rtve.py @@ -1,35 +1,142 @@ import base64 import io import struct +import urllib.parse from .common import InfoExtractor from ..utils import ( ExtractorError, + clean_html, determine_ext, float_or_none, + make_archive_id, + parse_iso8601, qualities, - remove_end, - remove_start, - try_get, + url_or_none, ) +from ..utils.traversal import subs_list_to_dict, traverse_obj -class RTVEALaCartaIE(InfoExtractor): +class RTVEBaseIE(InfoExtractor): + # Reimplementation of https://js2.rtve.es/pages/app-player/3.5.1/js/pf_video.js + @staticmethod + def _decrypt_url(png): + encrypted_data = io.BytesIO(base64.b64decode(png)[8:]) + while True: + length_data = encrypted_data.read(4) + length = struct.unpack('!I', length_data)[0] + chunk_type = encrypted_data.read(4) + if chunk_type == b'IEND': + break + data = encrypted_data.read(length) + if chunk_type == b'tEXt': + data = bytes(filter(None, data)) + alphabet_data, _, url_data = data.partition(b'#') + quality_str, _, url_data = url_data.rpartition(b'%%') + quality_str = quality_str.decode() or '' + alphabet = RTVEBaseIE._get_alphabet(alphabet_data) + url = RTVEBaseIE._get_url(alphabet, url_data) + yield quality_str, url + encrypted_data.read(4) # CRC + + @staticmethod + def _get_url(alphabet, url_data): + url = '' + f = 0 + e = 3 + b = 1 + for char in url_data.decode('iso-8859-1'): + if f == 0: + l = int(char) * 10 + f = 1 + else: + if e == 0: + l += int(char) + url += alphabet[l] + e = (b + 3) % 4 + f = 0 + b += 1 + else: + e -= 1 + return url + + @staticmethod + def _get_alphabet(alphabet_data): + alphabet = [] + e = 0 + d = 0 + for char in alphabet_data.decode('iso-8859-1'): + if d == 0: + alphabet.append(char) + d = e = (e + 1) % 4 + else: + d -= 1 + return alphabet + + def _extract_png_formats_and_subtitles(self, video_id, media_type='videos'): + formats, subtitles = [], {} + q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL']) + for manager in ('rtveplayw', 'default'): + png = self._download_webpage( + f'http://www.rtve.es/ztnr/movil/thumbnail/{manager}/{media_type}/{video_id}.png', + video_id, 'Downloading url information', query={'q': 'v2'}, fatal=False) + if not png: + continue + + for quality, video_url in self._decrypt_url(png): + ext = determine_ext(video_url) + if ext == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + video_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + elif ext == 'mpd': + fmts, subs = self._extract_mpd_formats_and_subtitles( + video_url, video_id, 'dash', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + formats.append({ + 'format_id': quality, + 'quality': q(quality), + 'url': video_url, + }) + return formats, subtitles + + def _parse_metadata(self, metadata): + return traverse_obj(metadata, { + 'title': ('title', {str.strip}), + 'alt_title': ('alt', {str.strip}), + 'description': ('description', {clean_html}), + 'timestamp': ('dateOfEmission', {parse_iso8601(delimiter=' ')}), + 'release_timestamp': ('publicationDate', {parse_iso8601(delimiter=' ')}), + 'modified_timestamp': ('modificationDate', {parse_iso8601(delimiter=' ')}), + 'thumbnail': (('thumbnail', 'image', 'imageSEO'), {url_or_none}, any), + 'duration': ('duration', {float_or_none(scale=1000)}), + 'is_live': ('live', {bool}), + 'series': (('programTitle', ('programInfo', 'title')), {clean_html}, any), + }) + + +class RTVEALaCartaIE(RTVEBaseIE): IE_NAME = 'rtve.es:alacarta' - IE_DESC = 'RTVE a la carta' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/(m/)?(alacarta/videos|filmoteca)/[^/]+/[^/]+/(?P\d+)' + IE_DESC = 'RTVE a la carta and Play' + _VALID_URL = [ + r'https?://(?:www\.)?rtve\.es/(?:m/)?(?:(?:alacarta|play)/videos|filmoteca)/(?!directo)(?:[^/?#]+/){2}(?P\d+)', + r'https?://(?:www\.)?rtve\.es/infantil/serie/[^/?#]+/video/[^/?#]+/(?P\d+)', + ] _TESTS = [{ - 'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/', - 'md5': '1d49b7e1ca7a7502c56a4bf1b60f1b43', + 'url': 'http://www.rtve.es/alacarta/videos/la-aventura-del-saber/aventuraentornosilla/3088905/', + 'md5': 'a964547824359a5753aef09d79fe984b', 'info_dict': { - 'id': '2491869', + 'id': '3088905', 'ext': 'mp4', - 'title': 'Balonmano - Swiss Cup masculina. Final: España-Suecia', - 'duration': 5024.566, - 'series': 'Balonmano', + 'title': 'En torno a la silla', + 'duration': 1216.981, + 'series': 'La aventura del Saber', + 'thumbnail': 'https://img2.rtve.es/v/aventuraentornosilla_3088905.png', }, - 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], }, { 'note': 'Live stream', 'url': 'http://www.rtve.es/alacarta/videos/television/24h-live/1694255/', @@ -38,140 +145,88 @@ class RTVEALaCartaIE(InfoExtractor): 'ext': 'mp4', 'title': 're:^24H LIVE [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'is_live': True, + 'live_status': 'is_live', + 'thumbnail': r're:https://img2\.rtve\.es/v/.*\.png', }, 'params': { 'skip_download': 'live stream', }, }, { 'url': 'http://www.rtve.es/alacarta/videos/servir-y-proteger/servir-proteger-capitulo-104/4236788/', - 'md5': 'd850f3c8731ea53952ebab489cf81cbf', + 'md5': 'f3cf0d1902d008c48c793e736706c174', 'info_dict': { 'id': '4236788', 'ext': 'mp4', - 'title': 'Servir y proteger - Capítulo 104', - 'duration': 3222.0, + 'title': 'Episodio 104', + 'duration': 3222.8, + 'thumbnail': r're:https://img2\.rtve\.es/v/.*\.png', + 'series': 'Servir y proteger', }, - 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], }, { 'url': 'http://www.rtve.es/m/alacarta/videos/cuentame-como-paso/cuentame-como-paso-t16-ultimo-minuto-nuestra-vida-capitulo-276/2969138/?media=tve', 'only_matching': True, }, { 'url': 'http://www.rtve.es/filmoteca/no-do/not-1-introduccion-primer-noticiario-espanol/1465256/', 'only_matching': True, + }, { + 'url': 'https://www.rtve.es/play/videos/saber-vivir/07-07-24/16177116/', + 'md5': 'a5b24fcdfa3ff5cb7908aba53d22d4b6', + 'info_dict': { + 'id': '16177116', + 'ext': 'mp4', + 'title': 'Saber vivir - 07/07/24', + 'thumbnail': r're:https://img2\.rtve\.es/v/.*\.png', + 'duration': 2162.68, + 'series': 'Saber vivir', + }, + }, { + 'url': 'https://www.rtve.es/infantil/serie/agus-lui-churros-crafts/video/gusano/7048976/', + 'info_dict': { + 'id': '7048976', + 'ext': 'mp4', + 'title': 'Gusano', + 'thumbnail': r're:https://img2\.rtve\.es/v/.*\.png', + 'duration': 292.86, + 'series': 'Agus & Lui: Churros y Crafts', + '_old_archive_ids': ['rtveinfantil 7048976'], + }, }] - def _real_initialize(self): - user_agent_b64 = base64.b64encode(self.get_param('http_headers')['User-Agent'].encode()).decode('utf-8') - self._manager = self._download_json( - 'http://www.rtve.es/odin/loki/' + user_agent_b64, - None, 'Fetching manager info')['manager'] - - @staticmethod - def _decrypt_url(png): - encrypted_data = io.BytesIO(base64.b64decode(png)[8:]) - while True: - length = struct.unpack('!I', encrypted_data.read(4))[0] - chunk_type = encrypted_data.read(4) - if chunk_type == b'IEND': - break - data = encrypted_data.read(length) - if chunk_type == b'tEXt': - alphabet_data, text = data.split(b'\0') - quality, url_data = text.split(b'%%') - alphabet = [] - e = 0 - d = 0 - for l in alphabet_data.decode('iso-8859-1'): - if d == 0: - alphabet.append(l) - d = e = (e + 1) % 4 - else: - d -= 1 - url = '' - f = 0 - e = 3 - b = 1 - for letter in url_data.decode('iso-8859-1'): - if f == 0: - l = int(letter) * 10 - f = 1 - else: - if e == 0: - l += int(letter) - url += alphabet[l] - e = (b + 3) % 4 - f = 0 - b += 1 - else: - e -= 1 - - yield quality.decode(), url - encrypted_data.read(4) # CRC - - def _extract_png_formats(self, video_id): - png = self._download_webpage( - f'http://www.rtve.es/ztnr/movil/thumbnail/{self._manager}/videos/{video_id}.png', - video_id, 'Downloading url information', query={'q': 'v2'}) - q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL']) - formats = [] - for quality, video_url in self._decrypt_url(png): - ext = determine_ext(video_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - video_url, video_id, 'dash', fatal=False)) - else: - formats.append({ - 'format_id': quality, - 'quality': q(quality), - 'url': video_url, - }) - return formats + def _get_subtitles(self, video_id): + subtitle_data = self._download_json( + f'https://api2.rtve.es/api/videos/{video_id}/subtitulos.json', video_id, + 'Downloading subtitles info') + return traverse_obj(subtitle_data, ('page', 'items', ..., { + 'id': ('lang', {str}), + 'url': ('src', {url_or_none}), + }, all, {subs_list_to_dict(lang='es')})) def _real_extract(self, url): video_id = self._match_id(url) - info = self._download_json( + metadata = self._download_json( f'http://www.rtve.es/api/videos/{video_id}/config/alacarta_videos.json', video_id)['page']['items'][0] - if info['state'] == 'DESPU': + if metadata['state'] == 'DESPU': raise ExtractorError('The video is no longer available', expected=True) - title = info['title'].strip() - formats = self._extract_png_formats(video_id) + formats, subtitles = self._extract_png_formats_and_subtitles(video_id) - subtitles = None - sbt_file = info.get('sbtFile') - if sbt_file: - subtitles = self.extract_subtitles(video_id, sbt_file) + self._merge_subtitles(self.extract_subtitles(video_id), target=subtitles) - is_live = info.get('live') is True + is_infantil = urllib.parse.urlparse(url).path.startswith('/infantil/') return { 'id': video_id, - 'title': title, 'formats': formats, - 'thumbnail': info.get('image'), 'subtitles': subtitles, - 'duration': float_or_none(info.get('duration'), 1000), - 'is_live': is_live, - 'series': info.get('programTitle'), + **self._parse_metadata(metadata), + '_old_archive_ids': [make_archive_id('rtveinfantil', video_id)] if is_infantil else None, } - def _get_subtitles(self, video_id, sub_file): - subs = self._download_json( - sub_file + '.json', video_id, - 'Downloading subtitles info')['page']['items'] - return dict( - (s['lang'], [{'ext': 'vtt', 'url': s['src']}]) - for s in subs) - -class RTVEAudioIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE +class RTVEAudioIE(RTVEBaseIE): IE_NAME = 'rtve.es:audio' IE_DESC = 'RTVE audio' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/(alacarta|play)/audios/[^/]+/[^/]+/(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/(alacarta|play)/audios/(?:[^/?#]+/){2}(?P\d+)' _TESTS = [{ 'url': 'https://www.rtve.es/alacarta/audios/a-hombros-de-gigantes/palabra-ingeniero-codigos-informaticos-27-04-21/5889192/', @@ -180,9 +235,11 @@ class RTVEAudioIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE 'id': '5889192', 'ext': 'mp3', 'title': 'Códigos informáticos', - 'thumbnail': r're:https?://.+/1598856591583.jpg', + 'alt_title': 'Códigos informáticos - Escuchar ahora', 'duration': 349.440, 'series': 'A hombros de gigantes', + 'description': 'md5:72b0d7c1ca20fd327bdfff7ac0171afb', + 'thumbnail': 'https://img2.rtve.es/a/palabra-ingeniero-codigos-informaticos-270421_5889192.png', }, }, { 'url': 'https://www.rtve.es/play/audios/en-radio-3/ignatius-farray/5791165/', @@ -191,9 +248,11 @@ class RTVEAudioIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE 'id': '5791165', 'ext': 'mp3', 'title': 'Ignatius Farray', + 'alt_title': 'En Radio 3 - Ignatius Farray - 13/02/21 - escuchar ahora', 'thumbnail': r're:https?://.+/1613243011863.jpg', 'duration': 3559.559, 'series': 'En Radio 3', + 'description': 'md5:124aa60b461e0b1724a380bad3bc4040', }, }, { 'url': 'https://www.rtve.es/play/audios/frankenstein-o-el-moderno-prometeo/capitulo-26-ultimo-muerte-victor-juan-jose-plans-mary-shelley/6082623/', @@ -202,126 +261,101 @@ class RTVEAudioIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE 'id': '6082623', 'ext': 'mp3', 'title': 'Capítulo 26 y último: La muerte de Victor', + 'alt_title': 'Frankenstein o el moderno Prometeo - Capítulo 26 y último: La muerte de Victor', 'thumbnail': r're:https?://.+/1632147445707.jpg', 'duration': 3174.086, 'series': 'Frankenstein o el moderno Prometeo', + 'description': 'md5:4ee6fcb82ebe2e46d267e1d1c1a8f7b5', }, }] - def _extract_png_formats(self, audio_id): - """ - This function retrieves media related png thumbnail which obfuscate - valuable information about the media. This information is decrypted - via base class _decrypt_url function providing media quality and - media url - """ - png = self._download_webpage( - f'http://www.rtve.es/ztnr/movil/thumbnail/{self._manager}/audios/{audio_id}.png', - audio_id, 'Downloading url information', query={'q': 'v2'}) - q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL']) - formats = [] - for quality, audio_url in self._decrypt_url(png): - ext = determine_ext(audio_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - audio_url, audio_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - audio_url, audio_id, 'dash', fatal=False)) - else: - formats.append({ - 'format_id': quality, - 'quality': q(quality), - 'url': audio_url, - }) - return formats - def _real_extract(self, url): audio_id = self._match_id(url) - info = self._download_json( - f'https://www.rtve.es/api/audios/{audio_id}.json', - audio_id)['page']['items'][0] + metadata = self._download_json( + f'https://www.rtve.es/api/audios/{audio_id}.json', audio_id)['page']['items'][0] + + formats, subtitles = self._extract_png_formats_and_subtitles(audio_id, media_type='audios') return { 'id': audio_id, - 'title': info['title'].strip(), - 'thumbnail': info.get('thumbnail'), - 'duration': float_or_none(info.get('duration'), 1000), - 'series': try_get(info, lambda x: x['programInfo']['title']), - 'formats': self._extract_png_formats(audio_id), + 'formats': formats, + 'subtitles': subtitles, + **self._parse_metadata(metadata), } -class RTVEInfantilIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE - IE_NAME = 'rtve.es:infantil' - IE_DESC = 'RTVE infantil' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/[^/]+/video/[^/]+/(?P[0-9]+)/' - - _TESTS = [{ - 'url': 'http://www.rtve.es/infantil/serie/cleo/video/maneras-vivir/3040283/', - 'md5': '5747454717aedf9f9fdf212d1bcfc48d', - 'info_dict': { - 'id': '3040283', - 'ext': 'mp4', - 'title': 'Maneras de vivir', - 'thumbnail': r're:https?://.+/1426182947956\.JPG', - 'duration': 357.958, - }, - 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], - }] - - -class RTVELiveIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE +class RTVELiveIE(RTVEBaseIE): IE_NAME = 'rtve.es:live' IE_DESC = 'RTVE.es live streams' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/directo/(?P[a-zA-Z0-9-]+)' + _VALID_URL = [ + r'https?://(?:www\.)?rtve\.es/directo/(?P[a-zA-Z0-9-]+)', + r'https?://(?:www\.)?rtve\.es/play/videos/directo/[^/?#]+/(?P[a-zA-Z0-9-]+)', + ] _TESTS = [{ 'url': 'http://www.rtve.es/directo/la-1/', 'info_dict': { 'id': 'la-1', 'ext': 'mp4', - 'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'live_status': 'is_live', + 'title': str, + 'description': str, + 'thumbnail': r're:https://img\d\.rtve\.es/resources/thumbslive/\d+\.jpg', + 'timestamp': int, + 'upload_date': str, }, - 'params': { - 'skip_download': 'live stream', + 'params': {'skip_download': 'live stream'}, + }, { + 'url': 'https://www.rtve.es/play/videos/directo/deportes/tdp/', + 'info_dict': { + 'id': 'tdp', + 'ext': 'mp4', + 'live_status': 'is_live', + 'title': str, + 'description': str, + 'thumbnail': r're:https://img2\d\.rtve\.es/resources/thumbslive/\d+\.jpg', + 'timestamp': int, + 'upload_date': str, }, + 'params': {'skip_download': 'live stream'}, + }, { + 'url': 'http://www.rtve.es/play/videos/directo/canales-lineales/la-1/', + 'only_matching': True, }] def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = remove_end(self._og_search_title(webpage), ' en directo en RTVE.es') - title = remove_start(title, 'Estoy viendo ') - vidplayer_id = self._search_regex( - (r'playerId=player([0-9]+)', - r'class=["\'].*?\blive_mod\b.*?["\'][^>]+data-assetid=["\'](\d+)', - r'data-id=["\'](\d+)'), - webpage, 'internal video ID') + data_setup = self._search_json( + r']+class="[^"]*videoPlayer[^"]*"[^>]*data-setup=\'', + webpage, 'data_setup', video_id) + + formats, subtitles = self._extract_png_formats_and_subtitles(data_setup['idAsset']) return { 'id': video_id, - 'title': title, - 'formats': self._extract_png_formats(vidplayer_id), + **self._search_json_ld(webpage, video_id, fatal=False), + 'title': self._html_extract_title(webpage), + 'formats': formats, + 'subtitles': subtitles, 'is_live': True, } class RTVETelevisionIE(InfoExtractor): IE_NAME = 'rtve.es:television' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/television/[^/]+/[^/]+/(?P\d+).shtml' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/television/[^/?#]+/[^/?#]+/(?P\d+).shtml' _TEST = { - 'url': 'http://www.rtve.es/television/20160628/revolucion-del-movil/1364141.shtml', + 'url': 'https://www.rtve.es/television/20091103/video-inedito-del-8o-programa/299020.shtml', 'info_dict': { - 'id': '3069778', + 'id': '572515', 'ext': 'mp4', - 'title': 'Documentos TV - La revolución del móvil', - 'duration': 3496.948, + 'title': 'Clase inédita', + 'duration': 335.817, + 'thumbnail': r're:https://img2\.rtve\.es/v/.*\.png', + 'series': 'El coro de la cárcel', }, 'params': { 'skip_download': True, @@ -332,11 +366,8 @@ def _real_extract(self, url): page_id = self._match_id(url) webpage = self._download_webpage(url, page_id) - alacarta_url = self._search_regex( - r'data-location="alacarta_videos"[^<]+url":"(http://www\.rtve\.es/alacarta.+?)&', - webpage, 'alacarta url', default=None) - if alacarta_url is None: - raise ExtractorError( - 'The webpage doesn\'t contain any video', expected=True) + play_url = self._html_search_meta('contentUrl', webpage) + if play_url is None: + raise ExtractorError('The webpage doesn\'t contain any video', expected=True) - return self.url_result(alacarta_url, ie=RTVEALaCartaIE.ie_key()) + return self.url_result(play_url, ie=RTVEALaCartaIE.ie_key()) diff --git a/yt_dlp/extractor/tvp.py b/yt_dlp/extractor/tvp.py index da3082907..416cbab3c 100644 --- a/yt_dlp/extractor/tvp.py +++ b/yt_dlp/extractor/tvp.py @@ -513,7 +513,7 @@ def _parse_video(self, video, with_url=True): class TVPVODVideoIE(TVPVODBaseIE): IE_NAME = 'tvp:vod' - _VALID_URL = r'https?://vod\.tvp\.pl/(?P[a-z\d-]+,\d+)/[a-z\d-]+(?\d+)/?(?:[?#]|$)' + _VALID_URL = r'https?://vod\.tvp\.pl/(?P[a-z\d-]+,\d+)/[a-z\d-]+(?\d+)/?(?:[?#]|$)' _TESTS = [{ 'url': 'https://vod.tvp.pl/dla-dzieci,24/laboratorium-alchemika-odcinki,309338/odcinek-24,S01E24,311357', @@ -568,6 +568,9 @@ class TVPVODVideoIE(TVPVODBaseIE): 'live_status': 'is_live', 'thumbnail': 're:https?://.+', }, + }, { + 'url': 'https://vod.tvp.pl/informacje-i-publicystyka,205/konskie-2025-debata-przedwyborcza-odcinki,2028435/odcinek--1,S01E-1,2028419', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/tvw.py b/yt_dlp/extractor/tvw.py index 6f924d7b1..89556cf98 100644 --- a/yt_dlp/extractor/tvw.py +++ b/yt_dlp/extractor/tvw.py @@ -1,21 +1,13 @@ import json from .common import InfoExtractor -from ..utils import clean_html, extract_attributes, remove_end, unified_timestamp, url_or_none -from ..utils.traversal import find_elements, traverse_obj +from ..utils import clean_html, extract_attributes, parse_qs, remove_end, require, unified_timestamp, url_or_none +from ..utils.traversal import find_element, find_elements, traverse_obj -class TvwBaseIE(InfoExtractor): - def _get_title(self, webpage): - return remove_end(self._og_search_title(webpage, default=None), ' - TVW') - - def _get_description(self, webpage): - return self._og_search_description(webpage, default=None) - - -class TvwIE(TvwBaseIE): +class TvwIE(InfoExtractor): + IE_NAME = 'tvw' _VALID_URL = r'https?://(?:www\.)?tvw\.org/video/(?P[^/?#]+)' - _TESTS = [{ 'url': 'https://tvw.org/video/billy-frank-jr-statue-maquette-unveiling-ceremony-2024011211/', 'md5': '9ceb94fe2bb7fd726f74f16356825703', @@ -111,8 +103,8 @@ def _real_extract(self, url): 'display_id': display_id, 'formats': formats, 'subtitles': subtitles, - 'title': self._get_title(webpage), - 'description': self._get_description(webpage), + 'title': remove_end(self._og_search_title(webpage, default=None), ' - TVW'), + 'description': self._og_search_description(webpage, default=None), **traverse_obj(video_data, { 'title': ('title', {str}), 'description': ('description', {clean_html}), @@ -125,7 +117,7 @@ def _real_extract(self, url): } -class TvwNewsIE(TvwBaseIE): +class TvwNewsIE(InfoExtractor): IE_NAME = 'Tvw:News' _VALID_URL = r'https?://(?:www\.)?tvw\.org/(\d{4})/(0[1-9]|1[0-2])/(?P[^/?#]+)' _TESTS = [{ @@ -163,4 +155,44 @@ def _real_extract(self, url): return self.playlist_result( (self.url_result(f'https://tvw.org/watch?eventID={video_id}') for video_id in video_ids), playlist_id, - playlist_title=self._get_title(webpage), playlist_description=self._get_description(webpage)) + playlist_title=remove_end(self._og_search_title(webpage, default=None), ' - TVW'), playlist_description=self._og_search_description(webpage, default=None)) + + +class TvwTvChannelsIE(InfoExtractor): + IE_NAME = 'tvw:tvchannels' + _VALID_URL = r'https?://(?:www\.)?tvw\.org/tvchannels/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://tvw.org/tvchannels/air/', + 'info_dict': { + 'id': 'air', + 'ext': 'mp4', + 'title': r're:TVW Cable Channel Live Stream', + 'thumbnail': r're:https?://.+/.+\.(?:jpe?g|png)$', + 'live_status': 'is_live', + }, + }, { + 'url': 'https://tvw.org/tvchannels/tvw2/', + 'info_dict': { + 'id': 'tvw2', + 'ext': 'mp4', + 'title': r're:TVW-2 Broadcast Channel', + 'thumbnail': r're:https?://.+/.+\.(?:jpe?g|png)$', + 'live_status': 'is_live', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + m3u8_url = traverse_obj(webpage, ( + {find_element(id='invintus-persistent-stream-frame', html=True)}, {extract_attributes}, + 'src', {parse_qs}, 'encoder', 0, {json.loads}, 'live247URI', {url_or_none}, {require('stream url')})) + + return { + 'id': video_id, + 'formats': self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls', live=True), + 'title': remove_end(self._og_search_title(webpage, default=None), ' - TVW'), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'is_live': True, + } diff --git a/yt_dlp/extractor/youtube/_tab.py b/yt_dlp/extractor/youtube/_tab.py index 122300e60..c018ee8cf 100644 --- a/yt_dlp/extractor/youtube/_tab.py +++ b/yt_dlp/extractor/youtube/_tab.py @@ -524,10 +524,16 @@ def _entries(self, tab, item_id, ytcfg, delegated_session_id, visitor_data): response = self._extract_response( item_id=f'{item_id} page {page_num}', query=continuation, headers=headers, ytcfg=ytcfg, - check_get_keys=('continuationContents', 'onResponseReceivedActions', 'onResponseReceivedEndpoints')) + check_get_keys=( + 'continuationContents', 'onResponseReceivedActions', 'onResponseReceivedEndpoints', + # Playlist recommendations may return with no data - ignore + ('responseContext', 'serviceTrackingParams', ..., 'params', ..., lambda k, v: k == 'key' and v == 'GetRecommendedMusicPlaylists_rid'), + )) if not response: break + + continuation = None # Extracting updated visitor data is required to prevent an infinite extraction loop in some cases # See: https://github.com/ytdl-org/youtube-dl/issues/28702 visitor_data = self._extract_visitor_data(response) or visitor_data @@ -564,7 +570,13 @@ def _entries(self, tab, item_id, ytcfg, delegated_session_id, visitor_data): yield from func(video_items_renderer) continuation = continuation_list[0] or self._extract_continuation(video_items_renderer) - if not video_items_renderer: + # In the case only a continuation is returned, try to follow it. + # We extract this after trying to extract non-continuation items as otherwise this + # may be prioritized over other continuations. + # see: https://github.com/yt-dlp/yt-dlp/issues/12933 + continuation = continuation or self._extract_continuation({'contents': [continuation_item]}) + + if not continuation and not video_items_renderer: break @staticmethod @@ -999,14 +1011,14 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_mincount': 94, 'info_dict': { 'id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'title': 'Igor Kleiner Ph.D. - Playlists', + 'title': 'Igor Kleiner - Playlists', 'description': 'md5:15d7dd9e333cb987907fcb0d604b233a', - 'uploader': 'Igor Kleiner Ph.D.', + 'uploader': 'Igor Kleiner ', 'uploader_id': '@IgorDataScience', 'uploader_url': 'https://www.youtube.com/@IgorDataScience', - 'channel': 'Igor Kleiner Ph.D.', + 'channel': 'Igor Kleiner ', 'channel_id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'tags': ['критическое мышление', 'наука просто', 'математика', 'анализ данных'], + 'tags': 'count:23', 'channel_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg', 'channel_follower_count': int, }, @@ -1016,18 +1028,19 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_mincount': 94, 'info_dict': { 'id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'title': 'Igor Kleiner Ph.D. - Playlists', + 'title': 'Igor Kleiner - Playlists', 'description': 'md5:15d7dd9e333cb987907fcb0d604b233a', - 'uploader': 'Igor Kleiner Ph.D.', + 'uploader': 'Igor Kleiner ', 'uploader_id': '@IgorDataScience', 'uploader_url': 'https://www.youtube.com/@IgorDataScience', - 'tags': ['критическое мышление', 'наука просто', 'математика', 'анализ данных'], + 'tags': 'count:23', 'channel_id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'channel': 'Igor Kleiner Ph.D.', + 'channel': 'Igor Kleiner ', 'channel_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg', 'channel_follower_count': int, }, }, { + # TODO: fix channel_is_verified extraction 'note': 'playlists, series', 'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3', 'playlist_mincount': 5, @@ -1066,22 +1079,23 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', 'only_matching': True, }, { + # TODO: fix availability extraction 'note': 'basic, single video playlist', - 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', + 'url': 'https://www.youtube.com/playlist?list=PLt5yu3-wZAlSLRHmI1qNm0wjyVNWw1pCU', 'info_dict': { - 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', - 'title': 'youtube-dl public playlist', + 'id': 'PLt5yu3-wZAlSLRHmI1qNm0wjyVNWw1pCU', + 'title': 'single video playlist', 'description': '', 'tags': [], 'view_count': int, - 'modified_date': '20201130', - 'channel': 'Sergey M.', - 'channel_id': 'UCmlqkdCBesrv2Lak1mF_MxA', - 'channel_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', + 'modified_date': '20250417', + 'channel': 'cole-dlp-test-acc', + 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', 'availability': 'public', - 'uploader': 'Sergey M.', - 'uploader_url': 'https://www.youtube.com/@sergeym.6173', - 'uploader_id': '@sergeym.6173', + 'uploader': 'cole-dlp-test-acc', + 'uploader_url': 'https://www.youtube.com/@coletdjnz', + 'uploader_id': '@coletdjnz', }, 'playlist_count': 1, }, { @@ -1171,11 +1185,11 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, 'playlist_mincount': 17, }, { - 'note': 'Community tab', + 'note': 'Posts tab', 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community', 'info_dict': { 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Community', + 'title': 'lex will - Posts', 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', 'channel': 'lex will', 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', @@ -1188,30 +1202,14 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, 'playlist_mincount': 18, }, { - 'note': 'Channels tab', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Channels', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - 'channel': 'lex will', - 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', - 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'tags': ['bible', 'history', 'prophesy'], - 'channel_follower_count': int, - 'uploader_url': 'https://www.youtube.com/@lexwill718', - 'uploader_id': '@lexwill718', - 'uploader': 'lex will', - }, - 'playlist_mincount': 12, - }, { + # TODO: fix channel_is_verified extraction 'note': 'Search tab', 'url': 'https://www.youtube.com/c/3blue1brown/search?query=linear%20algebra', 'playlist_mincount': 40, 'info_dict': { 'id': 'UCYO_jab_esuFRV4b17AJtAw', 'title': '3Blue1Brown - Search - linear algebra', - 'description': 'md5:4d1da95432004b7ba840ebc895b6b4c9', + 'description': 'md5:602e3789e6a0cb7d9d352186b720e395', 'channel_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw', 'tags': ['Mathematics'], 'channel': '3Blue1Brown', @@ -1232,6 +1230,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', 'only_matching': True, }, { + # TODO: fix availability extraction 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.', 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', 'info_dict': { @@ -1294,24 +1293,25 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, 'playlist_mincount': 21, }, { + # TODO: fix availability extraction 'note': 'Playlist with "show unavailable videos" button', - 'url': 'https://www.youtube.com/playlist?list=UUTYLiWFZy8xtPwxFwX9rV7Q', + 'url': 'https://www.youtube.com/playlist?list=PLYwq8WOe86_xGmR7FrcJq8Sb7VW8K3Tt2', 'info_dict': { - 'title': 'Uploads from Phim Siêu Nhân Nhật Bản', - 'id': 'UUTYLiWFZy8xtPwxFwX9rV7Q', + 'title': 'The Memes Of 2010s.....', + 'id': 'PLYwq8WOe86_xGmR7FrcJq8Sb7VW8K3Tt2', 'view_count': int, - 'channel': 'Phim Siêu Nhân Nhật Bản', + 'channel': "I'm Not JiNxEd", 'tags': [], - 'description': '', - 'channel_url': 'https://www.youtube.com/channel/UCTYLiWFZy8xtPwxFwX9rV7Q', - 'channel_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q', + 'description': 'md5:44dc3b315ba69394feaafa2f40e7b2a1', + 'channel_url': 'https://www.youtube.com/channel/UC5H5H85D1QE5-fuWWQ1hdNg', + 'channel_id': 'UC5H5H85D1QE5-fuWWQ1hdNg', 'modified_date': r're:\d{8}', 'availability': 'public', - 'uploader_url': 'https://www.youtube.com/@phimsieunhannhatban', - 'uploader_id': '@phimsieunhannhatban', - 'uploader': 'Phim Siêu Nhân Nhật Bản', + 'uploader_url': 'https://www.youtube.com/@imnotjinxed1998', + 'uploader_id': '@imnotjinxed1998', + 'uploader': "I'm Not JiNxEd", }, - 'playlist_mincount': 200, + 'playlist_mincount': 150, 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], }, { 'note': 'Playlist with unavailable videos in page 7', @@ -1334,6 +1334,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_mincount': 1000, 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], }, { + # TODO: fix availability extraction 'note': 'https://github.com/ytdl-org/youtube-dl/issues/21844', 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', 'info_dict': { @@ -1384,7 +1385,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, { 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live', 'info_dict': { - 'id': 'hGkQjiJLjWQ', # This will keep changing + 'id': 'YDvsBbKfLPA', # This will keep changing 'ext': 'mp4', 'title': str, 'upload_date': r're:\d{8}', @@ -1409,6 +1410,8 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader_id': '@SkyNews', 'uploader': 'Sky News', 'channel_is_verified': True, + 'media_type': 'livestream', + 'timestamp': int, }, 'params': { 'skip_download': True, @@ -1496,6 +1499,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'url': 'https://music.youtube.com/browse/UC1a8OFewdjuLq6KlF8M_8Ng', 'only_matching': True, }, { + # TODO: fix availability extraction 'note': 'VLPL, should redirect to playlist?list=PL...', 'url': 'https://music.youtube.com/browse/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', 'info_dict': { @@ -1537,6 +1541,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, { # Destination channel with only a hidden self tab (tab id is UCtFRv9O2AHqOZjjynzrv-xg) # Treat as a general feed + # TODO: fix extraction 'url': 'https://www.youtube.com/channel/UCtFRv9O2AHqOZjjynzrv-xg', 'info_dict': { 'id': 'UCtFRv9O2AHqOZjjynzrv-xg', @@ -1560,21 +1565,21 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'expected_warnings': ['YouTube Music is not directly supported'], }, { 'note': 'unlisted single video playlist', - 'url': 'https://www.youtube.com/playlist?list=PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf', + 'url': 'https://www.youtube.com/playlist?list=PLt5yu3-wZAlQLfIN0MMgp0wVV6MP3bM4_', 'info_dict': { - 'id': 'PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf', - 'title': 'yt-dlp unlisted playlist test', + 'id': 'PLt5yu3-wZAlQLfIN0MMgp0wVV6MP3bM4_', + 'title': 'unlisted playlist', 'availability': 'unlisted', 'tags': [], - 'modified_date': '20220418', - 'channel': 'colethedj', + 'modified_date': '20250417', + 'channel': 'cole-dlp-test-acc', 'view_count': int, 'description': '', - 'channel_id': 'UC9zHu_mHU96r19o-wV5Qs1Q', - 'channel_url': 'https://www.youtube.com/channel/UC9zHu_mHU96r19o-wV5Qs1Q', - 'uploader_url': 'https://www.youtube.com/@colethedj1894', - 'uploader_id': '@colethedj1894', - 'uploader': 'colethedj', + 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'uploader_url': 'https://www.youtube.com/@coletdjnz', + 'uploader_id': '@coletdjnz', + 'uploader': 'cole-dlp-test-acc', }, 'playlist': [{ 'info_dict': { @@ -1596,6 +1601,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_count': 1, 'params': {'extract_flat': True}, }, { + # By default, recommended is always empty. 'note': 'API Fallback: Recommended - redirects to home page. Requires visitorData', 'url': 'https://www.youtube.com/feed/recommended', 'info_dict': { @@ -1603,7 +1609,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'title': 'recommended', 'tags': [], }, - 'playlist_mincount': 50, + 'playlist_count': 0, 'params': { 'skip_download': True, 'extractor_args': {'youtubetab': {'skip': ['webpage']}}, @@ -1628,6 +1634,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, 'skip': 'Query for sorting no longer works', }, { + # TODO: fix 'unviewable' issue with this playlist when reloading with unavailable videos 'note': 'API Fallback: Topic, should redirect to playlist?list=UU...', 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw', 'info_dict': { @@ -1654,11 +1661,12 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'url': 'https://www.youtube.com/channel/UCwVVpHQ2Cs9iGJfpdFngePQ', 'only_matching': True, }, { + # TODO: fix metadata extraction 'note': 'collaborative playlist (uploader name in the form "by and x other(s)")', 'url': 'https://www.youtube.com/playlist?list=PLx-_-Kk4c89oOHEDQAojOXzEzemXxoqx6', 'info_dict': { 'id': 'PLx-_-Kk4c89oOHEDQAojOXzEzemXxoqx6', - 'modified_date': '20220407', + 'modified_date': '20250115', 'channel_url': 'https://www.youtube.com/channel/UCKcqXmCcyqnhgpA5P0oHH_Q', 'tags': [], 'availability': 'unlisted', @@ -1692,6 +1700,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'expected_warnings': ['Preferring "ja"'], }, { # XXX: this should really check flat playlist entries, but the test suite doesn't support that + # TODO: fix availability extraction 'note': 'preferred lang set with playlist with translated video titles', 'url': 'https://www.youtube.com/playlist?list=PLt5yu3-wZAlQAaPZ5Z-rJoTdbT-45Q7c0', 'info_dict': { @@ -1714,6 +1723,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, { # shorts audio pivot for 2GtVksBMYFM. 'url': 'https://www.youtube.com/feed/sfv_audio_pivot?bp=8gUrCikSJwoLMkd0VmtzQk1ZRk0SCzJHdFZrc0JNWUZNGgsyR3RWa3NCTVlGTQ==', + # TODO: fix extraction 'info_dict': { 'id': 'sfv_audio_pivot', 'title': 'sfv_audio_pivot', @@ -1751,6 +1761,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_mincount': 8, }, { # Should get three playlists for videos, shorts and streams tabs + # TODO: fix channel_is_verified extraction 'url': 'https://www.youtube.com/channel/UCK9V2B22uJYu3N7eR_BT9QA', 'info_dict': { 'id': 'UCK9V2B22uJYu3N7eR_BT9QA', @@ -1758,7 +1769,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_follower_count': int, 'channel_id': 'UCK9V2B22uJYu3N7eR_BT9QA', 'channel_url': 'https://www.youtube.com/channel/UCK9V2B22uJYu3N7eR_BT9QA', - 'description': 'md5:49809d8bf9da539bc48ed5d1f83c33f2', + 'description': 'md5:01e53f350ab8ad6fcf7c4fedb3c1b99f', 'channel': 'Polka Ch. 尾丸ポルカ', 'tags': 'count:35', 'uploader_url': 'https://www.youtube.com/@OmaruPolka', @@ -1769,14 +1780,14 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_count': 3, }, { # Shorts tab with channel with handle - # TODO: fix channel description + # TODO: fix channel_is_verified extraction 'url': 'https://www.youtube.com/@NotJustBikes/shorts', 'info_dict': { 'id': 'UC0intLFzLaudFG-xAvUEO-A', 'title': 'Not Just Bikes - Shorts', 'tags': 'count:10', 'channel_url': 'https://www.youtube.com/channel/UC0intLFzLaudFG-xAvUEO-A', - 'description': 'md5:5e82545b3a041345927a92d0585df247', + 'description': 'md5:1d9fc1bad7f13a487299d1fe1712e031', 'channel_follower_count': int, 'channel_id': 'UC0intLFzLaudFG-xAvUEO-A', 'channel': 'Not Just Bikes', @@ -1797,7 +1808,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_url': 'https://www.youtube.com/channel/UC3eYAvjCVwNHgkaGbXX3sig', 'channel': '中村悠一', 'channel_follower_count': int, - 'description': 'md5:e744f6c93dafa7a03c0c6deecb157300', + 'description': 'md5:e8fd705073a594f27d6d6d020da560dc', 'uploader_url': 'https://www.youtube.com/@Yuichi-Nakamura', 'uploader_id': '@Yuichi-Nakamura', 'uploader': '中村悠一', @@ -1815,6 +1826,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'only_matching': True, }, { # No videos tab but has a shorts tab + # TODO: fix metadata extraction 'url': 'https://www.youtube.com/c/TKFShorts', 'info_dict': { 'id': 'UCgJ5_1F6yJhYLnyMszUdmUg', @@ -1851,6 +1863,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, { # Shorts url result in shorts tab # TODO: Fix channel id extraction + # TODO: fix test suite, 208163447408c78673b08c172beafe5c310fb167 broke this test 'url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA/shorts', 'info_dict': { 'id': 'UCiu-3thuViMebBjw_5nWYrA', @@ -1879,6 +1892,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'params': {'extract_flat': True}, }, { # Live video status should be extracted + # TODO: fix test suite, 208163447408c78673b08c172beafe5c310fb167 broke this test 'url': 'https://www.youtube.com/channel/UCQvWX73GQygcwXOTSf_VDVg/live', 'info_dict': { 'id': 'UCQvWX73GQygcwXOTSf_VDVg', @@ -1907,6 +1921,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_mincount': 1, }, { # Channel renderer metadata. Contains number of videos on the channel + # TODO: channels tab removed, change this test to use another page with channel renderer 'url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA/channels', 'info_dict': { 'id': 'UCiu-3thuViMebBjw_5nWYrA', @@ -1940,7 +1955,9 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, }], 'params': {'extract_flat': True}, + 'skip': 'channels tab removed', }, { + # TODO: fix channel_is_verified extraction 'url': 'https://www.youtube.com/@3blue1brown/about', 'info_dict': { 'id': '@3blue1brown', @@ -1950,7 +1967,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_id': 'UCYO_jab_esuFRV4b17AJtAw', 'channel': '3Blue1Brown', 'channel_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw', - 'description': 'md5:4d1da95432004b7ba840ebc895b6b4c9', + 'description': 'md5:602e3789e6a0cb7d9d352186b720e395', 'uploader_url': 'https://www.youtube.com/@3blue1brown', 'uploader_id': '@3blue1brown', 'uploader': '3Blue1Brown', @@ -1976,6 +1993,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_count': 5, }, { # Releases tab, with rich entry playlistRenderers (same as Podcasts tab) + # TODO: fix channel_is_verified extraction 'url': 'https://www.youtube.com/@AHimitsu/releases', 'info_dict': { 'id': 'UCgFwu-j5-xNJml2FtTrrB3A', @@ -2015,6 +2033,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_mincount': 100, 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], }, { + # TODO: fix channel_is_verified extraction 'note': 'Tags containing spaces', 'url': 'https://www.youtube.com/channel/UC7_YxT-KID8kRbqZo7MyscQ', 'playlist_count': 3, @@ -2035,6 +2054,24 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'challenges', 'sketches', 'scary games', 'funny games', 'rage games', 'mark fischbach'], }, + }, { + # https://github.com/yt-dlp/yt-dlp/issues/12933 + 'note': 'streams tab, some scheduled streams. Empty intermediate response with only continuation - must follow', + 'url': 'https://www.youtube.com/@sbcitygov/streams', + 'playlist_mincount': 150, + 'info_dict': { + 'id': 'UCH6-qfQwlUgz9SAf05jvc_w', + 'channel': 'sbcitygov', + 'channel_id': 'UCH6-qfQwlUgz9SAf05jvc_w', + 'title': 'sbcitygov - Live', + 'channel_follower_count': int, + 'description': 'md5:ca1a92059835c071e33b3db52f4a6d67', + 'uploader_id': '@sbcitygov', + 'uploader_url': 'https://www.youtube.com/@sbcitygov', + 'uploader': 'sbcitygov', + 'channel_url': 'https://www.youtube.com/channel/UCH6-qfQwlUgz9SAf05jvc_w', + 'tags': [], + }, }] @classmethod diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index 074a2a0d8..bcfe8b152 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -3646,6 +3646,8 @@ def feed_entry(name): if 'sign in' in reason.lower(): reason = remove_end(reason, 'This helps protect our community. Learn more') reason = f'{remove_end(reason.strip(), ".")}. {self._youtube_login_hint}' + elif get_first(playability_statuses, ('errorScreen', 'playerCaptchaViewModel', {dict})): + reason += '. YouTube is requiring a captcha challenge before playback' self.raise_no_formats(reason, expected=True) keywords = get_first(video_details, 'keywords', expected_type=list) or [] @@ -3874,7 +3876,7 @@ def process_language(container, base_url, lang_code, sub_name, query): if not traverse_obj(initial_data, 'contents'): self.report_warning('Incomplete data received in embedded initial data; re-fetching using API.') initial_data = None - if not initial_data: + if not initial_data and 'initial_data' not in self._configuration_arg('player_skip'): query = {'videoId': video_id} query.update(self._get_checkok_params()) initial_data = self._extract_response( diff --git a/yt_dlp/networking/__init__.py b/yt_dlp/networking/__init__.py index 1eaa0ee5f..39158a8cc 100644 --- a/yt_dlp/networking/__init__.py +++ b/yt_dlp/networking/__init__.py @@ -3,6 +3,7 @@ from .common import ( HEADRequest, + PATCHRequest, PUTRequest, Request, RequestDirector, diff --git a/yt_dlp/networking/common.py b/yt_dlp/networking/common.py index ddceaa9a9..e33769422 100644 --- a/yt_dlp/networking/common.py +++ b/yt_dlp/networking/common.py @@ -505,6 +505,7 @@ def copy(self): HEADRequest = functools.partial(Request, method='HEAD') +PATCHRequest = functools.partial(Request, method='PATCH') PUTRequest = functools.partial(Request, method='PUT')