diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 9c9ee64a8..68074a562 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -52,7 +52,7 @@ SSLError, network_exceptions, ) -from .networking.impersonate import ImpersonateRequestHandler +from .networking.impersonate import ImpersonateRequestHandler, ImpersonateTarget from .plugins import directories as plugin_directories, load_all_plugins from .postprocessor import ( EmbedThumbnailPP, @@ -3231,6 +3231,16 @@ def dl(self, name, info, subtitle=False, test=False): } else: params = self.params + + impersonate = info.pop('impersonate', None) + # Do not override --impersonate with extractor-specified impersonation + if params.get('impersonate') is None: + available_target, requested_targets = self._parse_impersonate_targets(impersonate) + if available_target: + info['impersonate'] = available_target + elif requested_targets: + self.report_warning(self._unavailable_targets_message(requested_targets), only_once=True) + fd = get_suitable_downloader(info, params, to_stdout=(name == '-'))(self, params) if not test: for ph in self._progress_hooks: @@ -4183,6 +4193,31 @@ def _impersonate_target_available(self, target): for rh in self._request_director.handlers.values() if isinstance(rh, ImpersonateRequestHandler)) + def _parse_impersonate_targets(self, impersonate): + if impersonate in (True, ''): + impersonate = ImpersonateTarget() + + requested_targets = [ + t if isinstance(t, ImpersonateTarget) else ImpersonateTarget.from_str(t) + for t in variadic(impersonate) + ] if impersonate else [] + + available_target = next(filter(self._impersonate_target_available, requested_targets), None) + + return available_target, requested_targets + + @staticmethod + def _unavailable_targets_message(requested_targets, note=None, is_error=False): + note = note or 'The extractor specified to use impersonation for this download' + specific_targets = ', '.join(filter(None, map(str, requested_targets))) + message = ( + 'no 
impersonate target is available' if not specific_targets + else f'none of these impersonate targets are available: {specific_targets}') + return ( + f'{note}, but {message}. {"See" if is_error else "If you encounter errors, then see"}' + f' https://github.com/yt-dlp/yt-dlp#impersonation ' + f'for information on installing the required dependencies') + def urlopen(self, req): """ Start an HTTP download """ if isinstance(req, str): diff --git a/yt_dlp/downloader/__init__.py b/yt_dlp/downloader/__init__.py index 9c34bd289..17458b9b9 100644 --- a/yt_dlp/downloader/__init__.py +++ b/yt_dlp/downloader/__init__.py @@ -99,7 +99,7 @@ def _get_suitable_downloader(info_dict, protocol, params, default): if external_downloader is None: if info_dict['to_stdout'] and FFmpegFD.can_merge_formats(info_dict, params): return FFmpegFD - elif external_downloader.lower() != 'native': + elif external_downloader.lower() != 'native' and info_dict.get('impersonate') is None: ed = get_external_downloader(external_downloader) if ed.can_download(info_dict, external_downloader): return ed diff --git a/yt_dlp/downloader/http.py b/yt_dlp/downloader/http.py index 90bfcaf55..073860f6f 100644 --- a/yt_dlp/downloader/http.py +++ b/yt_dlp/downloader/http.py @@ -27,6 +27,9 @@ class HttpFD(FileDownloader): def real_download(self, filename, info_dict): url = info_dict['url'] request_data = info_dict.get('request_data', None) + request_extensions = {} + if info_dict.get('impersonate') is not None: + request_extensions['impersonate'] = info_dict['impersonate'] class DownloadContext(dict): __getattr__ = dict.get @@ -109,7 +112,7 @@ def establish_connection(): if try_call(lambda: range_end >= ctx.content_len): range_end = ctx.content_len - 1 - request = Request(url, request_data, headers) + request = Request(url, request_data, headers, extensions=request_extensions) has_range = range_start is not None if has_range: request.headers['Range'] = f'bytes={int(range_start)}-{int_or_none(range_end) or ""}' diff 
--git a/yt_dlp/extractor/bandcamp.py b/yt_dlp/extractor/bandcamp.py index 939c2800e..d07d6e48b 100644 --- a/yt_dlp/extractor/bandcamp.py +++ b/yt_dlp/extractor/bandcamp.py @@ -7,6 +7,7 @@ from ..utils import ( KNOWN_EXTENSIONS, ExtractorError, + clean_html, extract_attributes, float_or_none, int_or_none, @@ -19,7 +20,7 @@ url_or_none, urljoin, ) -from ..utils.traversal import find_element, traverse_obj +from ..utils.traversal import find_element, find_elements, traverse_obj class BandcampIE(InfoExtractor): @@ -70,6 +71,9 @@ class BandcampIE(InfoExtractor): 'album': 'FTL: Advanced Edition Soundtrack', 'uploader_url': 'https://benprunty.bandcamp.com', 'uploader_id': 'benprunty', + 'tags': ['soundtrack', 'chiptunes', 'cinematic', 'electronic', 'video game music', 'California'], + 'artists': ['Ben Prunty'], + 'album_artists': ['Ben Prunty'], }, }, { # no free download, mp3 128 @@ -94,6 +98,9 @@ class BandcampIE(InfoExtractor): 'album': 'Call of the Mastodon', 'uploader_url': 'https://relapsealumni.bandcamp.com', 'uploader_id': 'relapsealumni', + 'tags': ['Philadelphia'], + 'artists': ['Mastodon'], + 'album_artists': ['Mastodon'], }, }, { # track from compilation album (artist/album_artist difference) @@ -118,6 +125,9 @@ class BandcampIE(InfoExtractor): 'album': 'DSK F/W 2016-2017 Free Compilation', 'uploader_url': 'https://diskotopia.bandcamp.com', 'uploader_id': 'diskotopia', + 'tags': ['Japan'], + 'artists': ['submerse'], + 'album_artists': ['Diskotopia'], }, }] @@ -252,6 +262,7 @@ def _real_extract(self, url): 'album': embed.get('album_title'), 'album_artist': album_artist, 'formats': formats, + 'tags': traverse_obj(webpage, ({find_elements(cls='tag')}, ..., {clean_html})), } diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index d601e1751..8a914abf0 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -38,7 +38,6 @@ TransportError, network_exceptions, ) -from ..networking.impersonate import ImpersonateTarget from ..utils 
import ( IDENTITY, JSON_LD_RE, @@ -259,6 +258,11 @@ class InfoExtractor: * key The key (as hex) used to decrypt fragments. If `key` is given, any key URI will be ignored * iv The IV (as hex) used to decrypt fragments + * impersonate Impersonate target(s). Can be any of the following entities: + * an instance of yt_dlp.networking.impersonate.ImpersonateTarget + * a string in the format of CLIENT[:OS] + * a list or a tuple of CLIENT[:OS] strings or ImpersonateTarget instances + * a boolean value; True means any impersonate target is sufficient * downloader_options A dictionary of downloader options (For internal use only) * http_chunk_size Chunk size for HTTP downloads @@ -336,6 +340,7 @@ class InfoExtractor: * "name": Name or description of the subtitles * "http_headers": A dictionary of additional HTTP headers to add to the request. + * "impersonate": Impersonate target(s); same as the "formats" field "ext" will be calculated from URL if missing automatic_captions: Like 'subtitles'; contains automatically generated captions instead of normal subtitles @@ -884,26 +889,17 @@ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fa extensions = {} - if impersonate in (True, ''): - impersonate = ImpersonateTarget() - requested_targets = [ - t if isinstance(t, ImpersonateTarget) else ImpersonateTarget.from_str(t) - for t in variadic(impersonate) - ] if impersonate else [] - - available_target = next(filter(self._downloader._impersonate_target_available, requested_targets), None) + available_target, requested_targets = self._downloader._parse_impersonate_targets(impersonate) if available_target: extensions['impersonate'] = available_target elif requested_targets: - message = 'The extractor is attempting impersonation, but ' - message += ( - 'no impersonate target is available' if not str(impersonate) - else f'none of these impersonate targets are available: "{", ".join(map(str, requested_targets))}"') - info_msg = ('see 
https://github.com/yt-dlp/yt-dlp#impersonation ' - 'for information on installing the required dependencies') + msg = 'The extractor is attempting impersonation' if require_impersonation: - raise ExtractorError(f'{message}; {info_msg}', expected=True) - self.report_warning(f'{message}; if you encounter errors, then {info_msg}', only_once=True) + raise ExtractorError( + self._downloader._unavailable_targets_message(requested_targets, note=msg, is_error=True), + expected=True) + self.report_warning( + self._downloader._unavailable_targets_message(requested_targets, note=msg), only_once=True) try: return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query, extensions)) diff --git a/yt_dlp/extractor/mlb.py b/yt_dlp/extractor/mlb.py index 562b93fc7..b2b35a712 100644 --- a/yt_dlp/extractor/mlb.py +++ b/yt_dlp/extractor/mlb.py @@ -457,12 +457,9 @@ def _extract_formats_and_subtitles(self, broadcast, video_id): self.report_warning(f'No formats available for {format_id} broadcast; skipping') return [], {} - cdn_headers = {'x-cdn-token': token} fmts, subs = self._extract_m3u8_formats_and_subtitles( - m3u8_url.replace(f'/{token}/', '/'), video_id, 'mp4', - m3u8_id=format_id, fatal=False, headers=cdn_headers) + m3u8_url, video_id, 'mp4', m3u8_id=format_id, fatal=False) for fmt in fmts: - fmt['http_headers'] = cdn_headers fmt.setdefault('format_note', join_nonempty(feed, medium, delim=' ')) fmt.setdefault('language', language) if fmt.get('vcodec') == 'none' and fmt['language'] == 'en': diff --git a/yt_dlp/extractor/tenplay.py b/yt_dlp/extractor/tenplay.py index 825da6516..dd4ea5658 100644 --- a/yt_dlp/extractor/tenplay.py +++ b/yt_dlp/extractor/tenplay.py @@ -7,11 +7,11 @@ class TenPlayIE(InfoExtractor): IE_NAME = '10play' - _VALID_URL = r'https?://(?:www\.)?10play\.com\.au/(?:[^/?#]+/)+(?P<id>tpv\d{6}[a-z]{5})' + _VALID_URL = r'https?://(?:www\.)?10(?:play)?\.com\.au/(?:[^/?#]+/)+(?P<id>tpv\d{6}[a-z]{5})' _NETRC_MACHINE = '10play' _TESTS = [{ # 
Geo-restricted to Australia - 'url': 'https://10play.com.au/australian-survivor/web-extras/season-10-brains-v-brawn-ii/myless-journey/tpv250414jdmtf', + 'url': 'https://10.com.au/australian-survivor/web-extras/season-10-brains-v-brawn-ii/myless-journey/tpv250414jdmtf', 'info_dict': { 'id': '7440980000013868', 'ext': 'mp4', @@ -32,7 +32,7 @@ class TenPlayIE(InfoExtractor): 'params': {'skip_download': 'm3u8'}, }, { # Geo-restricted to Australia - 'url': 'https://10play.com.au/neighbours/episodes/season-42/episode-9107/tpv240902nzqyp', + 'url': 'https://10.com.au/neighbours/episodes/season-42/episode-9107/tpv240902nzqyp', 'info_dict': { 'id': '9000000000091177', 'ext': 'mp4', @@ -55,7 +55,7 @@ class TenPlayIE(InfoExtractor): 'params': {'skip_download': 'm3u8'}, }, { # Geo-restricted to Australia; upgrading the m3u8 quality fails and we need the fallback - 'url': 'https://10play.com.au/tiny-chef-show/episodes/season-1/episode-2/tpv240228pofvt', + 'url': 'https://10.com.au/tiny-chef-show/episodes/season-1/episode-2/tpv240228pofvt', 'info_dict': { 'id': '9000000000084116', 'ext': 'mp4', @@ -77,6 +77,7 @@ class TenPlayIE(InfoExtractor): }, 'params': {'skip_download': 'm3u8'}, 'expected_warnings': ['Failed to download m3u8 information: HTTP Error 502'], + 'skip': 'video unavailable', }, { 'url': 'https://10play.com.au/how-to-stay-married/web-extras/season-1/terrys-talks-ep-1-embracing-change/tpv190915ylupc', 'only_matching': True, @@ -96,7 +97,7 @@ class TenPlayIE(InfoExtractor): def _real_extract(self, url): content_id = self._match_id(url) data = self._download_json( - 'https://10play.com.au/api/v1/videos/' + content_id, content_id) + 'https://10.com.au/api/v1/videos/' + content_id, content_id) video_data = self._download_json( f'https://vod.ten.com.au/api/videos/bcquery?command=find_videos_by_id&video_id={data["altId"]}', @@ -137,21 +138,24 @@ def _real_extract(self, url): class TenPlaySeasonIE(InfoExtractor): IE_NAME = '10play:season' - _VALID_URL = 
r'https?://(?:www\.)?10play\.com\.au/(?P<show>[^/?#]+)/episodes/(?P<season>[^/?#]+)/?(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?10(?:play)?\.com\.au/(?P<show>[^/?#]+)/episodes/(?P<season>[^/?#]+)/?(?:$|[?#])' _TESTS = [{ - 'url': 'https://10play.com.au/masterchef/episodes/season-15', + 'url': 'https://10.com.au/masterchef/episodes/season-15', 'info_dict': { 'title': 'Season 15', 'id': 'MTQ2NjMxOQ==', }, 'playlist_mincount': 50, }, { - 'url': 'https://10play.com.au/the-bold-and-the-beautiful-fast-tracked/episodes/season-2024', + 'url': 'https://10.com.au/the-bold-and-the-beautiful-fast-tracked/episodes/season-2024', 'info_dict': { 'title': 'Season 2024', 'id': 'Mjc0OTIw', }, 'playlist_mincount': 159, + }, { + 'url': 'https://10play.com.au/the-bold-and-the-beautiful-fast-tracked/episodes/season-2024', + 'only_matching': True, }] def _entries(self, load_more_url, display_id=None): @@ -172,7 +176,7 @@ def _entries(self, load_more_url, display_id=None): def _real_extract(self, url): show, season = self._match_valid_url(url).group('show', 'season') season_info = self._download_json( - f'https://10play.com.au/api/shows/{show}/episodes/{season}', f'{show}/{season}') + f'https://10.com.au/api/shows/{show}/episodes/{season}', f'{show}/{season}') episodes_carousel = traverse_obj(season_info, ( 'content', 0, 'components', ( diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index fc1f087ac..5968edc60 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -4056,6 +4056,7 @@ def process_language(container, base_url, lang_code, sub_name, client_name, quer 'ext': fmt, 'url': urljoin('https://www.youtube.com', update_url_query(base_url, query)), 'name': sub_name, + 'impersonate': True, STREAMING_DATA_CLIENT_NAME: client_name, }) diff --git a/yt_dlp/networking/_requests.py b/yt_dlp/networking/_requests.py index 555c21ac3..6582038fc 100644 --- a/yt_dlp/networking/_requests.py +++ b/yt_dlp/networking/_requests.py @@ -313,7 +313,7 @@ def 
_create_instance(self, cookiejar, legacy_ssl_support=None): max_retries=urllib3.util.retry.Retry(False), ) session.adapters.clear() - session.headers = requests.models.CaseInsensitiveDict({'Connection': 'keep-alive'}) + session.headers = requests.models.CaseInsensitiveDict() session.mount('https://', http_adapter) session.mount('http://', http_adapter) session.cookies = cookiejar @@ -322,6 +322,7 @@ def _create_instance(self, cookiejar, legacy_ssl_support=None): def _prepare_headers(self, _, headers): add_accept_encoding_header(headers, SUPPORTED_ENCODINGS) + headers.setdefault('Connection', 'keep-alive') def _send(self, request):