Merge branch 'yt-dlp:master' into niconico_error

2025-08-15 08:58:28 +00:00 · 2025-07-21 14:01:58 +09:00 · 2025-07-21 14:01:58 +09:00 · 6eaeebd5a7
commit 6eaeebd5a7
parent be07203b75 8820101aa3
9 changed files with 83 additions and 35 deletions
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@ -52,7 +52,7 @@
    SSLError,
    network_exceptions,
 )
-from .networking.impersonate import ImpersonateRequestHandler
+from .networking.impersonate import ImpersonateRequestHandler, ImpersonateTarget
 from .plugins import directories as plugin_directories, load_all_plugins
 from .postprocessor import (
    EmbedThumbnailPP,
@ -3231,6 +3231,16 @@ def dl(self, name, info, subtitle=False, test=False):
            }
        else:
            params = self.params
+
+        impersonate = info.pop('impersonate', None)
+        # Do not override --impersonate with extractor-specified impersonation
+        if params.get('impersonate') is None:
+            available_target, requested_targets = self._parse_impersonate_targets(impersonate)
+            if available_target:
+                info['impersonate'] = available_target
+            elif requested_targets:
+                self.report_warning(self._unavailable_targets_message(requested_targets), only_once=True)
+
        fd = get_suitable_downloader(info, params, to_stdout=(name == '-'))(self, params)
        if not test:
            for ph in self._progress_hooks:
@ -4183,6 +4193,31 @@ def _impersonate_target_available(self, target):
            for rh in self._request_director.handlers.values()
            if isinstance(rh, ImpersonateRequestHandler))

+    def _parse_impersonate_targets(self, impersonate):
+        """Normalize an impersonation request and select the first available target
+
+        @param impersonate  The requested impersonation: True or '' (replaced below with a
+                            default-constructed ImpersonateTarget), an ImpersonateTarget
+                            instance, a string accepted by ImpersonateTarget.from_str,
+                            or several of these; a falsy value yields no requested targets
+        @returns            A tuple (available_target, requested_targets) where
+                            requested_targets is the list of normalized targets and
+                            available_target is the first of them for which
+                            self._impersonate_target_available() is truthy, else None
+        """
+        # True and the empty string are both replaced with a bare ImpersonateTarget()
+        if impersonate in (True, ''):
+            impersonate = ImpersonateTarget()
+
+        # Strings are parsed via ImpersonateTarget.from_str; existing instances are kept as-is
+        requested_targets = [
+            t if isinstance(t, ImpersonateTarget) else ImpersonateTarget.from_str(t)
+            for t in variadic(impersonate)
+        ] if impersonate else []
+
+        # Targets are tried in the order they were requested; None if no target is available
+        available_target = next(filter(self._impersonate_target_available, requested_targets), None)
+
+        return available_target, requested_targets
+
+    @staticmethod
+    def _unavailable_targets_message(requested_targets, note=None, is_error=False):
+        """Build the message shown when none of the requested impersonate targets is available
+
+        @param requested_targets  Iterable of requested targets; each is rendered with str()
+        @param note               Leading clause of the message; a falsy value selects the
+                                  default text about the extractor requesting impersonation
+                                  for this download
+        @param is_error           If true, the pointer to the documentation starts with "See";
+                                  otherwise with "If you encounter errors, then see"
+        @returns                  The assembled message string
+        """
+        note = note or 'The extractor specified to use impersonation for this download'
+        # Targets whose str() is empty are dropped from the listing
+        specific_targets = ', '.join(filter(None, map(str, requested_targets)))
+        # With nothing left to list, fall back to the generic wording
+        message = (
+            'no impersonate target is available' if not specific_targets
+            else f'none of these impersonate targets are available: {specific_targets}')
+        # NOTE(review): the double spaces around the URL are presumably deliberate
+        # so that the link stands out in terminal output -- confirm before "fixing"
+        return (
+            f'{note}, but {message}. {"See" if is_error else "If you encounter errors, then see"}'
+            f'  https://github.com/yt-dlp/yt-dlp#impersonation  '
+            f'for information on installing the required dependencies')
+
    def urlopen(self, req):
        """ Start an HTTP download """
        if isinstance(req, str):
--- a/yt_dlp/downloader/init.py
+++ b/yt_dlp/downloader/init.py
@ -99,7 +99,7 @@ def _get_suitable_downloader(info_dict, protocol, params, default):
    if external_downloader is None:
        if info_dict['to_stdout'] and FFmpegFD.can_merge_formats(info_dict, params):
            return FFmpegFD
-    elif external_downloader.lower() != 'native':
+    elif external_downloader.lower() != 'native' and info_dict.get('impersonate') is None:
        ed = get_external_downloader(external_downloader)
        if ed.can_download(info_dict, external_downloader):
            return ed
--- a/yt_dlp/downloader/http.py
+++ b/yt_dlp/downloader/http.py
@ -27,6 +27,9 @@ class HttpFD(FileDownloader):
    def real_download(self, filename, info_dict):
        url = info_dict['url']
        request_data = info_dict.get('request_data', None)
+        request_extensions = {}
+        if info_dict.get('impersonate') is not None:
+            request_extensions['impersonate'] = info_dict['impersonate']

        class DownloadContext(dict):
            __getattr__ = dict.get
@ -109,7 +112,7 @@ def establish_connection():
            if try_call(lambda: range_end >= ctx.content_len):
                range_end = ctx.content_len - 1

-            request = Request(url, request_data, headers)
+            request = Request(url, request_data, headers, extensions=request_extensions)
            has_range = range_start is not None
            if has_range:
                request.headers['Range'] = f'bytes={int(range_start)}-{int_or_none(range_end) or ""}'
--- a/yt_dlp/extractor/bandcamp.py
+++ b/yt_dlp/extractor/bandcamp.py
@ -7,6 +7,7 @@
 from ..utils import (
    KNOWN_EXTENSIONS,
    ExtractorError,
+    clean_html,
    extract_attributes,
    float_or_none,
    int_or_none,
@ -19,7 +20,7 @@
    url_or_none,
    urljoin,
 )
-from ..utils.traversal import find_element, traverse_obj
+from ..utils.traversal import find_element, find_elements, traverse_obj


 class BandcampIE(InfoExtractor):
@ -70,6 +71,9 @@ class BandcampIE(InfoExtractor):
            'album': 'FTL: Advanced Edition Soundtrack',
            'uploader_url': 'https://benprunty.bandcamp.com',
            'uploader_id': 'benprunty',
+            'tags': ['soundtrack', 'chiptunes', 'cinematic', 'electronic', 'video game music', 'California'],
+            'artists': ['Ben Prunty'],
+            'album_artists': ['Ben Prunty'],
        },
    }, {
        # no free download, mp3 128
@ -94,6 +98,9 @@ class BandcampIE(InfoExtractor):
            'album': 'Call of the Mastodon',
            'uploader_url': 'https://relapsealumni.bandcamp.com',
            'uploader_id': 'relapsealumni',
+            'tags': ['Philadelphia'],
+            'artists': ['Mastodon'],
+            'album_artists': ['Mastodon'],
        },
    }, {
        # track from compilation album (artist/album_artist difference)
@ -118,6 +125,9 @@ class BandcampIE(InfoExtractor):
            'album': 'DSK F/W 2016-2017 Free Compilation',
            'uploader_url': 'https://diskotopia.bandcamp.com',
            'uploader_id': 'diskotopia',
+            'tags': ['Japan'],
+            'artists': ['submerse'],
+            'album_artists': ['Diskotopia'],
        },
    }]

@ -252,6 +262,7 @@ def _real_extract(self, url):
            'album': embed.get('album_title'),
            'album_artist': album_artist,
            'formats': formats,
+            'tags': traverse_obj(webpage, ({find_elements(cls='tag')}, ..., {clean_html})),
        }


--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@ -38,7 +38,6 @@
    TransportError,
    network_exceptions,
 )
-from ..networking.impersonate import ImpersonateTarget
 from ..utils import (
    IDENTITY,
    JSON_LD_RE,
@ -259,6 +258,11 @@ class InfoExtractor:
                                 * key  The key (as hex) used to decrypt fragments.
                                        If `key` is given, any key URI will be ignored
                                 * iv   The IV (as hex) used to decrypt fragments
+                    * impersonate  Impersonate target(s). Can be any of the following entities:
+                                * an instance of yt_dlp.networking.impersonate.ImpersonateTarget
+                                * a string in the format of CLIENT[:OS]
+                                * a list or a tuple of CLIENT[:OS] strings or ImpersonateTarget instances
+                                * a boolean value; True means any impersonate target is sufficient
                    * downloader_options  A dictionary of downloader options
                                 (For internal use only)
                                 * http_chunk_size Chunk size for HTTP downloads
@ -336,6 +340,7 @@ class InfoExtractor:
                        * "name": Name or description of the subtitles
                        * "http_headers": A dictionary of additional HTTP headers
                                  to add to the request.
+                        * "impersonate": Impersonate target(s); same as the "formats" field
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles'; contains automatically generated
                    captions instead of normal subtitles
@ -884,26 +889,17 @@ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fa

        extensions = {}

-        if impersonate in (True, ''):
-            impersonate = ImpersonateTarget()
-        requested_targets = [
-            t if isinstance(t, ImpersonateTarget) else ImpersonateTarget.from_str(t)
-            for t in variadic(impersonate)
-        ] if impersonate else []
-
-        available_target = next(filter(self._downloader._impersonate_target_available, requested_targets), None)
+        available_target, requested_targets = self._downloader._parse_impersonate_targets(impersonate)
        if available_target:
            extensions['impersonate'] = available_target
        elif requested_targets:
-            message = 'The extractor is attempting impersonation, but '
-            message += (
-                'no impersonate target is available' if not str(impersonate)
-                else f'none of these impersonate targets are available: "{", ".join(map(str, requested_targets))}"')
-            info_msg = ('see  https://github.com/yt-dlp/yt-dlp#impersonation  '
-                        'for information on installing the required dependencies')
+            msg = 'The extractor is attempting impersonation'
            if require_impersonation:
-                raise ExtractorError(f'{message}; {info_msg}', expected=True)
-            self.report_warning(f'{message}; if you encounter errors, then {info_msg}', only_once=True)
+                raise ExtractorError(
+                    self._downloader._unavailable_targets_message(requested_targets, note=msg, is_error=True),
+                    expected=True)
+            self.report_warning(
+                self._downloader._unavailable_targets_message(requested_targets, note=msg), only_once=True)

        try:
            return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query, extensions))
--- a/yt_dlp/extractor/mlb.py
+++ b/yt_dlp/extractor/mlb.py
@ -457,12 +457,9 @@ def _extract_formats_and_subtitles(self, broadcast, video_id):
                self.report_warning(f'No formats available for {format_id} broadcast; skipping')
            return [], {}

-        cdn_headers = {'x-cdn-token': token}
        fmts, subs = self._extract_m3u8_formats_and_subtitles(
-            m3u8_url.replace(f'/{token}/', '/'), video_id, 'mp4',
-            m3u8_id=format_id, fatal=False, headers=cdn_headers)
+            m3u8_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)
        for fmt in fmts:
-            fmt['http_headers'] = cdn_headers
            fmt.setdefault('format_note', join_nonempty(feed, medium, delim=' '))
            fmt.setdefault('language', language)
            if fmt.get('vcodec') == 'none' and fmt['language'] == 'en':
--- a/yt_dlp/extractor/tenplay.py
+++ b/yt_dlp/extractor/tenplay.py
@ -7,11 +7,11 @@

 class TenPlayIE(InfoExtractor):
    IE_NAME = '10play'
-    _VALID_URL = r'https?://(?:www\.)?10play\.com\.au/(?:[^/?#]+/)+(?P<id>tpv\d{6}[a-z]{5})'
+    _VALID_URL = r'https?://(?:www\.)?10(?:play)?\.com\.au/(?:[^/?#]+/)+(?P<id>tpv\d{6}[a-z]{5})'
    _NETRC_MACHINE = '10play'
    _TESTS = [{
        # Geo-restricted to Australia
-        'url': 'https://10play.com.au/australian-survivor/web-extras/season-10-brains-v-brawn-ii/myless-journey/tpv250414jdmtf',
+        'url': 'https://10.com.au/australian-survivor/web-extras/season-10-brains-v-brawn-ii/myless-journey/tpv250414jdmtf',
        'info_dict': {
            'id': '7440980000013868',
            'ext': 'mp4',
@ -32,7 +32,7 @@ class TenPlayIE(InfoExtractor):
        'params': {'skip_download': 'm3u8'},
    }, {
        # Geo-restricted to Australia
-        'url': 'https://10play.com.au/neighbours/episodes/season-42/episode-9107/tpv240902nzqyp',
+        'url': 'https://10.com.au/neighbours/episodes/season-42/episode-9107/tpv240902nzqyp',
        'info_dict': {
            'id': '9000000000091177',
            'ext': 'mp4',
@ -55,7 +55,7 @@ class TenPlayIE(InfoExtractor):
        'params': {'skip_download': 'm3u8'},
    }, {
        # Geo-restricted to Australia; upgrading the m3u8 quality fails and we need the fallback
-        'url': 'https://10play.com.au/tiny-chef-show/episodes/season-1/episode-2/tpv240228pofvt',
+        'url': 'https://10.com.au/tiny-chef-show/episodes/season-1/episode-2/tpv240228pofvt',
        'info_dict': {
            'id': '9000000000084116',
            'ext': 'mp4',
@ -77,6 +77,7 @@ class TenPlayIE(InfoExtractor):
        },
        'params': {'skip_download': 'm3u8'},
        'expected_warnings': ['Failed to download m3u8 information: HTTP Error 502'],
+        'skip': 'video unavailable',
    }, {
        'url': 'https://10play.com.au/how-to-stay-married/web-extras/season-1/terrys-talks-ep-1-embracing-change/tpv190915ylupc',
        'only_matching': True,
@ -96,7 +97,7 @@ class TenPlayIE(InfoExtractor):
    def _real_extract(self, url):
        content_id = self._match_id(url)
        data = self._download_json(
-            'https://10play.com.au/api/v1/videos/' + content_id, content_id)
+            'https://10.com.au/api/v1/videos/' + content_id, content_id)

        video_data = self._download_json(
            f'https://vod.ten.com.au/api/videos/bcquery?command=find_videos_by_id&video_id={data["altId"]}',
@ -137,21 +138,24 @@ def _real_extract(self, url):

 class TenPlaySeasonIE(InfoExtractor):
    IE_NAME = '10play:season'
-    _VALID_URL = r'https?://(?:www\.)?10play\.com\.au/(?P<show>[^/?#]+)/episodes/(?P<season>[^/?#]+)/?(?:$|[?#])'
+    _VALID_URL = r'https?://(?:www\.)?10(?:play)?\.com\.au/(?P<show>[^/?#]+)/episodes/(?P<season>[^/?#]+)/?(?:$|[?#])'
    _TESTS = [{
-        'url': 'https://10play.com.au/masterchef/episodes/season-15',
+        'url': 'https://10.com.au/masterchef/episodes/season-15',
        'info_dict': {
            'title': 'Season 15',
            'id': 'MTQ2NjMxOQ==',
        },
        'playlist_mincount': 50,
    }, {
-        'url': 'https://10play.com.au/the-bold-and-the-beautiful-fast-tracked/episodes/season-2024',
+        'url': 'https://10.com.au/the-bold-and-the-beautiful-fast-tracked/episodes/season-2024',
        'info_dict': {
            'title': 'Season 2024',
            'id': 'Mjc0OTIw',
        },
        'playlist_mincount': 159,
+    }, {
+        'url': 'https://10play.com.au/the-bold-and-the-beautiful-fast-tracked/episodes/season-2024',
+        'only_matching': True,
    }]

    def _entries(self, load_more_url, display_id=None):
@ -172,7 +176,7 @@ def _entries(self, load_more_url, display_id=None):
    def _real_extract(self, url):
        show, season = self._match_valid_url(url).group('show', 'season')
        season_info = self._download_json(
-            f'https://10play.com.au/api/shows/{show}/episodes/{season}', f'{show}/{season}')
+            f'https://10.com.au/api/shows/{show}/episodes/{season}', f'{show}/{season}')

        episodes_carousel = traverse_obj(season_info, (
            'content', 0, 'components', (
--- a/yt_dlp/extractor/youtube/_video.py
+++ b/yt_dlp/extractor/youtube/_video.py
@ -4056,6 +4056,7 @@ def process_language(container, base_url, lang_code, sub_name, client_name, quer
                    'ext': fmt,
                    'url': urljoin('https://www.youtube.com', update_url_query(base_url, query)),
                    'name': sub_name,
+                    'impersonate': True,
                    STREAMING_DATA_CLIENT_NAME: client_name,
                })

--- a/yt_dlp/networking/_requests.py
+++ b/yt_dlp/networking/_requests.py
@ -313,7 +313,7 @@ def _create_instance(self, cookiejar, legacy_ssl_support=None):
            max_retries=urllib3.util.retry.Retry(False),
        )
        session.adapters.clear()
-        session.headers = requests.models.CaseInsensitiveDict({'Connection': 'keep-alive'})
+        session.headers = requests.models.CaseInsensitiveDict()
        session.mount('https://', http_adapter)
        session.mount('http://', http_adapter)
        session.cookies = cookiejar
@ -322,6 +322,7 @@ def _create_instance(self, cookiejar, legacy_ssl_support=None):

    def _prepare_headers(self, _, headers):
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
+        headers.setdefault('Connection', 'keep-alive')

    def _send(self, request):