
Merge branch 'yt-dlp:master' into pr/live-sections

Author: bashonly
Date:   2024-07-10 19:08:37 -05:00
18 changed files with 172 additions and 60 deletions

yt_dlp/YoutubeDL.py

@@ -2195,9 +2195,8 @@ class YoutubeDL:
                                    or all(f.get('acodec') == 'none' for f in formats)),  # OR, No formats with audio
         }))

-    def _default_format_spec(self, info_dict, download=True):
-        download = download and not self.params.get('simulate')
-        prefer_best = download and (
+    def _default_format_spec(self, info_dict):
+        prefer_best = (
             self.params['outtmpl']['default'] == '-'
             or info_dict.get('is_live') and not self.params.get('live_from_start'))
@@ -2205,7 +2204,7 @@ class YoutubeDL:
             merger = FFmpegMergerPP(self)
             return merger.available and merger.can_merge()

-        if not prefer_best and download and not can_merge():
+        if not prefer_best and not can_merge():
             prefer_best = True
         formats = self._get_formats(info_dict)
         evaluate_formats = lambda spec: self._select_formats(formats, self.build_format_selector(spec))
@@ -2964,7 +2963,7 @@ class YoutubeDL:
                 continue

             if format_selector is None:
-                req_format = self._default_format_spec(info_dict, download=download)
+                req_format = self._default_format_spec(info_dict)
                 self.write_debug(f'Default format spec: {req_format}')
                 format_selector = self.build_format_selector(req_format)
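
Note on the change above: dropping the download/simulate coupling means the default format spec is now computed identically whether yt-dlp downloads or only simulates. A minimal standalone sketch of the resulting condition (the function name and the simplification are mine, not the codebase's):

def prefers_best_single_format(params, info_dict, can_merge):
    # Streaming to stdout, or a live stream not downloaded from its start:
    # prefer a single pre-merged "best" format
    prefer_best = (
        params['outtmpl']['default'] == '-'
        or (info_dict.get('is_live') and not params.get('live_from_start')))
    # Without a usable ffmpeg merger, also fall back to a single format
    return bool(prefer_best or not can_merge)

print(prefers_best_single_format({'outtmpl': {'default': '-'}}, {}, True))  # True
print(prefers_best_single_format({'outtmpl': {'default': 'out.%(ext)s'}}, {'is_live': True}, True))  # True
print(prefers_best_single_format({'outtmpl': {'default': 'out.%(ext)s'}}, {}, True))  # False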

yt_dlp/extractor/box.py

@@ -12,7 +12,7 @@ from ..utils.traversal import traverse_obj

 class BoxIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:[^.]+\.)?app\.box\.com/s/(?P<shared_name>[^/?#]+)(?:/file/(?P<id>\d+))?'
+    _VALID_URL = r'https?://(?:[^.]+\.)?(?P<service>app|ent)\.box\.com/s/(?P<shared_name>[^/?#]+)(?:/file/(?P<id>\d+))?'
     _TESTS = [{
         'url': 'https://mlssoccer.app.box.com/s/0evd2o3e08l60lr4ygukepvnkord1o1x/file/510727257538',
         'md5': '1f81b2fd3960f38a40a3b8823e5fcd43',
@@ -38,10 +38,22 @@ class BoxIE(InfoExtractor):
             'uploader_id': '239068974',
         },
         'params': {'skip_download': 'dash fragment too small'},
+    }, {
+        'url': 'https://thejacksonlaboratory.ent.box.com/s/2x09dm6vcg6y28o0oox1so4l0t8wzt6l/file/1536173056065',
+        'info_dict': {
+            'id': '1536173056065',
+            'ext': 'mp4',
+            'uploader_id': '18523128264',
+            'uploader': 'Lexi Hennigan',
+            'title': 'iPSC Symposium recording part 1.mp4',
+            'timestamp': 1716228343,
+            'upload_date': '20240520',
+        },
+        'params': {'skip_download': 'dash fragment too small'},
     }]

     def _real_extract(self, url):
-        shared_name, file_id = self._match_valid_url(url).groups()
+        shared_name, file_id, service = self._match_valid_url(url).group('shared_name', 'id', 'service')
         webpage = self._download_webpage(url, file_id or shared_name)

         if not file_id:
@@ -57,14 +69,14 @@ class BoxIE(InfoExtractor):
         request_token = self._search_json(
             r'Box\.config\s*=', webpage, 'Box config', file_id)['requestToken']
         access_token = self._download_json(
-            'https://app.box.com/app-api/enduserapp/elements/tokens', file_id,
+            f'https://{service}.box.com/app-api/enduserapp/elements/tokens', file_id,
            'Downloading token JSON metadata',
            data=json.dumps({'fileIDs': [file_id]}).encode(), headers={
                'Content-Type': 'application/json',
                'X-Request-Token': request_token,
                'X-Box-EndUser-API': 'sharedName=' + shared_name,
            })[file_id]['read']
-        shared_link = 'https://app.box.com/s/' + shared_name
+        shared_link = f'https://{service}.box.com/s/{shared_name}'
         f = self._download_json(
             'https://api.box.com/2.0/files/' + file_id, file_id,
             'Downloading file JSON metadata', headers={
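
For illustration, the widened _VALID_URL above also accepts enterprise (ent.box.com) share links; a quick standalone check with Python's re module, using the two test URLs from this diff:

import re

_VALID_URL = r'https?://(?:[^.]+\.)?(?P<service>app|ent)\.box\.com/s/(?P<shared_name>[^/?#]+)(?:/file/(?P<id>\d+))?'

for url in (
    'https://mlssoccer.app.box.com/s/0evd2o3e08l60lr4ygukepvnkord1o1x/file/510727257538',
    'https://thejacksonlaboratory.ent.box.com/s/2x09dm6vcg6y28o0oox1so4l0t8wzt6l/file/1536173056065',
):
    mobj = re.match(_VALID_URL, url)
    print(mobj.group('service'), mobj.group('id'))
# -> app 510727257538
# -> ent 1536173056065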

yt_dlp/extractor/soundcloud.py

@@ -314,23 +314,11 @@ class SoundcloudBaseIE(InfoExtractor):
                 self.write_debug(f'"{identifier}" is not a requested format, skipping')
                 continue

-            stream = None
-            for retry in self.RetryManager(fatal=False):
-                try:
-                    stream = self._call_api(
-                        format_url, track_id, f'Downloading {identifier} format info JSON',
-                        query=query, headers=self._HEADERS)
-                except ExtractorError as e:
-                    if isinstance(e.cause, HTTPError) and e.cause.status == 429:
-                        self.report_warning(
-                            'You have reached the API rate limit, which is ~600 requests per '
-                            '10 minutes. Use the --extractor-retries and --retry-sleep options '
-                            'to configure an appropriate retry count and wait time', only_once=True)
-                        retry.error = e.cause
-                    else:
-                        self.report_warning(e.msg)
+            # XXX: if not extract_flat, 429 error must be caught where _extract_info_dict is called
+            stream_url = traverse_obj(self._call_api(
+                format_url, track_id, f'Downloading {identifier} format info JSON',
+                query=query, headers=self._HEADERS), ('url', {url_or_none}))

-            stream_url = traverse_obj(stream, ('url', {url_or_none}))
             if invalid_url(stream_url):
                 continue
             format_urls.add(stream_url)
@@ -647,7 +635,17 @@ class SoundcloudIE(SoundcloudBaseIE):
         info = self._call_api(
             info_json_url, full_title, 'Downloading info JSON', query=query, headers=self._HEADERS)

-        return self._extract_info_dict(info, full_title, token)
+        for retry in self.RetryManager():
+            try:
+                return self._extract_info_dict(info, full_title, token)
+            except ExtractorError as e:
+                if not isinstance(e.cause, HTTPError) or not e.cause.status == 429:
+                    raise
+                self.report_warning(
+                    'You have reached the API rate limit, which is ~600 requests per '
+                    '10 minutes. Use the --extractor-retries and --retry-sleep options '
+                    'to configure an appropriate retry count and wait time', only_once=True)
+                retry.error = e.cause


 class SoundcloudPlaylistBaseIE(SoundcloudBaseIE):
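
The warning text points users at the CLI retry options; when embedding yt-dlp, the equivalents should be the extractor_retries and retry_sleep_functions parameters (a hedged sketch; the option wiring is assumed from YoutubeDL's documented params, not shown in this diff):

import yt_dlp

ydl_opts = {
    'extractor_retries': 5,                                # retry extractor-level errors up to 5 times
    'retry_sleep_functions': {'extractor': lambda n: 60},  # wait 60 seconds between attempts
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download(['https://soundcloud.com/forss/flickermood'])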

yt_dlp/extractor/tiktok.py

@@ -1458,9 +1458,11 @@ class TikTokLiveIE(TikTokBaseIE):
         if webpage:
             data = self._get_sigi_state(webpage, uploader or room_id)
-            room_id = (traverse_obj(data, ('UserModule', 'users', ..., 'roomId', {str_or_none}), get_all=False)
-                       or self._search_regex(r'snssdk\d*://live\?room_id=(\d+)', webpage, 'room ID', default=None)
-                       or room_id)
+            room_id = (
+                traverse_obj(data, ((
+                    ('LiveRoom', 'liveRoomUserInfo', 'user'),
+                    ('UserModule', 'users', ...)), 'roomId', {str}, any))
+                or self._search_regex(r'snssdk\d*://live\?room_id=(\d+)', webpage, 'room ID', default=room_id))
             uploader = uploader or traverse_obj(
                 data, ('LiveRoom', 'liveRoomUserInfo', 'user', 'uniqueId'),
                 ('UserModule', 'users', ..., 'uniqueId'), get_all=False, expected_type=str)
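
In the reworked traversal above, the LiveRoom path is tried before the UserModule fallback, {str} filters out non-string values, and any collapses the branches to the first match. An illustrative run against assumed sample data:

from yt_dlp.utils.traversal import traverse_obj

# Assumed (synthetic) shape of the sigi state; LiveRoom wins when both exist
data = {
    'LiveRoom': {'liveRoomUserInfo': {'user': {'roomId': '7234567890123456789'}}},
    'UserModule': {'users': {'0': {'roomId': '7000000000000000000'}}},
}
print(traverse_obj(data, ((
    ('LiveRoom', 'liveRoomUserInfo', 'user'),
    ('UserModule', 'users', ...)), 'roomId', {str}, any)))
# -> '7234567890123456789'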

yt_dlp/extractor/tv5mondeplus.py

@@ -96,7 +96,7 @@ class TV5MondePlusIE(InfoExtractor):
     def _real_extract(self, url):
         display_id = self._match_id(url)
-        webpage = self._download_webpage(url, display_id)
+        webpage = self._download_webpage(url, display_id, impersonate=True)

         if ">Ce programme n'est malheureusement pas disponible pour votre zone géographique.<" in webpage:
             self.raise_geo_restricted(countries=['FR'])

yt_dlp/extractor/youtube.py

@@ -3159,7 +3159,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
     def _extract_n_function_name(self, jscode):
         funcname, idx = self._search_regex(
-            r'\.get\("n"\)\)&&\(b=(?P<nfunc>[a-zA-Z0-9$]+)(?:\[(?P<idx>\d+)\])?\([a-zA-Z0-9]\)',
+            r'''(?x)(?:\.get\("n"\)\)&&\(b=|b=String\.fromCharCode\(110\),c=a\.get\(b\)\)&&\(c=)
+            (?P<nfunc>[a-zA-Z0-9$]+)(?:\[(?P<idx>\d+)\])?\([a-zA-Z0-9]\)''',
             jscode, 'Initial JS player n function name', group=('nfunc', 'idx'))
         if not idx:
             return funcname
@@ -3170,7 +3171,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
     def _extract_n_function_code(self, video_id, player_url):
         player_id = self._extract_player_info(player_url)
-        func_code = self.cache.load('youtube-nsig', player_id, min_ver='2022.09.1')
+        func_code = self.cache.load('youtube-nsig', player_id, min_ver='2024.07.09')
         jscode = func_code or self._load_player(video_id, player_url)
         jsi = JSInterpreter(jscode)
@@ -3179,17 +3180,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         func_name = self._extract_n_function_name(jscode)

-        # For redundancy
-        func_code = self._search_regex(
-            rf'''(?xs){func_name}\s*=\s*function\s*\((?P<var>[\w$]+)\)\s*
-                     # NB: The end of the regex is intentionally kept strict
-                     {{(?P<code>.+?}}\s*return\ [\w$]+.join\(""\))}};''',
-            jscode, 'nsig function', group=('var', 'code'), default=None)
-        if func_code:
-            func_code = ([func_code[0]], func_code[1])
-        else:
-            self.write_debug('Extracting nsig function with jsinterp')
-            func_code = jsi.extract_function_code(func_name)
+        func_code = jsi.extract_function_code(func_name)
         self.cache.store('youtube-nsig', player_id, func_code)
         return jsi, player_id, func_code
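
Two notes on the hunks above: the name regex gains an alternative for players that look the parameter up via String.fromCharCode(110), and bumping min_ver to 2024.07.09 discards nsig code cached by older yt-dlp versions, which may have been extracted with the now-removed "redundant" regex. A small self-check of the new pattern; the sample JS line is synthetic, not from a real player:

import re

pattern = r'''(?x)(?:\.get\("n"\)\)&&\(b=|b=String\.fromCharCode\(110\),c=a\.get\(b\)\)&&\(c=)
    (?P<nfunc>[a-zA-Z0-9$]+)(?:\[(?P<idx>\d+)\])?\([a-zA-Z0-9]\)'''
sample = 'b=String.fromCharCode(110),c=a.get(b))&&(c=Xy[0](c),a.set(b,c))'
mobj = re.search(pattern, sample)
print(mobj.group('nfunc'), mobj.group('idx'))  # -> Xy 0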

yt_dlp/jsinterp.py

@@ -636,6 +636,8 @@ class JSInterpreter:
                 raise self.Exception(f'{member} {msg}', expr)

         def eval_method():
             nonlocal member
+            if (variable, member) == ('console', 'debug'):
+                if Debugger.ENABLED:
+                    Debugger.write(self.interpret_expression(f'[{arg_str}]', local_vars, allow_recursion))
@@ -644,6 +646,7 @@ class JSInterpreter:
             types = {
                 'String': str,
                 'Math': float,
+                'Array': list,
             }
             obj = local_vars.get(variable, types.get(variable, NO_DEFAULT))
             if obj is NO_DEFAULT:
@@ -667,6 +670,21 @@ class JSInterpreter:
                 self.interpret_expression(v, local_vars, allow_recursion)
                 for v in self._separate(arg_str)]

+            # Fixup prototype call
+            if isinstance(obj, type) and member.startswith('prototype.'):
+                new_member, _, func_prototype = member.partition('.')[2].partition('.')
+                assertion(argvals, 'takes one or more arguments')
+                assertion(isinstance(argvals[0], obj), f'needs binding to type {obj}')
+                if func_prototype == 'call':
+                    obj, *argvals = argvals
+                elif func_prototype == 'apply':
+                    assertion(len(argvals) == 2, 'takes two arguments')
+                    obj, argvals = argvals
+                    assertion(isinstance(argvals, list), 'second argument needs to be a list')
+                else:
+                    raise self.Exception(f'Unsupported Function method {func_prototype}', expr)
+                member = new_member
+
             if obj is str:
                 if member == 'fromCharCode':
                     assertion(argvals, 'takes one or more arguments')
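
The new fixup rewrites calls such as String.prototype.split.call(s, ",") into a plain method call on the bound first argument. A standalone re-tracing of just that bookkeeping (assertions simplified to bare asserts; this is a sketch, not the interpreter itself):

def fixup_prototype_call(obj, member, argvals):
    # e.g. obj=str, member='prototype.split.call', argvals=['a,b,c', ',']
    new_member, _, func_prototype = member.partition('.')[2].partition('.')
    assert argvals, 'takes one or more arguments'
    assert isinstance(argvals[0], obj), f'needs binding to type {obj}'
    if func_prototype == 'call':
        obj, *argvals = argvals       # the first argument becomes `this`
    elif func_prototype == 'apply':
        assert len(argvals) == 2, 'takes two arguments'
        obj, argvals = argvals        # the second argument is the argument list
        assert isinstance(argvals, list), 'second argument needs to be a list'
    else:
        raise ValueError(f'Unsupported Function method {func_prototype}')
    return obj, new_member, argvals

print(fixup_prototype_call(str, 'prototype.split.call', ['a,b,c', ',']))
# -> ('a,b,c', 'split', [','])
print(fixup_prototype_call(str, 'prototype.split.apply', ['a,b,c', [',']]))
# -> ('a,b,c', 'split', [','])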

yt_dlp/networking/_curlcffi.py

@@ -2,6 +2,7 @@ from __future__ import annotations

 import io
 import math
+import re
 import urllib.parse

 from ._helper import InstanceStoreMixin, select_proxy
@@ -27,11 +28,12 @@ from ..utils import int_or_none
 if curl_cffi is None:
     raise ImportError('curl_cffi is not installed')

-curl_cffi_version = tuple(int_or_none(x, default=0) for x in curl_cffi.__version__.split('.'))
+curl_cffi_version = tuple(map(int, re.split(r'[^\d]+', curl_cffi.__version__)[:3]))

-if curl_cffi_version != (0, 5, 10):
+if curl_cffi_version != (0, 5, 10) and not ((0, 7, 0) <= curl_cffi_version < (0, 8, 0)):
     curl_cffi._yt_dlp__version = f'{curl_cffi.__version__} (unsupported)'
-    raise ImportError('Only curl_cffi 0.5.10 is supported')
+    raise ImportError('Only curl_cffi versions 0.5.10, 0.7.X are supported')

 import curl_cffi.requests
 from curl_cffi.const import CurlECode, CurlOpt
@@ -110,6 +112,13 @@ class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin):
     _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
     _SUPPORTED_PROXY_SCHEMES = ('http', 'https', 'socks4', 'socks4a', 'socks5', 'socks5h')
     _SUPPORTED_IMPERSONATE_TARGET_MAP = {
+        **({
+            ImpersonateTarget('chrome', '124', 'macos', '14'): curl_cffi.requests.BrowserType.chrome124,
+            ImpersonateTarget('chrome', '123', 'macos', '14'): curl_cffi.requests.BrowserType.chrome123,
+            ImpersonateTarget('chrome', '120', 'macos', '14'): curl_cffi.requests.BrowserType.chrome120,
+            ImpersonateTarget('chrome', '119', 'macos', '14'): curl_cffi.requests.BrowserType.chrome119,
+            ImpersonateTarget('chrome', '116', 'windows', '10'): curl_cffi.requests.BrowserType.chrome116,
+        } if curl_cffi_version >= (0, 7, 0) else {}),
         ImpersonateTarget('chrome', '110', 'windows', '10'): curl_cffi.requests.BrowserType.chrome110,
         ImpersonateTarget('chrome', '107', 'windows', '10'): curl_cffi.requests.BrowserType.chrome107,
         ImpersonateTarget('chrome', '104', 'windows', '10'): curl_cffi.requests.BrowserType.chrome104,
@@ -118,9 +127,15 @@ class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin):
         ImpersonateTarget('chrome', '99', 'windows', '10'): curl_cffi.requests.BrowserType.chrome99,
         ImpersonateTarget('edge', '101', 'windows', '10'): curl_cffi.requests.BrowserType.edge101,
         ImpersonateTarget('edge', '99', 'windows', '10'): curl_cffi.requests.BrowserType.edge99,
+        **({
+            ImpersonateTarget('safari', '17.0', 'macos', '14'): curl_cffi.requests.BrowserType.safari17_0,
+        } if curl_cffi_version >= (0, 7, 0) else {}),
         ImpersonateTarget('safari', '15.5', 'macos', '12'): curl_cffi.requests.BrowserType.safari15_5,
         ImpersonateTarget('safari', '15.3', 'macos', '11'): curl_cffi.requests.BrowserType.safari15_3,
         ImpersonateTarget('chrome', '99', 'android', '12'): curl_cffi.requests.BrowserType.chrome99_android,
+        **({
+            ImpersonateTarget('safari', '17.2', 'ios', '17.2'): curl_cffi.requests.BrowserType.safari17_2_ios,
+        } if curl_cffi_version >= (0, 7, 0) else {}),
     }

     def _create_instance(self, cookiejar=None):
@@ -187,7 +202,7 @@ class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin):
         timeout = self._calculate_timeout(request)

         # set CURLOPT_LOW_SPEED_LIMIT and CURLOPT_LOW_SPEED_TIME to act as a read timeout. [1]
-        # curl_cffi does not currently do this. [2]
+        # This is required only for 0.5.10 [2]
         # Note: CURLOPT_LOW_SPEED_TIME is in seconds, so we need to round up to the nearest second. [3]
         # [1] https://unix.stackexchange.com/a/305311
         # [2] https://github.com/yifeikong/curl_cffi/issues/156
@@ -203,7 +218,7 @@ class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin):
             data=request.data,
             verify=self.verify,
             max_redirects=5,
-            timeout=timeout,
+            timeout=(timeout, timeout),
             impersonate=self._SUPPORTED_IMPERSONATE_TARGET_MAP.get(
                 self._get_request_target(request)),
             interface=self.source_address,
@@ -222,7 +237,7 @@ class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin):
             elif (
                 e.code == CurlECode.PROXY
-                or (e.code == CurlECode.RECV_ERROR and 'Received HTTP code 407 from proxy after CONNECT' in str(e))
+                or (e.code == CurlECode.RECV_ERROR and 'CONNECT' in str(e))
             ):
                 raise ProxyError(cause=e) from e
             else:
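
The rewritten version check reduces any curl_cffi version string to its first three numeric components, so prerelease tags such as 0.7.0b4 parse cleanly; a quick standalone check of the parsing and the acceptance window:

import re

for version in ('0.5.10', '0.7.0b4', '0.8.0'):
    parsed = tuple(map(int, re.split(r'[^\d]+', version)[:3]))
    supported = parsed == (0, 5, 10) or (0, 7, 0) <= parsed < (0, 8, 0)
    print(version, parsed, supported)
# 0.5.10 (0, 5, 10) True
# 0.7.0b4 (0, 7, 0) True
# 0.8.0 (0, 8, 0) False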

yt_dlp/version.py

@@ -1,8 +1,8 @@
 # Autogenerated by devscripts/update-version.py

-__version__ = '2024.07.07'
+__version__ = '2024.07.09'

-RELEASE_GIT_HEAD = 'b337d2989ce0614651d363383f6f743d977248ef'
+RELEASE_GIT_HEAD = '7ead7332af69422cee931aec3faa277288e9e212'

 VARIANT = None
@@ -12,4 +12,4 @@ CHANNEL = 'stable'
 ORIGIN = 'yt-dlp/yt-dlp'

-_pkg_version = '2024.07.07'
+_pkg_version = '2024.07.09'