
Merge branch 'yt-dlp:master' into pr/live-sections

Author: bashonly
Date:   2024-07-10 19:08:37 -05:00
18 changed files with 172 additions and 60 deletions

yt_dlp/YoutubeDL.py

@@ -2195,9 +2195,8 @@ class YoutubeDL:
                                    or all(f.get('acodec') == 'none' for f in formats)),  # OR, No formats with audio
         }))

-    def _default_format_spec(self, info_dict, download=True):
-        download = download and not self.params.get('simulate')
-        prefer_best = download and (
+    def _default_format_spec(self, info_dict):
+        prefer_best = (
             self.params['outtmpl']['default'] == '-'
             or info_dict.get('is_live') and not self.params.get('live_from_start'))
@@ -2205,7 +2204,7 @@ class YoutubeDL:
             merger = FFmpegMergerPP(self)
             return merger.available and merger.can_merge()

-        if not prefer_best and download and not can_merge():
+        if not prefer_best and not can_merge():
             prefer_best = True
         formats = self._get_formats(info_dict)
         evaluate_formats = lambda spec: self._select_formats(formats, self.build_format_selector(spec))
@@ -2964,7 +2963,7 @@ class YoutubeDL:
                 continue

             if format_selector is None:
-                req_format = self._default_format_spec(info_dict, download=download)
+                req_format = self._default_format_spec(info_dict)
                 self.write_debug(f'Default format spec: {req_format}')
                 format_selector = self.build_format_selector(req_format)
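
Note on the change above: dropping the download/simulate coupling means the default format spec is now computed identically whether yt-dlp downloads or only simulates. A minimal standalone sketch of the resulting condition (the function name and the simplification are mine, not the codebase's):

def prefers_best_single_format(params, info_dict, can_merge):
    # Streaming to stdout, or a live stream not downloaded from its start:
    # prefer a single pre-merged "best" format
    prefer_best = (
        params['outtmpl']['default'] == '-'
        or (info_dict.get('is_live') and not params.get('live_from_start')))
    # Without a usable ffmpeg merger, also fall back to a single format
    return bool(prefer_best or not can_merge)

print(prefers_best_single_format({'outtmpl': {'default': '-'}}, {}, True))  # True
print(prefers_best_single_format({'outtmpl': {'default': 'out.%(ext)s'}}, {'is_live': True}, True))  # True
print(prefers_best_single_format({'outtmpl': {'default': 'out.%(ext)s'}}, {}, True))  # False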

yt_dlp/extractor/box.py

@@ -12,7 +12,7 @@ from ..utils.traversal import traverse_obj

 class BoxIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:[^.]+\.)?app\.box\.com/s/(?P<shared_name>[^/?#]+)(?:/file/(?P<id>\d+))?'
+    _VALID_URL = r'https?://(?:[^.]+\.)?(?P<service>app|ent)\.box\.com/s/(?P<shared_name>[^/?#]+)(?:/file/(?P<id>\d+))?'
     _TESTS = [{
         'url': 'https://mlssoccer.app.box.com/s/0evd2o3e08l60lr4ygukepvnkord1o1x/file/510727257538',
         'md5': '1f81b2fd3960f38a40a3b8823e5fcd43',
@@ -38,10 +38,22 @@ class BoxIE(InfoExtractor):
             'uploader_id': '239068974',
         },
         'params': {'skip_download': 'dash fragment too small'},
+    }, {
+        'url': 'https://thejacksonlaboratory.ent.box.com/s/2x09dm6vcg6y28o0oox1so4l0t8wzt6l/file/1536173056065',
+        'info_dict': {
+            'id': '1536173056065',
+            'ext': 'mp4',
+            'uploader_id': '18523128264',
+            'uploader': 'Lexi Hennigan',
+            'title': 'iPSC Symposium recording part 1.mp4',
+            'timestamp': 1716228343,
+            'upload_date': '20240520',
+        },
+        'params': {'skip_download': 'dash fragment too small'},
     }]

     def _real_extract(self, url):
-        shared_name, file_id = self._match_valid_url(url).groups()
+        shared_name, file_id, service = self._match_valid_url(url).group('shared_name', 'id', 'service')
         webpage = self._download_webpage(url, file_id or shared_name)

         if not file_id:
@@ -57,14 +69,14 @@ class BoxIE(InfoExtractor):
         request_token = self._search_json(
             r'Box\.config\s*=', webpage, 'Box config', file_id)['requestToken']
         access_token = self._download_json(
-            'https://app.box.com/app-api/enduserapp/elements/tokens', file_id,
+            f'https://{service}.box.com/app-api/enduserapp/elements/tokens', file_id,
            'Downloading token JSON metadata',
            data=json.dumps({'fileIDs': [file_id]}).encode(), headers={
                'Content-Type': 'application/json',
                'X-Request-Token': request_token,
                'X-Box-EndUser-API': 'sharedName=' + shared_name,
            })[file_id]['read']
-        shared_link = 'https://app.box.com/s/' + shared_name
+        shared_link = f'https://{service}.box.com/s/{shared_name}'
         f = self._download_json(
             'https://api.box.com/2.0/files/' + file_id, file_id,
             'Downloading file JSON metadata', headers={
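
For illustration, the widened _VALID_URL above also accepts enterprise (ent.box.com) share links; a quick standalone check with Python's re module, using the two test URLs from this diff:

import re

_VALID_URL = r'https?://(?:[^.]+\.)?(?P<service>app|ent)\.box\.com/s/(?P<shared_name>[^/?#]+)(?:/file/(?P<id>\d+))?'

for url in (
    'https://mlssoccer.app.box.com/s/0evd2o3e08l60lr4ygukepvnkord1o1x/file/510727257538',
    'https://thejacksonlaboratory.ent.box.com/s/2x09dm6vcg6y28o0oox1so4l0t8wzt6l/file/1536173056065',
):
    mobj = re.match(_VALID_URL, url)
    print(mobj.group('service'), mobj.group('id'))
# -> app 510727257538
# -> ent 1536173056065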

yt_dlp/extractor/soundcloud.py

@@ -314,23 +314,11 @@ class SoundcloudBaseIE(InfoExtractor):
                 self.write_debug(f'"{identifier}" is not a requested format, skipping')
                 continue

-            stream = None
-            for retry in self.RetryManager(fatal=False):
-                try:
-                    stream = self._call_api(
-                        format_url, track_id, f'Downloading {identifier} format info JSON',
-                        query=query, headers=self._HEADERS)
-                except ExtractorError as e:
-                    if isinstance(e.cause, HTTPError) and e.cause.status == 429:
-                        self.report_warning(
-                            'You have reached the API rate limit, which is ~600 requests per '
-                            '10 minutes. Use the --extractor-retries and --retry-sleep options '
-                            'to configure an appropriate retry count and wait time', only_once=True)
-                        retry.error = e.cause
-                    else:
-                        self.report_warning(e.msg)
+            # XXX: if not extract_flat, 429 error must be caught where _extract_info_dict is called
+            stream_url = traverse_obj(self._call_api(
+                format_url, track_id, f'Downloading {identifier} format info JSON',
+                query=query, headers=self._HEADERS), ('url', {url_or_none}))

-            stream_url = traverse_obj(stream, ('url', {url_or_none}))
             if invalid_url(stream_url):
                 continue
             format_urls.add(stream_url)
@@ -647,7 +635,17 @@ class SoundcloudIE(SoundcloudBaseIE):
         info = self._call_api(
             info_json_url, full_title, 'Downloading info JSON', query=query, headers=self._HEADERS)

-        return self._extract_info_dict(info, full_title, token)
+        for retry in self.RetryManager():
+            try:
+                return self._extract_info_dict(info, full_title, token)
+            except ExtractorError as e:
+                if not isinstance(e.cause, HTTPError) or not e.cause.status == 429:
+                    raise
+                self.report_warning(
+                    'You have reached the API rate limit, which is ~600 requests per '
+                    '10 minutes. Use the --extractor-retries and --retry-sleep options '
+                    'to configure an appropriate retry count and wait time', only_once=True)
+                retry.error = e.cause


 class SoundcloudPlaylistBaseIE(SoundcloudBaseIE):
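
The warning text points users at the CLI retry options; when embedding yt-dlp, the equivalents should be the extractor_retries and retry_sleep_functions parameters (a hedged sketch; the option wiring is assumed from YoutubeDL's documented params, not shown in this diff):

import yt_dlp

ydl_opts = {
    'extractor_retries': 5,                                # retry extractor-level errors up to 5 times
    'retry_sleep_functions': {'extractor': lambda n: 60},  # wait 60 seconds between attempts
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download(['https://soundcloud.com/forss/flickermood'])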

yt_dlp/extractor/tiktok.py

@@ -1458,9 +1458,11 @@ class TikTokLiveIE(TikTokBaseIE):
         if webpage:
             data = self._get_sigi_state(webpage, uploader or room_id)
-            room_id = (traverse_obj(data, ('UserModule', 'users', ..., 'roomId', {str_or_none}), get_all=False)
-                       or self._search_regex(r'snssdk\d*://live\?room_id=(\d+)', webpage, 'room ID', default=None)
-                       or room_id)
+            room_id = (
+                traverse_obj(data, ((
+                    ('LiveRoom', 'liveRoomUserInfo', 'user'),
+                    ('UserModule', 'users', ...)), 'roomId', {str}, any))
+                or self._search_regex(r'snssdk\d*://live\?room_id=(\d+)', webpage, 'room ID', default=room_id))
             uploader = uploader or traverse_obj(
                 data, ('LiveRoom', 'liveRoomUserInfo', 'user', 'uniqueId'),
                 ('UserModule', 'users', ..., 'uniqueId'), get_all=False, expected_type=str)
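
In the reworked traversal above, the LiveRoom path is tried before the UserModule fallback, {str} filters out non-string values, and any collapses the branches to the first match. An illustrative run against assumed sample data:

from yt_dlp.utils.traversal import traverse_obj

# Assumed (synthetic) shape of the sigi state; LiveRoom wins when both exist
data = {
    'LiveRoom': {'liveRoomUserInfo': {'user': {'roomId': '7234567890123456789'}}},
    'UserModule': {'users': {'0': {'roomId': '7000000000000000000'}}},
}
print(traverse_obj(data, ((
    ('LiveRoom', 'liveRoomUserInfo', 'user'),
    ('UserModule', 'users', ...)), 'roomId', {str}, any)))
# -> '7234567890123456789'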

yt_dlp/extractor/tv5mondeplus.py

@@ -96,7 +96,7 @@ class TV5MondePlusIE(InfoExtractor):
     def _real_extract(self, url):
         display_id = self._match_id(url)
-        webpage = self._download_webpage(url, display_id)
+        webpage = self._download_webpage(url, display_id, impersonate=True)

         if ">Ce programme n'est malheureusement pas disponible pour votre zone géographique.<" in webpage:
             self.raise_geo_restricted(countries=['FR'])

yt_dlp/extractor/youtube.py

@@ -3159,7 +3159,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
     def _extract_n_function_name(self, jscode):
         funcname, idx = self._search_regex(
-            r'\.get\("n"\)\)&&\(b=(?P<nfunc>[a-zA-Z0-9$]+)(?:\[(?P<idx>\d+)\])?\([a-zA-Z0-9]\)',
+            r'''(?x)(?:\.get\("n"\)\)&&\(b=|b=String\.fromCharCode\(110\),c=a\.get\(b\)\)&&\(c=)
+            (?P<nfunc>[a-zA-Z0-9$]+)(?:\[(?P<idx>\d+)\])?\([a-zA-Z0-9]\)''',
             jscode, 'Initial JS player n function name', group=('nfunc', 'idx'))
         if not idx:
             return funcname
@@ -3170,7 +3171,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
     def _extract_n_function_code(self, video_id, player_url):
         player_id = self._extract_player_info(player_url)
-        func_code = self.cache.load('youtube-nsig', player_id, min_ver='2022.09.1')
+        func_code = self.cache.load('youtube-nsig', player_id, min_ver='2024.07.09')
         jscode = func_code or self._load_player(video_id, player_url)
         jsi = JSInterpreter(jscode)
@@ -3179,17 +3180,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         func_name = self._extract_n_function_name(jscode)

-        # For redundancy
-        func_code = self._search_regex(
-            rf'''(?xs){func_name}\s*=\s*function\s*\((?P<var>[\w$]+)\)\s*
-                     # NB: The end of the regex is intentionally kept strict
-                     {{(?P<code>.+?}}\s*return\ [\w$]+.join\(""\))}};''',
-            jscode, 'nsig function', group=('var', 'code'), default=None)
-        if func_code:
-            func_code = ([func_code[0]], func_code[1])
-        else:
-            self.write_debug('Extracting nsig function with jsinterp')
-            func_code = jsi.extract_function_code(func_name)
+        func_code = jsi.extract_function_code(func_name)
         self.cache.store('youtube-nsig', player_id, func_code)
         return jsi, player_id, func_code
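
Two notes on the hunks above: the name regex gains an alternative for players that look the parameter up via String.fromCharCode(110), and bumping min_ver to 2024.07.09 discards nsig code cached by older yt-dlp versions, which may have been extracted with the now-removed "redundant" regex. A small self-check of the new pattern; the sample JS line is synthetic, not from a real player:

import re

pattern = r'''(?x)(?:\.get\("n"\)\)&&\(b=|b=String\.fromCharCode\(110\),c=a\.get\(b\)\)&&\(c=)
    (?P<nfunc>[a-zA-Z0-9$]+)(?:\[(?P<idx>\d+)\])?\([a-zA-Z0-9]\)'''
sample = 'b=String.fromCharCode(110),c=a.get(b))&&(c=Xy[0](c),a.set(b,c))'
mobj = re.search(pattern, sample)
print(mobj.group('nfunc'), mobj.group('idx'))  # -> Xy 0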

yt_dlp/jsinterp.py

@@ -636,6 +636,8 @@ class JSInterpreter:
                 raise self.Exception(f'{member} {msg}', expr)

         def eval_method():
             nonlocal member
+            if (variable, member) == ('console', 'debug'):
+                if Debugger.ENABLED:
+                    Debugger.write(self.interpret_expression(f'[{arg_str}]', local_vars, allow_recursion))
@@ -644,6 +646,7 @@ class JSInterpreter:
             types = {
                 'String': str,
                 'Math': float,
+                'Array': list,
             }
             obj = local_vars.get(variable, types.get(variable, NO_DEFAULT))
             if obj is NO_DEFAULT:
@@ -667,6 +670,21 @@ class JSInterpreter:
                 self.interpret_expression(v, local_vars, allow_recursion)
                 for v in self._separate(arg_str)]

+            # Fixup prototype call
+            if isinstance(obj, type) and member.startswith('prototype.'):
+                new_member, _, func_prototype = member.partition('.')[2].partition('.')
+                assertion(argvals, 'takes one or more arguments')
+                assertion(isinstance(argvals[0], obj), f'needs binding to type {obj}')
+                if func_prototype == 'call':
+                    obj, *argvals = argvals
+                elif func_prototype == 'apply':
+                    assertion(len(argvals) == 2, 'takes two arguments')
+                    obj, argvals = argvals
+                    assertion(isinstance(argvals, list), 'second argument needs to be a list')
+                else:
+                    raise self.Exception(f'Unsupported Function method {func_prototype}', expr)
+                member = new_member
+
             if obj is str:
                 if member == 'fromCharCode':
                     assertion(argvals, 'takes one or more arguments')
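
The new fixup rewrites calls such as String.prototype.split.call(s, ",") into a plain method call on the bound first argument. A standalone re-tracing of just that bookkeeping (assertions simplified to bare asserts; this is a sketch, not the interpreter itself):

def fixup_prototype_call(obj, member, argvals):
    # e.g. obj=str, member='prototype.split.call', argvals=['a,b,c', ',']
    new_member, _, func_prototype = member.partition('.')[2].partition('.')
    assert argvals, 'takes one or more arguments'
    assert isinstance(argvals[0], obj), f'needs binding to type {obj}'
    if func_prototype == 'call':
        obj, *argvals = argvals       # the first argument becomes `this`
    elif func_prototype == 'apply':
        assert len(argvals) == 2, 'takes two arguments'
        obj, argvals = argvals        # the second argument is the argument list
        assert isinstance(argvals, list), 'second argument needs to be a list'
    else:
        raise ValueError(f'Unsupported Function method {func_prototype}')
    return obj, new_member, argvals

print(fixup_prototype_call(str, 'prototype.split.call', ['a,b,c', ',']))
# -> ('a,b,c', 'split', [','])
print(fixup_prototype_call(str, 'prototype.split.apply', ['a,b,c', [',']]))
# -> ('a,b,c', 'split', [','])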

yt_dlp/networking/_curlcffi.py

@@ -2,6 +2,7 @@ from __future__ import annotations

 import io
 import math
+import re
 import urllib.parse

 from ._helper import InstanceStoreMixin, select_proxy
@@ -27,11 +28,12 @@ from ..utils import int_or_none
 if curl_cffi is None:
     raise ImportError('curl_cffi is not installed')

-curl_cffi_version = tuple(int_or_none(x, default=0) for x in curl_cffi.__version__.split('.'))
+curl_cffi_version = tuple(map(int, re.split(r'[^\d]+', curl_cffi.__version__)[:3]))

-if curl_cffi_version != (0, 5, 10):
+if curl_cffi_version != (0, 5, 10) and not ((0, 7, 0) <= curl_cffi_version < (0, 8, 0)):
     curl_cffi._yt_dlp__version = f'{curl_cffi.__version__} (unsupported)'
-    raise ImportError('Only curl_cffi 0.5.10 is supported')
+    raise ImportError('Only curl_cffi versions 0.5.10, 0.7.X are supported')

 import curl_cffi.requests
 from curl_cffi.const import CurlECode, CurlOpt
@@ -110,6 +112,13 @@ class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin):
     _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
     _SUPPORTED_PROXY_SCHEMES = ('http', 'https', 'socks4', 'socks4a', 'socks5', 'socks5h')
     _SUPPORTED_IMPERSONATE_TARGET_MAP = {
+        **({
+            ImpersonateTarget('chrome', '124', 'macos', '14'): curl_cffi.requests.BrowserType.chrome124,
+            ImpersonateTarget('chrome', '123', 'macos', '14'): curl_cffi.requests.BrowserType.chrome123,
+            ImpersonateTarget('chrome', '120', 'macos', '14'): curl_cffi.requests.BrowserType.chrome120,
+            ImpersonateTarget('chrome', '119', 'macos', '14'): curl_cffi.requests.BrowserType.chrome119,
+            ImpersonateTarget('chrome', '116', 'windows', '10'): curl_cffi.requests.BrowserType.chrome116,
+        } if curl_cffi_version >= (0, 7, 0) else {}),
         ImpersonateTarget('chrome', '110', 'windows', '10'): curl_cffi.requests.BrowserType.chrome110,
         ImpersonateTarget('chrome', '107', 'windows', '10'): curl_cffi.requests.BrowserType.chrome107,
         ImpersonateTarget('chrome', '104', 'windows', '10'): curl_cffi.requests.BrowserType.chrome104,
@@ -118,9 +127,15 @@ class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin):
         ImpersonateTarget('chrome', '99', 'windows', '10'): curl_cffi.requests.BrowserType.chrome99,
         ImpersonateTarget('edge', '101', 'windows', '10'): curl_cffi.requests.BrowserType.edge101,
         ImpersonateTarget('edge', '99', 'windows', '10'): curl_cffi.requests.BrowserType.edge99,
+        **({
+            ImpersonateTarget('safari', '17.0', 'macos', '14'): curl_cffi.requests.BrowserType.safari17_0,
+        } if curl_cffi_version >= (0, 7, 0) else {}),
         ImpersonateTarget('safari', '15.5', 'macos', '12'): curl_cffi.requests.BrowserType.safari15_5,
         ImpersonateTarget('safari', '15.3', 'macos', '11'): curl_cffi.requests.BrowserType.safari15_3,
         ImpersonateTarget('chrome', '99', 'android', '12'): curl_cffi.requests.BrowserType.chrome99_android,
+        **({
+            ImpersonateTarget('safari', '17.2', 'ios', '17.2'): curl_cffi.requests.BrowserType.safari17_2_ios,
+        } if curl_cffi_version >= (0, 7, 0) else {}),
     }

     def _create_instance(self, cookiejar=None):
@@ -187,7 +202,7 @@ class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin):
         timeout = self._calculate_timeout(request)

         # set CURLOPT_LOW_SPEED_LIMIT and CURLOPT_LOW_SPEED_TIME to act as a read timeout. [1]
-        # curl_cffi does not currently do this. [2]
+        # This is required only for 0.5.10 [2]
         # Note: CURLOPT_LOW_SPEED_TIME is in seconds, so we need to round up to the nearest second. [3]
         # [1] https://unix.stackexchange.com/a/305311
         # [2] https://github.com/yifeikong/curl_cffi/issues/156
@@ -203,7 +218,7 @@ class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin):
             data=request.data,
             verify=self.verify,
             max_redirects=5,
-            timeout=timeout,
+            timeout=(timeout, timeout),
             impersonate=self._SUPPORTED_IMPERSONATE_TARGET_MAP.get(
                 self._get_request_target(request)),
             interface=self.source_address,
@@ -222,7 +237,7 @@ class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin):
             elif (
                 e.code == CurlECode.PROXY
-                or (e.code == CurlECode.RECV_ERROR and 'Received HTTP code 407 from proxy after CONNECT' in str(e))
+                or (e.code == CurlECode.RECV_ERROR and 'CONNECT' in str(e))
             ):
                 raise ProxyError(cause=e) from e
             else:
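
The rewritten version check reduces any curl_cffi version string to its first three numeric components, so prerelease tags such as 0.7.0b4 parse cleanly; a quick standalone check of the parsing and the acceptance window:

import re

for version in ('0.5.10', '0.7.0b4', '0.8.0'):
    parsed = tuple(map(int, re.split(r'[^\d]+', version)[:3]))
    supported = parsed == (0, 5, 10) or (0, 7, 0) <= parsed < (0, 8, 0)
    print(version, parsed, supported)
# 0.5.10 (0, 5, 10) True
# 0.7.0b4 (0, 7, 0) True
# 0.8.0 (0, 8, 0) False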

yt_dlp/version.py

@@ -1,8 +1,8 @@
 # Autogenerated by devscripts/update-version.py

-__version__ = '2024.07.07'
+__version__ = '2024.07.09'

-RELEASE_GIT_HEAD = 'b337d2989ce0614651d363383f6f743d977248ef'
+RELEASE_GIT_HEAD = '7ead7332af69422cee931aec3faa277288e9e212'

 VARIANT = None
@@ -12,4 +12,4 @@ CHANNEL = 'stable'
 ORIGIN = 'yt-dlp/yt-dlp'

-_pkg_version = '2024.07.07'
+_pkg_version = '2024.07.09'