c-basalt 2025-06-18 15:20:15 +12:00 committed by GitHub
commit 040e744f6a
28 changed files with 1562 additions and 275 deletions

View File

@ -213,7 +213,7 @@ ### Metadata
 ### Misc
 * [**pycryptodomex**](https://github.com/Legrandin/pycryptodome)\* - For decrypting AES-128 HLS streams and various other data. Licensed under [BSD-2-Clause](https://github.com/Legrandin/pycryptodome/blob/master/LICENSE.rst)
-* [**phantomjs**](https://github.com/ariya/phantomjs) - Used in extractors where javascript needs to be run. Licensed under [BSD-3-Clause](https://github.com/ariya/phantomjs/blob/master/LICENSE.BSD)
+* [**phantomjs**](https://github.com/ariya/phantomjs), [**deno**](https://github.com/denoland/deno/) - Used in extractors where javascript needs to be run. Licensed under [BSD-3-Clause](https://github.com/ariya/phantomjs/blob/master/LICENSE.BSD) and [MIT](https://github.com/denoland/deno/blob/main/LICENSE.md) respectively
 * [**secretstorage**](https://github.com/mitya57/secretstorage)\* - For `--cookies-from-browser` to access the **Gnome** keyring while decrypting cookies of **Chromium**-based browsers on **Linux**. Licensed under [BSD-3-Clause](https://github.com/mitya57/secretstorage/blob/master/LICENSE)
 * Any external downloader that you want to use with `--downloader`
@ -798,6 +798,9 @@ ## Workarounds:
 be used along with --min-sleep-interval
 --sleep-subtitles SECONDS Number of seconds to sleep before each
 subtitle download
+--jsi-preference JSI Preferred JS interpreters to use during
+extraction. Can be given as comma-separated
+values
 ## Video Format Options:
 -f, --format FORMAT Video format code, see "FORMAT SELECTION"
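A minimal usage sketch for the new option (the interpreter names here are assumptions taken from the JSI keys exercised by the test suite, e.g. `Deno` and `PhantomJS`; the equivalent CLI form would be `yt-dlp --jsi-preference Deno,PhantomJS URL`):

```python
# Sketch only: 'jsi_preference' is the new YoutubeDL param added in this commit;
# the interpreter names below are assumptions based on the tests, not a canonical list.
import yt_dlp

ydl_opts = {
    'jsi_preference': ['Deno', 'PhantomJS'],  # try Deno first, fall back to PhantomJS
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download(['https://www.iq.com/play/one-piece-episode-1000-1ma1i6ferf4'])
```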

View File

@ -25,12 +25,14 @@
 import yt_dlp.YoutubeDL  # isort: split
 from yt_dlp.extractor import get_info_extractor
+from yt_dlp.jsinterp.common import get_included_jsi
 from yt_dlp.networking.exceptions import HTTPError, TransportError
 from yt_dlp.utils import (
     DownloadError,
     ExtractorError,
     UnavailableVideoError,
     YoutubeDLError,
+    filter_dict,
     format_bytes,
     join_nonempty,
 )
@ -82,6 +84,29 @@ def __str__(self):
 # Dynamically generate tests
 def generator(test_case, tname):
+    # Setting `jsi_matrix` to True, or `jsi_matrix_only_include`/`jsi_matrix_exclude`
+    # to a non-empty list, triggers matrix behavior for JSI
+    if any(test_case.get(key) for key in [
+        'jsi_matrix', 'jsi_matrix_only_include', 'jsi_matrix_exclude',
+    ]):
+        jsi_keys = list(get_included_jsi(only_include=test_case.get('jsi_matrix_only_include'),
+                                         exclude=test_case.get('jsi_matrix_exclude')))
+        # use jsi_preference here instead of force-blocking other JSI runtimes;
+        # exclusion, if needed, should be specified in the test case to optimize testing
+        def generate_jsi_sub_case(jsi_key):
+            sub_case = filter_dict(test_case, lambda k, _: not k.startswith('jsi_matrix'))
+            sub_case['params'] = {**test_case.get('params', {}), 'jsi_preference': [jsi_key]}
+            return generator(sub_case, f'{tname}_{jsi_key}')
+        def run_sub_cases(self):
+            for i, jsi_key in enumerate(jsi_keys):
+                with self.subTest(jsi_key):
+                    print(f'Running case {tname} using JSI: {jsi_key} ({i + 1}/{len(jsi_keys)})')
+                    generate_jsi_sub_case(jsi_key)(self)
+        return run_sub_cases
     def test_template(self):
         if self.COMPLETED_TESTS.get(tname):
             return
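To make the matrix behavior above concrete, a hypothetical download test case opting in might look like the sketch below (URL, md5-less info_dict and other values are placeholders; only the `jsi_matrix*` keys and the per-runtime `jsi_preference` param come from the code above):

```python
# Hypothetical test case: 'jsi_matrix' makes generator() emit one sub-case per
# included JS interpreter, each running with params={'jsi_preference': [<JSI key>]}.
_TEST_CASE = {
    'name': 'SomeSite',                     # placeholder extractor name
    'url': 'https://example.com/video/1',   # placeholder URL
    'info_dict': {'id': '1', 'ext': 'mp4', 'title': 'sample'},
    'jsi_matrix': True,
    'jsi_matrix_exclude': ['PhantomJS'],    # optionally drop runtimes that cannot pass
}
```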

test/test_jsi_external.py Normal file (206 lines)
View File

@ -0,0 +1,206 @@
#!/usr/bin/env python3
from __future__ import annotations
import os
import dataclasses
import datetime
import time
import sys
import unittest
import http.cookiejar
import functools
import typing
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import FakeYDL
from yt_dlp.cookies import YoutubeDLCookieJar
from yt_dlp.jsinterp.common import get_included_jsi
from yt_dlp.jsinterp._helper import prepare_wasm_jsmodule
if typing.TYPE_CHECKING:
from yt_dlp.jsinterp.common import JSI
@dataclasses.dataclass
class NetscapeFields:
name: str
value: str
domain: str
path: str
secure: bool
expires: int | None
def to_cookie(self):
return http.cookiejar.Cookie(
0, self.name, self.value,
None, False,
self.domain, True, self.domain.startswith('.'),
self.path, True,
self.secure, self.expires, False,
None, None, {},
)
def expire_str(self):
return datetime.datetime.fromtimestamp(
self.expires, datetime.timezone.utc).strftime('%a, %d %b %Y %H:%M:%S GMT')
def __eq__(self, other: NetscapeFields | http.cookiejar.Cookie):
return all(getattr(self, attr) == getattr(other, attr) for attr in ['name', 'value', 'domain', 'path', 'secure', 'expires'])
def use_jsi_rumtimes(exclude=[]):
def inner(func: typing.Callable[[unittest.TestCase, type[JSI]], None]):
@functools.wraps(func)
def wrapper(self: unittest.TestCase):
for key, jsi in get_included_jsi(exclude=exclude).items():
def wrapped_jsi_with_unavaliable_auto_skip(*args, **kwargs):
if getattr(jsi, 'TEST_DATA_PLUGIN', False):
self.skipTest('Testdata plugin')
instance = jsi(*args, **kwargs)
if not instance.is_available():
self.skipTest(f'{key} is not available')
return instance
with self.subTest(key):
func(self, wrapped_jsi_with_unavaliable_auto_skip)
return wrapper
return inner
class TestExternalJSI(unittest.TestCase):
_TESTDATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'testdata', 'jsi_external')
maxDiff = 2000
def setUp(self):
self.ydl = FakeYDL()
@use_jsi_rumtimes()
def test_execute(self, jsi_cls: type[JSI]):
jsi = jsi_cls(self.ydl, '', 10)
self.assertEqual(jsi.execute('console.log("Hello, world!");'), 'Hello, world!')
@use_jsi_rumtimes()
def test_user_agent(self, jsi_cls: type[JSI]):
ua = self.ydl.params['http_headers']['User-Agent']
jsi = jsi_cls(self.ydl, '', 10)
self.assertEqual(jsi.execute('console.log(navigator.userAgent);'), ua)
self.assertNotEqual(jsi.execute('console.log(JSON.stringify(navigator.webdriver));'), 'true')
jsi = jsi_cls(self.ydl, '', 10, user_agent='test/ua')
self.assertEqual(jsi.execute('console.log(navigator.userAgent);'), 'test/ua')
@use_jsi_rumtimes()
def test_location(self, jsi_cls: type[JSI]):
jsi = jsi_cls(self.ydl, 'https://example.com/123/456', 10)
self.assertEqual(jsi.execute('console.log(JSON.stringify([location.href, location.hostname]));'),
'["https://example.com/123/456","example.com"]')
@use_jsi_rumtimes(exclude=['Deno'])
def test_execute_dom_parse(self, jsi_cls: type[JSI]):
jsi = jsi_cls(self.ydl, '', 10)
self.assertEqual(jsi.execute(
'console.log(document.getElementById("test-div").innerHTML);',
html='<html><body><div id="test-div">Hello, world!</div></body></html>'),
'Hello, world!')
@use_jsi_rumtimes(exclude=['Deno'])
def test_execute_dom_script(self, jsi_cls: type[JSI]):
jsi = jsi_cls(self.ydl, '', 10)
self.assertEqual(jsi.execute(
'console.log(document.getElementById("test-div").innerHTML);',
html='''<html><head><title>Hello, world!</title><body>
<div id="test-div"></div>
<script src="https://example.com/script.js"></script>
<script type="text/javascript">
document.getElementById("test-div").innerHTML = document.title;
console.log('this should not show up');
a = b; // Errors should be ignored
</script>
</body></html>'''),
'Hello, world!')
@use_jsi_rumtimes(exclude=['Deno'])
def test_dom_location(self, jsi_cls: type[JSI]):
jsi = jsi_cls(self.ydl, 'https://example.com/123/456', 10)
self.assertEqual(jsi.execute(
'console.log(document.getElementById("test-div").innerHTML);',
html='''<html><head><script>
document.querySelector("#test-div").innerHTML = document.domain</script></head>
<body><div id="test-div">Hello, world!</div></body></html>'''),
'example.com')
@use_jsi_rumtimes(exclude=['Deno'])
def test_execute_cookiejar(self, jsi_cls: type[JSI]):
cookiejar = YoutubeDLCookieJar()
ref_cookiejar = YoutubeDLCookieJar()
def _assert_expected_execute(cookie_str, ref_cookie_str):
self.assertEqual(set(cookie_str.split('; ')), set(ref_cookie_str.split('; ')))
for cookie in cookiejar:
ref_cookie = next((c for c in ref_cookiejar if c.name == cookie.name
and c.domain == cookie.domain), None)
self.assertEqual(repr(cookie), repr(ref_cookie))
for test_cookie in [
NetscapeFields('test1', 'test1', '.example.com', '/', False, int(time.time()) + 1000),
NetscapeFields('test2', 'test2', '.example.com', '/', True, int(time.time()) + 1000),
NetscapeFields('test3', 'test3', '.example.com', '/123', False, int(time.time()) + 1000),
NetscapeFields('test4', 'test4', '.example.com', '/456', False, int(time.time()) + 1000),
NetscapeFields('test5', 'test5', '.example.com', '/123', True, int(time.time()) + 1000),
NetscapeFields('test6', 'test6', '.example.com', '/456', True, int(time.time()) + 1000),
NetscapeFields('test1', 'other1', '.other.com', '/', False, int(time.time()) + 1000),
NetscapeFields('test2', 'other2', '.other.com', '/', False, int(time.time()) + 1000),
NetscapeFields('test7', 'other7', '.other.com', '/', False, int(time.time()) + 1000),
]:
cookiejar.set_cookie(test_cookie.to_cookie())
ref_cookiejar.set_cookie(test_cookie.to_cookie())
# test identity without modification from js
jsi = jsi_cls(self.ydl, 'http://example.com/123/456', 10)
_assert_expected_execute(jsi.execute(
'console.log(document.cookie);', cookiejar=cookiejar),
'test1=test1; test3=test3')
# test modification of existing cookie from js
new_cookie_1 = NetscapeFields('test1', 'new1', '.example.com', '/', True, int(time.time()) + 900)
new_cookie_2 = NetscapeFields('test2', 'new2', '.example.com', '/', True, int(time.time()) + 900)
ref_cookiejar.set_cookie(new_cookie_1.to_cookie())
ref_cookiejar.set_cookie(new_cookie_2.to_cookie())
# change to https url to test secure-domain behavior
jsi = jsi_cls(self.ydl, 'https://example.com/123/456', 10)
_assert_expected_execute(jsi.execute(
f'''document.cookie = "test1=new1; secure; expires={new_cookie_1.expire_str()}; domain=.example.com; path=/";
console.log(document.cookie);''',
html=f'''<html><body><div id="test-div">Hello, world!</div>
<script>
document.cookie = "test2=new2; secure; expires={new_cookie_2.expire_str()}; domain=.example.com; path=/";
</script>
</body></html>''',
cookiejar=cookiejar),
'test1=new1; test2=new2; test3=test3; test5=test5')
@use_jsi_rumtimes(exclude=['PhantomJS'])
def test_wasm(self, jsi_cls: type[JSI]):
with open(os.path.join(self._TESTDATA_DIR, 'hello_wasm.js')) as f:
js_mod = f.read()
with open(os.path.join(self._TESTDATA_DIR, 'hello_wasm_bg.wasm'), 'rb') as f:
wasm = f.read()
js_base = prepare_wasm_jsmodule(js_mod, wasm)
js_code = js_base + ''';
console.log(add(1, 2));
greet('world');
'''
jsi = jsi_cls(self.ydl, '', 10)
self.assertEqual(jsi.execute(js_code), '3\nHello, world!')
if __name__ == '__main__':
unittest.main()

View File

@ -9,7 +9,7 @@
 import math
-from yt_dlp.jsinterp import JS_Undefined, JSInterpreter, js_number_to_string
+from yt_dlp.jsinterp.native import JS_Undefined, JSInterpreter, js_number_to_string
 class NaN:

View File

@ -22,9 +22,11 @@
 from yt_dlp.globals import (
     extractors,
     postprocessors,
+    jsi_runtimes,
     plugin_dirs,
     plugin_ies,
     plugin_pps,
+    plugin_jsis,
     all_plugins_loaded,
     plugin_specs,
 )
@ -44,16 +46,24 @@
     plugin_destination=plugin_pps,
 )
+JSI_PLUGIN_SPEC = PluginSpec(
+    module_name='jsinterp',
+    suffix='JSI',
+    destination=jsi_runtimes,
+    plugin_destination=plugin_jsis,
+)
 def reset_plugins():
     plugin_ies.value = {}
     plugin_pps.value = {}
+    plugin_jsis.value = {}
     plugin_dirs.value = ['default']
     plugin_specs.value = {}
     all_plugins_loaded.value = False
     # Clearing override plugins is probably difficult
     for module_name in tuple(sys.modules):
-        for plugin_type in ('extractor', 'postprocessor'):
+        for plugin_type in ('extractor', 'postprocessor', 'jsinterp'):
             if module_name.startswith(f'{PACKAGE_NAME}.{plugin_type}.'):
                 del sys.modules[module_name]
@ -108,6 +118,17 @@ def test_postprocessor_classes(self):
         self.assertIn(f'{PACKAGE_NAME}.postprocessor.normal', sys.modules.keys())
         self.assertIn('NormalPluginPP', plugin_pps.value)
+    def test_jsi_runtime_classes(self):
+        plugins_jsi = load_plugins(JSI_PLUGIN_SPEC)
+        self.assertIn('NormalPluginJSI', plugins_jsi.keys())
+        self.assertIn(f'{PACKAGE_NAME}.jsinterp.normal', sys.modules.keys())
+        self.assertIn('NormalPluginJSI', plugin_jsis.value)
+        self.assertNotIn('OverrideDenoJSI', plugins_jsi.keys())
+        self.assertNotIn('OverrideDenoJSI', plugin_jsis.value)
+        self.assertNotIn('_UnderscoreOverrideDenoJSI', plugins_jsi.keys())
+        self.assertNotIn('_UnderscoreOverrideDenoJSI', plugin_jsis.value)
     def test_importing_zipped_module(self):
         zip_path = TEST_DATA_DIR / 'zipped_plugins.zip'
         shutil.make_archive(str(zip_path)[:-4], 'zip', str(zip_path)[:-4])
@ -125,6 +146,9 @@ def test_importing_zipped_module(self):
             plugins_pp = load_plugins(POSTPROCESSOR_PLUGIN_SPEC)
             self.assertIn('ZippedPluginPP', plugins_pp.keys())
+            plugins_jsi = load_plugins(JSI_PLUGIN_SPEC)
+            self.assertIn('ZippedPluginJSI', plugins_jsi.keys())
         finally:
             sys.path.remove(str(zip_path))
             os.remove(zip_path)
@ -134,13 +158,14 @@ def test_reloading_plugins(self):
         reload_plugins_path = TEST_DATA_DIR / 'reload_plugins'
         load_plugins(EXTRACTOR_PLUGIN_SPEC)
         load_plugins(POSTPROCESSOR_PLUGIN_SPEC)
+        load_plugins(JSI_PLUGIN_SPEC)
         # Remove default folder and add reload_plugin path
         sys.path.remove(str(TEST_DATA_DIR))
         sys.path.append(str(reload_plugins_path))
         importlib.invalidate_caches()
         try:
-            for plugin_type in ('extractor', 'postprocessor'):
+            for plugin_type in ('extractor', 'postprocessor', 'jsinterp'):
                 package = importlib.import_module(f'{PACKAGE_NAME}.{plugin_type}')
                 self.assertIn(reload_plugins_path / PACKAGE_NAME / plugin_type, map(Path, package.__path__))
@ -161,6 +186,14 @@ def test_reloading_plugins(self):
                 postprocessors.value['NormalPluginPP'].REPLACED,
                 msg='Reloading has not replaced original postprocessor plugin globally')
+            plugins_jsi = load_plugins(JSI_PLUGIN_SPEC)
+            self.assertIn('NormalPluginJSI', plugins_jsi.keys())
+            self.assertTrue(plugins_jsi['NormalPluginJSI'].REPLACED,
+                            msg='Reloading has not replaced original JSI plugin')
+            self.assertTrue(
+                jsi_runtimes.value['NormalPluginJSI'].REPLACED,
+                msg='Reloading has not replaced original JSI plugin globally')
         finally:
             sys.path.remove(str(reload_plugins_path))
             sys.path.append(str(TEST_DATA_DIR))
@ -181,6 +214,24 @@ def test_extractor_override_plugin(self):
         from yt_dlp.extractor.generic import GenericIE
         self.assertEqual(GenericIE.IE_NAME, 'generic+override+underscore-override')
+    def test_jsi_override_plugin(self):
+        load_plugins(JSI_PLUGIN_SPEC)
+        from yt_dlp.jsinterp._deno import DenoJSI
+        # test that jsi_runtimes is updated with override jsi
+        self.assertTrue(DenoJSI is jsi_runtimes.value['Deno'])
+        self.assertEqual(jsi_runtimes.value['Deno'].TEST_FIELD, 'override')
+        self.assertEqual(jsi_runtimes.value['Deno'].SECONDARY_TEST_FIELD, 'underscore-override')
+        self.assertEqual(jsi_runtimes.value['Deno'].JSI_NAME, 'Deno+override+underscore-override')
+        importlib.invalidate_caches()
+        # test that loading a second time doesn't wrap a second time
+        load_plugins(EXTRACTOR_PLUGIN_SPEC)
+        from yt_dlp.jsinterp._deno import DenoJSI
+        self.assertTrue(DenoJSI is jsi_runtimes.value['Deno'])
+        self.assertEqual(jsi_runtimes.value['Deno'].JSI_NAME, 'Deno+override+underscore-override')
     def test_load_all_plugin_types(self):
# no plugin specs registered # no plugin specs registered
@ -188,24 +239,29 @@ def test_load_all_plugin_types(self):
         self.assertNotIn(f'{PACKAGE_NAME}.extractor.normal', sys.modules.keys())
         self.assertNotIn(f'{PACKAGE_NAME}.postprocessor.normal', sys.modules.keys())
+        self.assertNotIn(f'{PACKAGE_NAME}.jsinterp.normal', sys.modules.keys())
         register_plugin_spec(EXTRACTOR_PLUGIN_SPEC)
         register_plugin_spec(POSTPROCESSOR_PLUGIN_SPEC)
+        register_plugin_spec(JSI_PLUGIN_SPEC)
         load_all_plugins()
         self.assertTrue(all_plugins_loaded.value)
         self.assertIn(f'{PACKAGE_NAME}.extractor.normal', sys.modules.keys())
         self.assertIn(f'{PACKAGE_NAME}.postprocessor.normal', sys.modules.keys())
+        self.assertIn(f'{PACKAGE_NAME}.jsinterp.normal', sys.modules.keys())
     def test_no_plugin_dirs(self):
         register_plugin_spec(EXTRACTOR_PLUGIN_SPEC)
         register_plugin_spec(POSTPROCESSOR_PLUGIN_SPEC)
+        register_plugin_spec(JSI_PLUGIN_SPEC)
         plugin_dirs.value = []
         load_all_plugins()
         self.assertNotIn(f'{PACKAGE_NAME}.extractor.normal', sys.modules.keys())
         self.assertNotIn(f'{PACKAGE_NAME}.postprocessor.normal', sys.modules.keys())
+        self.assertNotIn(f'{PACKAGE_NAME}.jsinterp.normal', sys.modules.keys())
     def test_set_plugin_dirs(self):
         custom_plugin_dir = str(TEST_DATA_DIR / 'plugin_packages')
@ -236,9 +292,11 @@ def test_append_plugin_dirs(self):
     def test_get_plugin_spec(self):
         register_plugin_spec(EXTRACTOR_PLUGIN_SPEC)
         register_plugin_spec(POSTPROCESSOR_PLUGIN_SPEC)
+        register_plugin_spec(JSI_PLUGIN_SPEC)
         self.assertEqual(plugin_specs.value.get('extractor'), EXTRACTOR_PLUGIN_SPEC)
         self.assertEqual(plugin_specs.value.get('postprocessor'), POSTPROCESSOR_PLUGIN_SPEC)
+        self.assertEqual(plugin_specs.value.get('jsinterp'), JSI_PLUGIN_SPEC)
         self.assertIsNone(plugin_specs.value.get('invalid'))

test/testdata/jsi_external/hello_wasm.js vendored Normal file (234 lines)
View File

@ -0,0 +1,234 @@
// wasm-pack build --target web
/* lib.rs
use wasm_bindgen::prelude::*;
#[wasm_bindgen]
extern "C" {
pub fn eval(s: &str);
}
#[wasm_bindgen]
pub fn greet(name: &str) {
eval(&format!("console.log('Hello, {}!')", name));
}
#[wasm_bindgen]
pub fn add(left: i32, right: i32) -> i32 {
left + right
}
*/
let wasm;
const cachedTextDecoder = (typeof TextDecoder !== 'undefined' ? new TextDecoder('utf-8', { ignoreBOM: true, fatal: true }) : { decode: () => { throw Error('TextDecoder not available') } } );
if (typeof TextDecoder !== 'undefined') { cachedTextDecoder.decode(); };
let cachedUint8ArrayMemory0 = null;
function getUint8ArrayMemory0() {
if (cachedUint8ArrayMemory0 === null || cachedUint8ArrayMemory0.byteLength === 0) {
cachedUint8ArrayMemory0 = new Uint8Array(wasm.memory.buffer);
}
return cachedUint8ArrayMemory0;
}
function getStringFromWasm0(ptr, len) {
ptr = ptr >>> 0;
return cachedTextDecoder.decode(getUint8ArrayMemory0().subarray(ptr, ptr + len));
}
let WASM_VECTOR_LEN = 0;
const cachedTextEncoder = (typeof TextEncoder !== 'undefined' ? new TextEncoder('utf-8') : { encode: () => { throw Error('TextEncoder not available') } } );
const encodeString = (typeof cachedTextEncoder.encodeInto === 'function'
? function (arg, view) {
return cachedTextEncoder.encodeInto(arg, view);
}
: function (arg, view) {
const buf = cachedTextEncoder.encode(arg);
view.set(buf);
return {
read: arg.length,
written: buf.length
};
});
function passStringToWasm0(arg, malloc, realloc) {
if (realloc === undefined) {
const buf = cachedTextEncoder.encode(arg);
const ptr = malloc(buf.length, 1) >>> 0;
getUint8ArrayMemory0().subarray(ptr, ptr + buf.length).set(buf);
WASM_VECTOR_LEN = buf.length;
return ptr;
}
let len = arg.length;
let ptr = malloc(len, 1) >>> 0;
const mem = getUint8ArrayMemory0();
let offset = 0;
for (; offset < len; offset++) {
const code = arg.charCodeAt(offset);
if (code > 0x7F) break;
mem[ptr + offset] = code;
}
if (offset !== len) {
if (offset !== 0) {
arg = arg.slice(offset);
}
ptr = realloc(ptr, len, len = offset + arg.length * 3, 1) >>> 0;
const view = getUint8ArrayMemory0().subarray(ptr + offset, ptr + len);
const ret = encodeString(arg, view);
offset += ret.written;
ptr = realloc(ptr, len, offset, 1) >>> 0;
}
WASM_VECTOR_LEN = offset;
return ptr;
}
/**
* @param {string} name
*/
export function greet(name) {
const ptr0 = passStringToWasm0(name, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
const len0 = WASM_VECTOR_LEN;
wasm.greet(ptr0, len0);
}
/**
* @param {number} left
* @param {number} right
* @returns {number}
*/
export function add(left, right) {
const ret = wasm.add(left, right);
return ret;
}
async function __wbg_load(module, imports) {
if (typeof Response === 'function' && module instanceof Response) {
if (typeof WebAssembly.instantiateStreaming === 'function') {
try {
return await WebAssembly.instantiateStreaming(module, imports);
} catch (e) {
if (module.headers.get('Content-Type') != 'application/wasm') {
console.warn("`WebAssembly.instantiateStreaming` failed because your server does not serve Wasm with `application/wasm` MIME type. Falling back to `WebAssembly.instantiate` which is slower. Original error:\n", e);
} else {
throw e;
}
}
}
const bytes = await module.arrayBuffer();
return await WebAssembly.instantiate(bytes, imports);
} else {
const instance = await WebAssembly.instantiate(module, imports);
if (instance instanceof WebAssembly.Instance) {
return { instance, module };
} else {
return instance;
}
}
}
function __wbg_get_imports() {
const imports = {};
imports.wbg = {};
imports.wbg.__wbg_eval_d1c6d8ede79fdfce = function(arg0, arg1) {
eval(getStringFromWasm0(arg0, arg1));
};
imports.wbg.__wbindgen_init_externref_table = function() {
const table = wasm.__wbindgen_export_0;
const offset = table.grow(4);
table.set(0, undefined);
table.set(offset + 0, undefined);
table.set(offset + 1, null);
table.set(offset + 2, true);
table.set(offset + 3, false);
;
};
return imports;
}
function __wbg_init_memory(imports, memory) {
}
function __wbg_finalize_init(instance, module) {
wasm = instance.exports;
__wbg_init.__wbindgen_wasm_module = module;
cachedUint8ArrayMemory0 = null;
wasm.__wbindgen_start();
return wasm;
}
function initSync(module) {
if (wasm !== undefined) return wasm;
if (typeof module !== 'undefined') {
if (Object.getPrototypeOf(module) === Object.prototype) {
({module} = module)
} else {
console.warn('using deprecated parameters for `initSync()`; pass a single object instead')
}
}
const imports = __wbg_get_imports();
__wbg_init_memory(imports);
if (!(module instanceof WebAssembly.Module)) {
module = new WebAssembly.Module(module);
}
const instance = new WebAssembly.Instance(module, imports);
return __wbg_finalize_init(instance, module);
}
async function __wbg_init(module_or_path) {
if (wasm !== undefined) return wasm;
if (typeof module_or_path !== 'undefined') {
if (Object.getPrototypeOf(module_or_path) === Object.prototype) {
({module_or_path} = module_or_path)
} else {
console.warn('using deprecated parameters for the initialization function; pass a single object instead')
}
}
if (typeof module_or_path === 'undefined') {
module_or_path = new URL('hello_wasm_bg.wasm', import.meta.url);
}
const imports = __wbg_get_imports();
if (typeof module_or_path === 'string' || (typeof Request === 'function' && module_or_path instanceof Request) || (typeof URL === 'function' && module_or_path instanceof URL)) {
module_or_path = fetch(module_or_path);
}
__wbg_init_memory(imports);
const { instance, module } = await __wbg_load(await module_or_path, imports);
return __wbg_finalize_init(instance, module);
}
export { initSync };
export default __wbg_init;

test/testdata/jsi_external/hello_wasm_bg.wasm (binary file not shown)

View File

@ -0,0 +1,5 @@
from yt_dlp.jsinterp.common import JSI
class NormalPluginJSI(JSI):
REPLACED = True

View File

@ -0,0 +1,6 @@
from yt_dlp.jsinterp.common import JSI
class NormalPluginJSI(JSI):
TEST_DATA_PLUGIN = True
REPLACED = False

View File

@ -0,0 +1,5 @@
from yt_dlp.jsinterp._deno import DenoJSI
class OverrideDenoJSI(DenoJSI, plugin_name='override'):
TEST_FIELD = 'override'

View File

@ -0,0 +1,5 @@
from yt_dlp.jsinterp._deno import DenoJSI
class _UnderscoreOverrideDenoJSI(DenoJSI, plugin_name='underscore-override'):
SECONDARY_TEST_FIELD = 'underscore-override'

View File

@ -0,0 +1,5 @@
from yt_dlp.jsinterp.common import JSI
class ZippedPluginJSI(JSI):
pass
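The test-data modules above hint at what a third-party JSI plugin would look like. A minimal sketch, assuming user plugins follow the same `yt_dlp_plugins/<type>/` package layout used for extractor and postprocessor plugins (the path and class body here are illustrative, not part of this commit):

```python
# yt_dlp_plugins/jsinterp/my_runtime.py  (hypothetical location)
from yt_dlp.jsinterp.common import JSI


class MyRuntimePluginJSI(JSI):
    # The class name must end with the 'JSI' suffix declared by JSI_PLUGIN_SPEC
    # so that load_plugins() registers it into jsi_runtimes/plugin_jsis.
    pass
```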

View File

@ -32,13 +32,15 @@
 from .downloader.rtmp import rtmpdump_version
 from .extractor import gen_extractor_classes, get_info_extractor, import_extractors
 from .extractor.common import UnsupportedURLIE
-from .extractor.openload import PhantomJSwrapper
+from .jsinterp import PhantomJSwrapper
 from .globals import (
     IN_CLI,
     LAZY_EXTRACTORS,
     plugin_ies,
     plugin_ies_overrides,
     plugin_pps,
+    plugin_jsis,
+    plugin_jsis_overrides,
     all_plugins_loaded,
     plugin_dirs,
 )
@ -445,6 +447,8 @@ class YoutubeDL:
 Actual sleep time will be a random float from range
 [sleep_interval; max_sleep_interval].
 sleep_interval_subtitles: Number of seconds to sleep before each subtitle download
+jsi_preference: Preferred JS interpreters to use during extraction. Can be
+                given as comma-separated values.
 listformats: Print an overview of available video formats and exit.
 list_thumbnails: Print a table of all thumbnails and exit.
 match_filter: A function that gets called for every video with the signature
@ -4097,13 +4101,17 @@ def get_encoding(stream):
         write_debug(f'Proxy map: {self.proxies}')
         write_debug(f'Request Handlers: {", ".join(rh.RH_NAME for rh in self._request_director.handlers.values())}')
-        for plugin_type, plugins in (('Extractor', plugin_ies), ('Post-Processor', plugin_pps)):
+        for plugin_type, plugins in (('Extractor', plugin_ies), ('Post-Processor', plugin_pps),
+                                     ('JSI-Runtime', plugin_jsis)):
             display_list = [
                 klass.__name__ if klass.__name__ == name else f'{klass.__name__} as {name}'
                 for name, klass in plugins.value.items()]
             if plugin_type == 'Extractor':
                 display_list.extend(f'{plugins[-1].IE_NAME.partition("+")[2]} ({parent.__name__})'
                                     for parent, plugins in plugin_ies_overrides.value.items())
+            elif plugin_type == 'JSI-Runtime':
+                display_list.extend(f'{plugins[-1].JSI_NAME.partition("+")[2]} ({parent.__name__})'
+                                    for parent, plugins in plugin_jsis_overrides.value.items())
             if not display_list:
                 continue
             write_debug(f'{plugin_type} Plugins: {", ".join(sorted(display_list))}')

View File

@ -946,6 +946,7 @@ def parse_options(argv=None):
     'sleep_interval': opts.sleep_interval,
     'max_sleep_interval': opts.max_sleep_interval,
     'sleep_interval_subtitles': opts.sleep_interval_subtitles,
+    'jsi_preference': opts.jsi_preference,
     'external_downloader': opts.external_downloader,
     'download_ranges': opts.download_ranges,
     'force_keyframes_at_cuts': opts.force_keyframes_at_cuts,

View File

@ -4,7 +4,7 @@
 import uuid
 from .common import InfoExtractor
-from .openload import PhantomJSwrapper
+from ..jsinterp import PhantomJSwrapper
 from ..utils import (
     ExtractorError,
     UserNotLive,

View File

@ -5,7 +5,7 @@
 import urllib.parse
 from .common import InfoExtractor
-from .openload import PhantomJSwrapper
+from ..jsinterp import JSIWrapper
 from ..utils import (
     ExtractorError,
     clean_html,
@ -398,6 +398,27 @@ class IqIE(InfoExtractor):
     IE_DESC = 'International version of iQiyi'
     _VALID_URL = r'https?://(?:www\.)?iq\.com/play/(?:[\w%-]*-)?(?P<id>\w+)'
     _TESTS = [{
+        'url': 'https://www.iq.com/play/sangmin-dinneaw-episode-1-xmk7546rfw',
+        'md5': '63fcb4b7d4863472fe0a9be75d9e9d60',
+        'info_dict': {
+            'ext': 'mp4',
+            'id': 'xmk7546rfw',
+            'title': '尚岷与丁尼奥 第1集',
+            'description': 'md5:e8fe4a8da25f4b8c86bc5506b1c3faaa',
+            'duration': 3092,
+            'timestamp': 1735520401,
+            'upload_date': '20241230',
+            'episode_number': 1,
+            'episode': 'Episode 1',
+            'series': 'Sangmin Dinneaw',
+            'age_limit': 18,
+            'average_rating': float,
+            'categories': [],
+            'cast': ['Sangmin Choi', 'Ratana Aiamsaart'],
+        },
+        'expected_warnings': ['format is restricted'],
+        'jsi_matrix': True,
+    }, {
         'url': 'https://www.iq.com/play/one-piece-episode-1000-1ma1i6ferf4',
         'md5': '2d7caf6eeca8a32b407094b33b757d39',
         'info_dict': {
@ -418,6 +439,7 @@ class IqIE(InfoExtractor):
             'format': '500',
         },
         'expected_warnings': ['format is restricted'],
+        'skip': 'geo-restricted',
     }, {
         # VIP-restricted video
         'url': 'https://www.iq.com/play/mermaid-in-the-fog-2021-gbdpx13bs4',
@ -449,7 +471,6 @@ class IqIE(InfoExtractor):
     }
     _DASH_JS = '''
-        console.log(page.evaluate(function() {
         var tvid = "%(tvid)s"; var vid = "%(vid)s"; var src = "%(src)s";
         var uid = "%(uid)s"; var dfp = "%(dfp)s"; var mode = "%(mode)s"; var lang = "%(lang)s";
         var bid_list = %(bid_list)s; var ut_list = %(ut_list)s; var tm = new Date().getTime();
@ -515,9 +536,7 @@ class IqIE(InfoExtractor):
         var dash_path = '/dash?' + enc_params.join('&'); dash_path += '&vf=' + cmd5x(dash_path);
         dash_paths[bid] = dash_path;
         });
-        return JSON.stringify(dash_paths);
-        }));
-        saveAndExit();
+        console.log(JSON.stringify(dash_paths));
     '''
def _extract_vms_player_js(self, webpage, video_id): def _extract_vms_player_js(self, webpage, video_id):
@ -597,10 +616,10 @@ def _real_extract(self, url):
         else:
             ut_list = ['0']
+        jsi = JSIWrapper(self, url, timeout=120)
         # bid 0 as an initial format checker
-        dash_paths = self._parse_json(PhantomJSwrapper(self, timeout=120_000).get(
-            url, note2='Executing signature code (this may take a couple minutes)',
-            html='<!DOCTYPE html>', video_id=video_id, jscode=self._DASH_JS % {
+        dash_paths = self._parse_json(jsi.execute(self._DASH_JS % {
             'tvid': video_info['tvId'],
             'vid': video_info['vid'],
             'src': traverse_obj(next_props, ('initialProps', 'pageProps', 'ptid'),
@ -612,7 +631,7 @@ def _real_extract(self, url):
             'bid_list': '[' + ','.join(['0', *self._BID_TAGS.keys()]) + ']',
             'ut_list': '[' + ','.join(ut_list) + ']',
             'cmd5x_func': self._extract_cmd5x_function(webpage, video_id),
-        })[1].strip(), video_id)
+        }, video_id, html='<!DOCTYPE html>'), video_id)
         formats, subtitles = [], {}
         initial_format_data = self._download_json(

View File

@ -1,243 +0,0 @@
import collections
import contextlib
import json
import os
import subprocess
import tempfile
import urllib.parse
from ..utils import (
ExtractorError,
Popen,
check_executable,
format_field,
get_exe_version,
is_outdated_version,
shell_quote,
)
def cookie_to_dict(cookie):
cookie_dict = {
'name': cookie.name,
'value': cookie.value,
}
if cookie.port_specified:
cookie_dict['port'] = cookie.port
if cookie.domain_specified:
cookie_dict['domain'] = cookie.domain
if cookie.path_specified:
cookie_dict['path'] = cookie.path
if cookie.expires is not None:
cookie_dict['expires'] = cookie.expires
if cookie.secure is not None:
cookie_dict['secure'] = cookie.secure
if cookie.discard is not None:
cookie_dict['discard'] = cookie.discard
with contextlib.suppress(TypeError):
if (cookie.has_nonstandard_attr('httpOnly')
or cookie.has_nonstandard_attr('httponly')
or cookie.has_nonstandard_attr('HttpOnly')):
cookie_dict['httponly'] = True
return cookie_dict
def cookie_jar_to_list(cookie_jar):
return [cookie_to_dict(cookie) for cookie in cookie_jar]
class PhantomJSwrapper:
"""PhantomJS wrapper class
This class is experimental.
"""
INSTALL_HINT = 'Please download it from https://phantomjs.org/download.html'
_BASE_JS = R'''
phantom.onError = function(msg, trace) {{
var msgStack = ['PHANTOM ERROR: ' + msg];
if(trace && trace.length) {{
msgStack.push('TRACE:');
trace.forEach(function(t) {{
msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line
+ (t.function ? ' (in function ' + t.function +')' : ''));
}});
}}
console.error(msgStack.join('\n'));
phantom.exit(1);
}};
'''
_TEMPLATE = R'''
var page = require('webpage').create();
var fs = require('fs');
var read = {{ mode: 'r', charset: 'utf-8' }};
var write = {{ mode: 'w', charset: 'utf-8' }};
JSON.parse(fs.read("{cookies}", read)).forEach(function(x) {{
phantom.addCookie(x);
}});
page.settings.resourceTimeout = {timeout};
page.settings.userAgent = "{ua}";
page.onLoadStarted = function() {{
page.evaluate(function() {{
delete window._phantom;
delete window.callPhantom;
}});
}};
var saveAndExit = function() {{
fs.write("{html}", page.content, write);
fs.write("{cookies}", JSON.stringify(phantom.cookies), write);
phantom.exit();
}};
page.onLoadFinished = function(status) {{
if(page.url === "") {{
page.setContent(fs.read("{html}", read), "{url}");
}}
else {{
{jscode}
}}
}};
page.open("");
'''
_TMP_FILE_NAMES = ['script', 'html', 'cookies']
@staticmethod
def _version():
return get_exe_version('phantomjs', version_re=r'([0-9.]+)')
def __init__(self, extractor, required_version=None, timeout=10000):
self._TMP_FILES = {}
self.exe = check_executable('phantomjs', ['-v'])
if not self.exe:
raise ExtractorError(f'PhantomJS not found, {self.INSTALL_HINT}', expected=True)
self.extractor = extractor
if required_version:
version = self._version()
if is_outdated_version(version, required_version):
self.extractor._downloader.report_warning(
'Your copy of PhantomJS is outdated, update it to version '
f'{required_version} or newer if you encounter any errors.')
for name in self._TMP_FILE_NAMES:
tmp = tempfile.NamedTemporaryFile(delete=False)
tmp.close()
self._TMP_FILES[name] = tmp
self.options = collections.ChainMap({
'timeout': timeout,
}, {
x: self._TMP_FILES[x].name.replace('\\', '\\\\').replace('"', '\\"')
for x in self._TMP_FILE_NAMES
})
def __del__(self):
for name in self._TMP_FILE_NAMES:
with contextlib.suppress(OSError, KeyError):
os.remove(self._TMP_FILES[name].name)
def _save_cookies(self, url):
cookies = cookie_jar_to_list(self.extractor.cookiejar)
for cookie in cookies:
if 'path' not in cookie:
cookie['path'] = '/'
if 'domain' not in cookie:
cookie['domain'] = urllib.parse.urlparse(url).netloc
with open(self._TMP_FILES['cookies'].name, 'wb') as f:
f.write(json.dumps(cookies).encode())
def _load_cookies(self):
with open(self._TMP_FILES['cookies'].name, 'rb') as f:
cookies = json.loads(f.read().decode('utf-8'))
for cookie in cookies:
if cookie['httponly'] is True:
cookie['rest'] = {'httpOnly': None}
if 'expiry' in cookie:
cookie['expire_time'] = cookie['expiry']
self.extractor._set_cookie(**cookie)
def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers={}, jscode='saveAndExit();'):
"""
Downloads webpage (if needed) and executes JS
Params:
url: website url
html: optional, html code of website
video_id: video id
note: optional, displayed when downloading webpage
note2: optional, displayed when executing JS
headers: custom http headers
jscode: code to be executed when page is loaded
Returns tuple with:
* downloaded website (after JS execution)
* anything you print with `console.log` (but not inside `page.execute`!)
In most cases you don't need to add any `jscode`.
It is executed in `page.onLoadFinished`.
`saveAndExit();` is mandatory, use it instead of `phantom.exit()`
It is possible to wait for some element on the webpage, e.g.
var check = function() {
var elementFound = page.evaluate(function() {
return document.querySelector('#b.done') !== null;
});
if(elementFound)
saveAndExit();
else
window.setTimeout(check, 500);
}
page.evaluate(function(){
document.querySelector('#a').click();
});
check();
"""
if 'saveAndExit();' not in jscode:
raise ExtractorError('`saveAndExit();` not found in `jscode`')
if not html:
html = self.extractor._download_webpage(url, video_id, note=note, headers=headers)
with open(self._TMP_FILES['html'].name, 'wb') as f:
f.write(html.encode())
self._save_cookies(url)
user_agent = headers.get('User-Agent') or self.extractor.get_param('http_headers')['User-Agent']
jscode = self._TEMPLATE.format_map(self.options.new_child({
'url': url,
'ua': user_agent.replace('"', '\\"'),
'jscode': jscode,
}))
stdout = self.execute(jscode, video_id, note=note2)
with open(self._TMP_FILES['html'].name, 'rb') as f:
html = f.read().decode('utf-8')
self._load_cookies()
return html, stdout
def execute(self, jscode, video_id=None, *, note='Executing JS'):
"""Execute JS and return stdout"""
if 'phantom.exit();' not in jscode:
jscode += ';\nphantom.exit();'
jscode = self._BASE_JS + jscode
with open(self._TMP_FILES['script'].name, 'w', encoding='utf-8') as f:
f.write(jscode)
self.extractor.to_screen(f'{format_field(video_id, None, "%s: ")}{note}')
cmd = [self.exe, '--ssl-protocol=any', self._TMP_FILES['script'].name]
self.extractor.write_debug(f'PhantomJS command line: {shell_quote(cmd)}')
try:
stdout, stderr, returncode = Popen.run(cmd, timeout=self.options['timeout'] / 1000,
text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
except Exception as e:
raise ExtractorError(f'{note} failed: Unable to run PhantomJS binary', cause=e)
if returncode:
raise ExtractorError(f'{note} failed with returncode {returncode}:\n{stderr.strip()}')
return stdout

View File

@ -5,7 +5,7 @@
 import re
 from .common import InfoExtractor
-from .openload import PhantomJSwrapper
+from ..jsinterp import PhantomJSwrapper
 from ..networking import Request
 from ..networking.exceptions import HTTPError
 from ..utils import (

View File

@ -6,7 +6,7 @@
 from .common import InfoExtractor
 from .periscope import PeriscopeBaseIE, PeriscopeIE
-from ..jsinterp import js_number_to_string
+from ..jsinterp.native import js_number_to_string
 from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,

View File

@ -25,8 +25,7 @@
 )
 from .pot._director import initialize_pot_director
 from .pot.provider import PoTokenContext, PoTokenRequest
-from ..openload import PhantomJSwrapper
-from ...jsinterp import JSInterpreter
+from ...jsinterp import JSInterpreter, PhantomJSwrapper
 from ...networking.exceptions import HTTPError
 from ...utils import (
     NO_DEFAULT,

View File

@ -15,6 +15,7 @@ def __repr__(self, /):
 postprocessors = Indirect({})
 extractors = Indirect({})
+jsi_runtimes = Indirect({})
 # Plugins
 all_plugins_loaded = Indirect(False)
@ -23,7 +24,9 @@ def __repr__(self, /):
 plugin_ies = Indirect({})
 plugin_pps = Indirect({})
+plugin_jsis = Indirect({})
 plugin_ies_overrides = Indirect(defaultdict(list))
+plugin_jsis_overrides = Indirect(defaultdict(list))
 # Misc
 IN_CLI = Indirect(False)

View File

@ -0,0 +1,27 @@
# flake8: noqa: F401
from .native import JSInterpreter
from .common import _JSI_PREFERENCES, JSIWrapper
from ._phantomjs import PhantomJSJSI, PhantomJSwrapper
from ._deno import DenoJSI, DenoJSDomJSI
from ..globals import jsi_runtimes, plugin_jsis
from ..plugins import PluginSpec, register_plugin_spec
jsi_runtimes.value.update({
name: value
for name, value in globals().items()
if name.endswith('JSI')
})
register_plugin_spec(PluginSpec(
module_name='jsinterp',
suffix='JSI',
destination=jsi_runtimes,
plugin_destination=plugin_jsis,
))
__all__ = [
'JSInterpreter',
'PhantomJSwrapper',
'_JSI_PREFERENCES',
'JSIWrapper',
]

yt_dlp/jsinterp/_deno.py Normal file (204 lines)
View File

@ -0,0 +1,204 @@
from __future__ import annotations
import http.cookiejar
import json
import platform
import re
import subprocess
import typing
import urllib.parse
from ..utils import (
ExtractorError,
Popen,
int_or_none,
shell_quote,
unified_timestamp,
)
from ._helper import TempFileWrapper, random_string, override_navigator_js, extract_script_tags
from .common import ExternalJSI
class DenoJSI(ExternalJSI):
"""JS interpreter class using Deno binary"""
_BASE_PREFERENCE = 5
_EXE_NAME = 'deno'
_DENO_FLAGS = ['--cached-only', '--no-prompt', '--no-check']
_INIT_SCRIPT = 'localStorage.clear(); delete window.Deno; global = window = globalThis;\n'
def __init__(self, *args, flags=[], replace_flags=False, init_script=None, **kwargs):
super().__init__(*args, **kwargs)
self._flags = flags if replace_flags else [*self._DENO_FLAGS, *flags]
self._init_script = self._INIT_SCRIPT if init_script is None else init_script
@property
def _override_navigator_js(self):
return override_navigator_js(self.user_agent)
def _run_deno(self, cmd):
self.write_debug(f'Deno command line: {shell_quote(cmd)}')
try:
stdout, stderr, returncode = Popen.run(
cmd, timeout=self.timeout, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
except Exception as e:
raise ExtractorError('Unable to run Deno binary', cause=e)
if returncode:
raise ExtractorError(f'Failed with returncode {returncode}:\n{stderr}')
elif stderr:
self.report_warning(f'JS console error msg:\n{stderr.strip()}')
return stdout.strip()
def execute(self, jscode, video_id=None, note='Executing JS in Deno'):
self.report_note(video_id, note)
location_args = ['--location', self._url] if self._url else []
with TempFileWrapper(f'{self._init_script};\n{self._override_navigator_js}\n{jscode}', suffix='.js') as js_file:
cmd = [self.exe, 'run', *self._flags, *location_args, js_file.name]
return self._run_deno(cmd)
class DenoJSDomJSI(DenoJSI):
_BASE_PREFERENCE = 4
_DENO_FLAGS = ['--cached-only', '--no-prompt', '--no-check']
_JSDOM_VERSION = None
_JSDOM_URL = 'https://esm.sh/v135/jsdom' # force use esm v135, see esm-dev/esm.sh #1034
@staticmethod
def serialize_cookie(cookiejar: YoutubeDLCookieJar | None, url: str):
"""serialize netscape-compatible fields from cookiejar for tough-cookie loading"""
# JSDOM use tough-cookie as its CookieJar https://github.com/jsdom/jsdom/blob/main/lib/api.js
# tough-cookie use Cookie.fromJSON and Cookie.toJSON for cookie serialization
# https://github.com/salesforce/tough-cookie/blob/master/lib/cookie/cookie.ts
if not cookiejar:
return json.dumps({'cookies': []})
cookies: list[http.cookiejar.Cookie] = list(cookiejar.get_cookies_for_url(url))
return json.dumps({'cookies': [{
'key': cookie.name,
'value': cookie.value,
# leading dot of domain must be removed, otherwise will fail to match
'domain': cookie.domain.lstrip('.') or urllib.parse.urlparse(url).hostname,
'expires': int_or_none(cookie.expires, invscale=1000),
'hostOnly': not cookie.domain_initial_dot,
'secure': bool(cookie.secure),
'path': cookie.path,
} for cookie in cookies if cookie.value]})
@staticmethod
def apply_cookies(cookiejar: YoutubeDLCookieJar | None, cookies: list[dict]):
"""apply cookies from serialized tough-cookie"""
# see serialize_cookie
if not cookiejar:
return
for cookie_dict in cookies:
if not all(cookie_dict.get(k) for k in ('key', 'value', 'domain')):
continue
if cookie_dict.get('hostOnly'):
cookie_dict['domain'] = cookie_dict['domain'].lstrip('.')
else:
cookie_dict['domain'] = '.' + cookie_dict['domain'].lstrip('.')
cookiejar.set_cookie(http.cookiejar.Cookie(
0, cookie_dict['key'], cookie_dict['value'],
None, False,
cookie_dict['domain'], True, not cookie_dict.get('hostOnly'),
cookie_dict.get('path', '/'), True,
bool(cookie_dict.get('secure')),
unified_timestamp(cookie_dict.get('expires')),
False, None, None, {}))
def _ensure_jsdom(self):
if self._JSDOM_VERSION:
return
# `--allow-import` is unsupported in v1, and esm.sh:443 is default allowed remote host for v2
result = self._run_deno([self.exe, 'info', self._JSDOM_URL])
version_line = next((line for line in result.splitlines() if self._JSDOM_URL in line), '')
if m := re.search(r'@([\d\.]+)', version_line):
self._JSDOM_VERSION = m[1]
def report_version(self):
super().report_version()
self._ensure_jsdom()
self.write_debug(f'JSDOM lib version {self._JSDOM_VERSION}')
def execute(self, jscode, video_id=None, note='Executing JS in Deno with jsdom', html='', cookiejar=None):
self.report_note(video_id, note)
self._ensure_jsdom()
if cookiejar and not self._url:
self.report_warning('No valid url scope provided, cookiejar is not applied')
cookiejar = None
html, inline_scripts = extract_script_tags(html)
wrapper_scripts = '\n'.join(['try { %s } catch (e) {}' % script for script in inline_scripts])
callback_varname = f'__callback_{random_string()}'
script = f'''{self._init_script};
import jsdom from "{self._JSDOM_URL}";
let {callback_varname} = (() => {{
const jar = jsdom.CookieJar.deserializeSync({json.dumps(self.serialize_cookie(cookiejar, self._url))});
const dom = new jsdom.JSDOM({json.dumps(str(html))}, {{
{'url: %s,' % json.dumps(str(self._url)) if self._url else ''}
cookieJar: jar,
pretendToBeVisual: true,
}});
Object.keys(dom.window).filter(key => !['atob', 'btoa', 'crypto', 'location'].includes(key))
.filter(key => !(window.location? [] : ['sessionStorage', 'localStorage']).includes(key))
.forEach((key) => {{
try {{window[key] = dom.window[key]}} catch (e) {{ console.error(e) }}
}});
{self._override_navigator_js};
window.screen = {{
availWidth: 1920,
availHeight: 1040,
width: 1920,
height: 1080,
colorDepth: 24,
isExtended: true,
onchange: null,
orientation: {{angle: 0, type: 'landscape-primary', onchange: null}},
pixelDepth: 24,
}}
Object.defineProperty(document.body, 'clientWidth', {{value: 1903}});
Object.defineProperty(document.body, 'clientHeight', {{value: 2000}});
document.domain = location?.hostname;
delete window.jsdom;
const origLog = console.log;
console.log = () => {{}};
console.info = () => {{}};
return () => {{
const stdout = [];
console.log = (...msg) => stdout.push(msg.map(m => '' + m).join(' '));
return () => {{ origLog(JSON.stringify({{
stdout: stdout.join('\\n'), cookies: jar.serializeSync().cookies}})); }}
}}
}})();
{wrapper_scripts}
{callback_varname} = {callback_varname}(); // begin to capture console.log
try {{
{jscode}
}} finally {{
{callback_varname}();
}}
'''
# https://github.com/prebuild/node-gyp-build/blob/6822ec5/node-gyp-build.js#L196-L198
# This jsdom dependency raises fatal error on linux unless read for this file is allowed
read_flag = ['--allow-read=/etc/alpine-release'] if platform.system() == 'Linux' else []
location_args = ['--location', self._url] if self._url else []
with TempFileWrapper(script, suffix='.js') as js_file:
cmd = [self.exe, 'run', *self._flags, *read_flag, *location_args, js_file.name]
result = self._run_deno(cmd)
try:
data = json.loads(result)
except json.JSONDecodeError as e:
raise ExtractorError(f'Failed to parse JSON output from Deno: {result}', cause=e)
self.apply_cookies(cookiejar, data['cookies'])
return data['stdout']
if typing.TYPE_CHECKING:
from ..cookies import YoutubeDLCookieJar
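A standalone sketch of driving DenoJSI directly, mirroring test_jsi_external.py (it assumes a `deno` binary on PATH, and that a regular YoutubeDL instance is an acceptable downloader argument, as FakeYDL is in the tests):

```python
# Sketch: run a JS snippet in the Deno runtime scoped to a URL, as the tests do.
from yt_dlp import YoutubeDL
from yt_dlp.jsinterp import DenoJSI

with YoutubeDL() as ydl:
    jsi = DenoJSI(ydl, 'https://example.com/123/456', 10)  # (downloader, url scope, timeout)
    if jsi.is_available():                                  # False if deno is not installed
        print(jsi.execute('console.log(location.hostname);'))  # -> example.com
```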

yt_dlp/jsinterp/_helper.py Normal file (135 lines)
View File

@ -0,0 +1,135 @@
from __future__ import annotations
import contextlib
import json
import os
import random
import re
import string
import tempfile
class TempFileWrapper:
"""
Wrapper for NamedTemporaryFile, auto closes file after io and deletes file upon wrapper object gc
@param {str | bytes | None} content: content to write to file upon creation
@param {bool} text: whether to open file in text mode
@param {str} encoding: encoding to use for text mode
@param {str | None} suffix: suffix for filename of temporary file
"""
def __init__(self, content: str | bytes | None = None, text: bool = True,
encoding='utf-8', suffix: str | None = None):
self.encoding = None if not text else encoding
self.text = text
self._file = tempfile.NamedTemporaryFile('w' if text else 'wb', encoding=self.encoding,
suffix=suffix, delete=False)
if content:
self._file.write(content)
self._file.close()
@property
def name(self):
return self._file.name
@contextlib.contextmanager
def opened_file(self, mode, *, seek=None, seek_whence=0):
mode = mode if (self.text or 'b' in mode) else mode + 'b'
with open(self._file.name, mode, encoding=self.encoding) as f:
if seek is not None:
self._file.seek(seek, seek_whence)
yield f
def write(self, s, seek=None, seek_whence=0):
"""re-open file in write mode and write, optionally seek to position first"""
with self.opened_file('w', seek=seek, seek_whence=seek_whence) as f:
return f.write(s)
def append_write(self, s, seek=None, seek_whence=0):
"""re-open file in append mode and write, optionally seek to position first"""
with self.opened_file('a', seek=seek, seek_whence=seek_whence) as f:
return f.write(s)
def read(self, n=-1, seek=None, seek_whence=0):
"""re-open file and read, optionally seek to position first"""
with self.opened_file('r', seek=seek, seek_whence=seek_whence) as f:
return f.read(n)
def cleanup(self):
with contextlib.suppress(OSError):
os.remove(self._file.name)
def __del__(self):
self.cleanup()
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, traceback):
self.cleanup()
def random_string(length: int = 10) -> str:
return ''.join(random.choices(string.ascii_letters, k=length))
def override_navigator_js(user_agent: str) -> str:
"""Generate js snippet to override navigator properties based on user_agent string"""
return '\n'.join([
'Object.defineProperty(navigator, "%s", { value: %s, configurable: true });' % (k, json.dumps(v))
for k, v in {
'userAgent': user_agent,
'language': 'en-US',
'languages': ['en-US'],
'webdriver': False,
'cookieEnabled': True,
'appCodeName': user_agent.split('/', maxsplit=1)[0],
'appName': 'Netscape',
'appVersion': user_agent.split('/', maxsplit=1)[-1],
'platform': 'Win32',
'product': 'Gecko',
'productSub': '20030107',
'vendor': 'Google Inc.',
'vendorSub': '',
'onLine': True,
}.items()
])
def extract_script_tags(html: str) -> tuple[str, list[str]]:
script_indicies = []
inline_scripts = []
for match_start in re.finditer(r'<script[^>]*>', html, re.DOTALL | re.IGNORECASE):
end = html.find('</script>', match_start.end())
if end > match_start.end():
script_indicies.append((match_start.start(), end + len('</script>')))
inline_scripts.append(html[match_start.end():end])
for start, end in script_indicies:
html = html[:start] + html[end:]
return html, inline_scripts
def prepare_wasm_jsmodule(js_mod: str, wasm: bytes) -> str:
"""
Sanitize js wrapper module generated by rust wasm-pack for wasm init
Removes export and import.meta, and inlines wasm binary as Uint8Array
See test/test_data/jsi_external/hello_wasm.js for example
@param {str} js_mod: js wrapper module generated by rust wasm-pack
@param {bytes} wasm: wasm binary
"""
js_mod = re.sub(r'export(?:\s+default)?([\s{])', r'\1', js_mod)
js_mod = js_mod.replace('import.meta', '{}')
return js_mod + ''';
await (async () => {
const t = __wbg_get_imports();
__wbg_init_memory(t);
const {module, instance} = await WebAssembly.instantiate(Uint8Array.from(%s), t);
__wbg_finalize_init(instance, module);
})();
''' % list(wasm)
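For reference, the wasm helper above is exercised like this in test_jsi_external.py; a condensed sketch (the paths are the test-data files shipped with this commit, and an available Deno runtime is assumed):

```python
# Sketch mirroring TestExternalJSI.test_wasm: inline the wasm binary into the
# wasm-pack JS wrapper via prepare_wasm_jsmodule(), then run it in a JSI runtime.
import os

from yt_dlp import YoutubeDL
from yt_dlp.jsinterp import DenoJSI
from yt_dlp.jsinterp._helper import prepare_wasm_jsmodule

testdata = os.path.join('test', 'testdata', 'jsi_external')
with open(os.path.join(testdata, 'hello_wasm.js')) as f:
    js_mod = f.read()
with open(os.path.join(testdata, 'hello_wasm_bg.wasm'), 'rb') as f:
    wasm = f.read()

js_code = prepare_wasm_jsmodule(js_mod, wasm) + ';\nconsole.log(add(1, 2));\ngreet("world");'
with YoutubeDL() as ydl:
    print(DenoJSI(ydl, '', 10).execute(js_code))  # expected: 3 then "Hello, world!"
```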

View File

@ -0,0 +1,262 @@
from __future__ import annotations
import contextlib
import http.cookiejar
import json
import subprocess
import typing
import urllib.parse
from ..utils import (
ExtractorError,
Popen,
filter_dict,
int_or_none,
is_outdated_version,
shell_quote,
)
from ._helper import TempFileWrapper, random_string, extract_script_tags
from .common import ExternalJSI
class PhantomJSJSI(ExternalJSI):
_EXE_NAME = 'phantomjs'
_BASE_PREFERENCE = 3
_BASE_JS = R'''
phantom.onError = function(msg, trace) {{
var msgStack = ['PHANTOM ERROR: ' + msg];
if(trace && trace.length) {{
msgStack.push('TRACE:');
trace.forEach(function(t) {{
msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line
+ (t.function ? ' (in function ' + t.function +')' : ''));
}});
}}
console.error(msgStack.join('\n'));
phantom.exit(1);
}};
'''
_TEMPLATE = R'''
var page = require('webpage').create();
var fs = require('fs');
var read = {{ mode: 'r', charset: 'utf-8' }};
var write = {{ mode: 'w', charset: 'utf-8' }};
page.settings.resourceTimeout = {timeout};
page.settings.userAgent = {ua};
page.onLoadStarted = function() {{
page.evaluate(function() {{
delete window._phantom;
delete window.callPhantom;
}});
}};
var saveAndExit = function() {{
fs.write({html_fn}, page.content, write);
fs.write({cookies_fn}, JSON.stringify(phantom.cookies), write);
phantom.exit();
}};
var loaded = false;
page.onLoadFinished = function(status) {{
if(page.url === "" && !loaded) {{
page.setContent(fs.read({html_fn}, read), {url});
loaded = true;
}}
else {{
JSON.parse(fs.read({cookies_fn}, read)).forEach(function(x) {{
phantom.addCookie(x);
}});
{jscode}
}}
}};
page.open("");
'''
def _save_cookies(self, url, cookiejar: YoutubeDLCookieJar | None):
def _cookie_to_dict(cookie: http.cookiejar.Cookie):
cookie_dict = {
'name': cookie.name,
'value': cookie.value,
'port': cookie.port,
'domain': cookie.domain,
'path': cookie.path or '/',
'expires': int_or_none(cookie.expires, invscale=1000),
'secure': cookie.secure,
'discard': cookie.discard,
}
if not cookie_dict['domain']:
cookie_dict['domain'] = urllib.parse.urlparse(url).hostname
cookie_dict['port'] = urllib.parse.urlparse(url).port
with contextlib.suppress(TypeError):
if (cookie.has_nonstandard_attr('httpOnly')
or cookie.has_nonstandard_attr('httponly')
or cookie.has_nonstandard_attr('HttpOnly')):
cookie_dict['httponly'] = True
return filter_dict(cookie_dict)
cookies = cookiejar.get_cookies_for_url(url) if cookiejar else []
return json.dumps([_cookie_to_dict(cookie) for cookie in cookies])
def _load_cookies(self, cookies_json: str, cookiejar: YoutubeDLCookieJar | None):
if not cookiejar:
return
cookies = json.loads(cookies_json)
for cookie in cookies:
cookiejar.set_cookie(http.cookiejar.Cookie(
0, cookie['name'], cookie['value'], cookie.get('port'), cookie.get('port') is not None,
cookie['domain'], True, cookie['domain'].startswith('.'),
cookie.get('path', '/'), True,
cookie.get('secure', False), cookie.get('expiry'),
cookie.get('discard', False), None, None,
{'httpOnly': None} if cookie.get('httponly') is True else {},
))
def _execute(self, jscode: str, video_id=None, *, note='Executing JS in PhantomJS'):
"""Execute JS and return stdout"""
if 'phantom.exit();' not in jscode:
jscode += ';\nphantom.exit();'
jscode = self._BASE_JS + jscode
self.report_note(video_id, note)
with TempFileWrapper(jscode, suffix='.js') as js_file:
cmd = [self.exe, '--ssl-protocol=any', js_file.name]
self.write_debug(f'PhantomJS command line: {shell_quote(cmd)}')
try:
stdout, stderr, returncode = Popen.run(
cmd, timeout=self.timeout, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
except Exception as e:
raise ExtractorError(f'{note} failed: Unable to run PhantomJS binary', cause=e)
if returncode:
raise ExtractorError(f'{note} failed with returncode {returncode}:\n{stderr.strip()}')
return stdout
def _execute_html(self, jscode: str, url: str, html: str, cookiejar, video_id=None, note='Executing JS on webpage'):
if 'saveAndExit();' not in jscode:
raise ExtractorError('`saveAndExit();` not found in `jscode`')
if cookiejar and not url:
self.report_warning('No valid url scope provided, cookiejar is not applied')
cookiejar = None
html, inline_scripts = extract_script_tags(html)
wrapped_scripts = '\n'.join([
'page.evaluate(function() { try { %s } catch (e) {} });' % inline for inline in inline_scripts])
html_file = TempFileWrapper(html, suffix='.html')
cookie_file = TempFileWrapper(self._save_cookies(url, cookiejar), suffix='.json')
script = self._TEMPLATE.format_map({
'url': json.dumps(str(url)),
'ua': json.dumps(str(self.user_agent)),
'jscode': f'{wrapped_scripts}\n{jscode}',
'html_fn': json.dumps(html_file.name),
'cookies_fn': json.dumps(cookie_file.name),
'timeout': int(self.timeout * 1000),
})
stdout = self._execute(script, video_id, note=note)
self._load_cookies(cookie_file.read(), cookiejar)
new_html = html_file.read()
return new_html, stdout
def execute(self, jscode, video_id=None, note='Executing JS in PhantomJS', html='', cookiejar=None):
jscode = '''console.log(page.evaluate(function() {
var %(std_var)s = [];
console.log = function() {
var values = '';
for (var i = 0; i < arguments.length; i++) {
values += arguments[i] + ' ';
}
%(std_var)s.push(values);
}
%(jscode)s;
return %(std_var)s.join('\\n');
}));
saveAndExit();''' % {
'std_var': f'__stdout__values_{random_string()}',
'jscode': jscode,
}
return self._execute_html(jscode, self._url, html, cookiejar, video_id=video_id, note=note)[1].strip()
class PhantomJSwrapper:
"""PhantomJS wrapper class
This class is experimental.
"""
INSTALL_HINT = 'Please download PhantomJS from https://phantomjs.org/download.html'
@classmethod
def _version(cls):
return PhantomJSJSI.exe_version
def __init__(self, extractor: InfoExtractor, required_version=None, timeout=10000):
self._jsi = PhantomJSJSI(extractor._downloader, '', timeout / 1000, {})
if not self._jsi.is_available():
raise ExtractorError(f'PhantomJS not found, {self.INSTALL_HINT}', expected=True)
self.extractor = extractor
if required_version:
if is_outdated_version(self._jsi.exe_version, required_version):
self._jsi.report_warning(
'Your copy of PhantomJS is outdated, update it to version '
f'{required_version} or newer if you encounter any errors.')
def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers={}, jscode='saveAndExit();'):
"""
Downloads webpage (if needed) and executes JS
Params:
url: website url
html: optional, html code of website
video_id: video id
note: optional, displayed when downloading webpage
note2: optional, displayed when executing JS
headers: custom http headers
jscode: code to be executed when page is loaded
Returns tuple with:
* downloaded website (after JS execution)
* anything you print with `console.log` (but not inside `page.execute`!)
In most cases you don't need to add any `jscode`.
It is executed in `page.onLoadFinished`.
`saveAndExit();` is mandatory, use it instead of `phantom.exit()`
It is possible to wait for some element on the webpage, e.g.
var check = function() {
var elementFound = page.evaluate(function() {
return document.querySelector('#b.done') !== null;
});
if(elementFound)
saveAndExit();
else
window.setTimeout(check, 500);
}
page.evaluate(function(){
document.querySelector('#a').click();
});
check();
"""
if 'saveAndExit();' not in jscode:
raise ExtractorError('`saveAndExit();` not found in `jscode`')
if not html:
html = self.extractor._download_webpage(url, video_id, note=note, headers=headers)
self._jsi.user_agent = headers.get('User-Agent') or self.extractor.get_param('http_headers')['User-Agent']
return self._jsi._execute_html(jscode, url, html, self.extractor.cookiejar, video_id=video_id, note=note2)
def execute(self, jscode, video_id=None, *, note='Executing JS in PhantomJS'):
"""Execute JS and return stdout"""
return self._jsi.execute(jscode, video_id=video_id, note=note)
if typing.TYPE_CHECKING:
from ..extractor.common import InfoExtractor
from ..cookies import YoutubeDLCookieJar
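
A hedged sketch of how an extractor might drive `PhantomJSwrapper`, following its `get()` docstring; the extractor class, URL pattern and `#player` selector are made up, and the import path assumes this file is `yt_dlp/jsinterp/_phantomjs.py`:

```python
from yt_dlp.extractor.common import InfoExtractor
from yt_dlp.jsinterp._phantomjs import PhantomJSwrapper  # assumed module path


class HypotheticalIE(InfoExtractor):
    _VALID_URL = r'https?://example\.com/video/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = self._match_id(url)
        phantom = PhantomJSwrapper(self, timeout=10000)
        # render the page and wait for a (hypothetical) '#player' node before saving
        webpage, logs = phantom.get(url, video_id=video_id, jscode='''
            var check = function() {
                if (page.evaluate(function() { return document.querySelector('#player') !== null; }))
                    saveAndExit();
                else
                    window.setTimeout(check, 500);
            };
            check();''')
        return {'id': video_id, 'title': self._og_search_title(webpage)}
```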

315
yt_dlp/jsinterp/common.py Normal file
View File

@ -0,0 +1,315 @@
from __future__ import annotations
import abc
import inspect
import sys
import typing
from ..globals import jsi_runtimes, plugin_jsis_overrides
from ..extractor.common import InfoExtractor
from ..utils import (
classproperty,
format_field,
filter_dict,
get_exe_version,
url_or_none,
sanitize_url,
ExtractorError,
)
_JSI_PREFERENCES: set[JSIPreference] = set()
def get_all_handlers() -> dict[str, type[JSI]]:
return {jsi.JSI_KEY: jsi for jsi in jsi_runtimes.value.values()}
def to_jsi_keys(jsi_or_keys: typing.Iterable[str | type[JSI] | JSI]) -> list[str]:
return [jok if isinstance(jok, str) else jok.JSI_KEY for jok in jsi_or_keys]
def get_included_jsi(only_include=None, exclude=None):
return {
key: value for key, value in get_all_handlers().items()
if (not only_include or key in to_jsi_keys(only_include))
and (not exclude or key not in to_jsi_keys(exclude))
}
def order_to_pref(jsi_order: typing.Iterable[str | type[JSI] | JSI], multiplier: int) -> JSIPreference:
"""convert a list of jsi keys into a preference function"""
jsi_order = reversed(to_jsi_keys(jsi_order))
pref_score = {jsi_cls: (i + 1) * multiplier for i, jsi_cls in enumerate(jsi_order)}
def _pref(jsi: JSI, *args):
return pref_score.get(jsi.JSI_KEY, 0)
return _pref
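# Example: order_to_pref(['Deno', 'PhantomJS'], 100) scores Deno=200 and PhantomJS=100,
# so earlier entries in the order win; keys missing from the list score 0. Key names are illustrative.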
class JSIWrapper:
"""
Helper class to forward JS interp request to a JSI that supports it.
Usage:
```
def _real_extract(self, url):
...
jsi = JSIWrapper(self, url)
result = jsi.execute(jscode, video_id)
...
```
@param dl_or_ie: `YoutubeDL` or `InfoExtractor` instance.
@param url: setting url context
@param only_include: limit JSI to choose from.
@param exclude: JSI to avoid using.
@param jsi_params: extra kwargs to pass to `JSI.__init__()` for each JSI, using jsi key as dict key.
@param preferred_order: list of JSI to try before others. First in list is tried first.
@param timeout: timeout parameter for all chosen JSI
@param user_agent: specify user-agent to use, default to downloader UA
"""
def __init__(
self,
dl_or_ie: YoutubeDL | InfoExtractor,
url: str = '',
only_include: typing.Iterable[str | type[JSI]] = [],
exclude: typing.Iterable[str | type[JSI]] = [],
jsi_params: dict[str, dict] = {},
preferred_order: typing.Iterable[str | type[JSI]] = [],
timeout: float | int = 10,
user_agent: str | None = None,
):
if isinstance(dl_or_ie, InfoExtractor):
self._downloader = dl_or_ie._downloader
self._ie_key = dl_or_ie.ie_key()
else:
self._downloader = dl_or_ie
self._ie_key = None
self._url = self._sanitize_url(url)
self.preferences: set[JSIPreference] = {
order_to_pref(self._load_jsi_keys_from_option('jsi_preference'), 10000),
order_to_pref(preferred_order, 100),
} | _JSI_PREFERENCES
handler_classes = self._load_allowed_jsi_cls(only_include, exclude)
if not handler_classes:
            raise ExtractorError('No JSI is allowed to be used')
user_agent = user_agent or self._downloader.params['http_headers']['User-Agent']
self._handler_dict = {cls.JSI_KEY: cls(
self._downloader, url=self._url, timeout=timeout,
user_agent=user_agent, **jsi_params.get(cls.JSI_KEY, {}),
) for cls in handler_classes.values()}
self._is_test = self._downloader.params.get('test', False)
def _sanitize_url(self, url):
sanitized = sanitize_url(url_or_none(url)) or ''
if url and not sanitized:
self.report_warning(f'Invalid URL: "{url}", using empty string instead')
return sanitized
def _load_jsi_keys_from_option(self, option_key):
jsi_keys = self._downloader.params.get(option_key, [])
valid_handlers = list(get_all_handlers())
for invalid_key in [key for key in jsi_keys if key not in valid_handlers]:
self.report_warning(f'{option_key}: `{invalid_key}` is not a valid JSI', only_once=True)
jsi_keys.remove(invalid_key)
return jsi_keys
def _load_allowed_jsi_cls(self, only_include, exclude):
self.write_debug(f'Loaded JSI runtimes: {get_all_handlers()}')
handler_classes = filter_dict(
get_included_jsi(only_include, exclude),
lambda _, v: v.supports_extractor(self._ie_key))
self.write_debug(f'Select JSI {"for " + self._ie_key if self._ie_key else ""}: {to_jsi_keys(handler_classes)}, '
f'included: {to_jsi_keys(only_include) or "all"}, excluded: {to_jsi_keys(exclude)}')
return handler_classes
    def write_debug(self, message, only_once=False):
        return self._downloader.write_debug(f'[JSIWrapper] {message}', only_once=only_once)
    def report_warning(self, message, only_once=False):
        return self._downloader.report_warning(f'[JSIWrapper] {message}', only_once=only_once)
def _get_handlers(self, method_name: str, *args, **kwargs) -> list[JSI]:
def _supports_method_with_params(jsi: JSI):
if not callable(method := getattr(jsi, method_name, None)):
return False
method_params = inspect.signature(method).parameters
return all(key in method_params for key in kwargs)
handlers = [h for h in self._handler_dict.values() if _supports_method_with_params(h)]
self.write_debug(f'Choosing handlers for method `{method_name}` with kwargs {list(kwargs)}'
f': {to_jsi_keys(handlers)}')
if not handlers:
raise ExtractorError(f'No JSI supports method `{method_name}` with kwargs {list(kwargs)}, '
f'included handlers: {to_jsi_keys(self._handler_dict.values())}')
preferences = {
handler.JSI_KEY: sum(pref_func(handler, method_name, args, kwargs) for pref_func in self.preferences)
for handler in handlers
}
self.write_debug('JSI preferences for `{}` request: {}'.format(
method_name, ', '.join(f'{key}={pref}' for key, pref in preferences.items())))
return sorted(handlers, key=lambda h: preferences[h.JSI_KEY], reverse=True)
def _dispatch_request(self, method_name: str, *args, **kwargs):
handlers = self._get_handlers(method_name, *args, **kwargs)
unavailable: list[str] = []
exceptions: list[tuple[JSI, Exception]] = []
for handler in handlers:
if not handler.is_available():
if self._is_test:
raise ExtractorError(f'{handler.JSI_NAME} is not available for testing, '
f'add "{handler.JSI_KEY}" in `exclude` if it should not be used')
self.write_debug(f'{handler.JSI_KEY} is not available')
unavailable.append(handler.JSI_NAME)
continue
try:
self.write_debug(f'Dispatching `{method_name}` task to {handler.JSI_NAME}')
handler.report_version()
return getattr(handler, method_name)(*args, **kwargs)
except ExtractorError as e:
if self._is_test:
raise ExtractorError(f'{handler.JSI_NAME} got error while evaluating js, '
f'add "{handler.JSI_KEY}" in `exclude` if it should not be used')
exceptions.append((handler, e))
self.write_debug(f'{handler.JSI_NAME} encountered error, fallback to next handler: {e}')
if not exceptions:
msg = f'No available JSI installed, please install one of: {", ".join(unavailable)}'
else:
            msg = f'Failed to perform {method_name}, encountered {len(exceptions)} error(s)'
if unavailable:
msg = f'{msg}. You may try installing one of unavailable JSI: {", ".join(unavailable)}'
raise ExtractorError(msg)
def execute(self, jscode: str, video_id: str | None, note: str | None = None,
html: str | None = None, cookiejar: YoutubeDLCookieJar | None = None) -> str:
"""
Execute JS code and return stdout from console.log
@param jscode: JS code to execute
@param video_id
@param note
@param html: html to load as document
@param cookiejar: cookiejar to read and set cookies, pass `InfoExtractor.cookiejar` if you want to read and write cookies
"""
return self._dispatch_request('execute', jscode, video_id, **filter_dict({
'note': note, 'html': html, 'cookiejar': cookiejar}))
class JSI(abc.ABC):
_BASE_PREFERENCE: int = 0
def __init__(self, downloader: YoutubeDL, url: str, timeout: float | int, user_agent=None):
self._downloader = downloader
self._url = url
self.timeout = timeout
self.user_agent: str = user_agent or self._downloader.params['http_headers']['User-Agent']
@classmethod
def __init_subclass__(cls, *, plugin_name=None, **kwargs):
if plugin_name:
mro = inspect.getmro(cls)
next_mro_class = super_class = mro[mro.index(cls) + 1]
while getattr(super_class, '__wrapped__', None):
super_class = super_class.__wrapped__
if not any(override.PLUGIN_NAME == plugin_name for override in plugin_jsis_overrides.value[super_class]):
cls.__wrapped__ = next_mro_class
cls.PLUGIN_NAME, cls.JSI_KEY = plugin_name, next_mro_class.JSI_KEY
cls.JSI_NAME = f'{next_mro_class.JSI_NAME}+{plugin_name}'
setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
                # additionally update jsi_runtimes, since JSIs are not lazily loaded the way extractors are
jsi_runtimes.value[super_class.JSI_KEY] = cls
plugin_jsis_overrides.value[super_class].append(cls)
return super().__init_subclass__(**kwargs)
@abc.abstractmethod
def is_available(self) -> bool:
raise NotImplementedError
def write_debug(self, msg, *args, **kwargs):
self._downloader.write_debug(f'[{self.JSI_NAME}] {msg}', *args, **kwargs)
def report_warning(self, msg, *args, **kwargs):
self._downloader.report_warning(f'[{self.JSI_NAME}] {msg}', *args, **kwargs)
def to_screen(self, msg, *args, **kwargs):
self._downloader.to_screen(f'[{self.JSI_NAME}] {msg}', *args, **kwargs)
def report_note(self, video_id, note):
self.to_screen(f'{format_field(video_id, None, "%s: ")}{note}')
def report_version(self):
return
@classmethod
def supports_extractor(cls, ie_key: str):
return True
@classproperty
def JSI_NAME(cls) -> str:
return cls.__name__[:-3]
@classproperty
def JSI_KEY(cls) -> str:
assert cls.__name__.endswith('JSI'), 'JSI class names must end with "JSI"'
return cls.__name__[:-3]
class ExternalJSI(JSI, abc.ABC):
_EXE_NAME: str
@classproperty(cache=True)
def exe_version(cls):
return get_exe_version(cls._EXE_NAME, args=getattr(cls, 'V_ARGS', ['--version']), version_re=r'([0-9.]+)')
@classproperty
def exe(cls):
return cls._EXE_NAME if cls.exe_version else None
@classmethod
def is_available(cls):
return bool(cls.exe)
def report_version(self):
self.write_debug(f'{self._EXE_NAME} version {self.exe_version}')
def register_jsi_preference(*handlers: type[JSI]):
    assert all(issubclass(handler, JSI) for handler in handlers), f'{handlers} must all be subclasses of JSI'
def outer(pref_func: JSIPreference) -> JSIPreference:
def inner(handler: JSI, *args):
if not handlers or isinstance(handler, handlers):
return pref_func(handler, *args)
return 0
_JSI_PREFERENCES.add(inner)
return inner
return outer
@register_jsi_preference()
def _base_preference(handler: JSI, *args):
return min(10, getattr(handler, '_BASE_PREFERENCE', 0))
if typing.TYPE_CHECKING:
from ..YoutubeDL import YoutubeDL
from ..cookies import YoutubeDLCookieJar
class JSIPreference(typing.Protocol):
def __call__(self, handler: JSI, method_name: str, *args, **kwargs) -> int:
...
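
A hedged sketch of the dispatcher API above; the JSI key `'Deno'` and the extractor context are illustrative, while the module path `yt_dlp.jsinterp.common` matches the file header above:

```python
from yt_dlp.jsinterp.common import JSI, JSIWrapper, register_jsi_preference


def solve_challenge(ie, url, video_id, challenge_js):
    # `ie` is an InfoExtractor instance; 'Deno' is an assumed runtime key
    wrapper = JSIWrapper(ie, url, preferred_order=['Deno'], timeout=20)
    # dispatches to the highest-scoring available JSI and falls back on failure
    return wrapper.execute(challenge_js, video_id, note='Solving JS challenge')


@register_jsi_preference()
def _prefer_external_binaries(handler: JSI, method_name, *args):
    # illustrative preference: nudge external-binary runtimes ahead of the rest
    return 5 if getattr(handler, '_EXE_NAME', None) else 0
```

End users can impose their own ordering with the `--jsi-preference` option added in the options.py hunk below (for example `--jsi-preference Deno,PhantomJS`), which is what `_load_jsi_keys_from_option('jsi_preference')` reads above.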

View File

@ -6,7 +6,7 @@
import operator
import re
-from .utils import (
+from ..utils import (
    NO_DEFAULT,
    ExtractorError,
    function_with_repr,

View File

@ -1192,6 +1192,11 @@ def _preset_alias_callback(option, opt_str, value, parser):
        '--sleep-subtitles', metavar='SECONDS',
        dest='sleep_interval_subtitles', default=0, type=int,
        help='Number of seconds to sleep before each subtitle download')
+    workarounds.add_option(
+        '--jsi-preference',
+        metavar='JSI', dest='jsi_preference', default=[], type='str', action='callback',
+        callback=_list_from_options_callback,
+        help='Preferred JS interpreters to use during extraction. Can be given as comma-separated values.')
    verbosity = optparse.OptionGroup(parser, 'Verbosity and Simulation Options')
    verbosity.add_option(