diff --git a/README.md b/README.md index 0f9a7d556..ce1a09313 100644 --- a/README.md +++ b/README.md @@ -213,7 +213,7 @@ ### Metadata ### Misc * [**pycryptodomex**](https://github.com/Legrandin/pycryptodome)\* - For decrypting AES-128 HLS streams and various other data. Licensed under [BSD-2-Clause](https://github.com/Legrandin/pycryptodome/blob/master/LICENSE.rst) -* [**phantomjs**](https://github.com/ariya/phantomjs) - Used in extractors where javascript needs to be run. Licensed under [BSD-3-Clause](https://github.com/ariya/phantomjs/blob/master/LICENSE.BSD) +* [**phantomjs**](https://github.com/ariya/phantomjs), [**deno**](https://github.com/denoland/deno/) - Used in extractors where javascript needs to be run. Licensed under [BSD-3-Clause](https://github.com/ariya/phantomjs/blob/master/LICENSE.BSD) and [MIT](https://github.com/denoland/deno/blob/main/LICENSE.md) respectively * [**secretstorage**](https://github.com/mitya57/secretstorage)\* - For `--cookies-from-browser` to access the **Gnome** keyring while decrypting cookies of **Chromium**-based browsers on **Linux**. Licensed under [BSD-3-Clause](https://github.com/mitya57/secretstorage/blob/master/LICENSE) * Any external downloader that you want to use with `--downloader` @@ -798,6 +798,9 @@ ## Workarounds: be used along with --min-sleep-interval --sleep-subtitles SECONDS Number of seconds to sleep before each subtitle download + --jsi-preference JSI Preferred JS interpreters to use during + extraction. 
Can be given as comma-separated + values ## Video Format Options: -f, --format FORMAT Video format code, see "FORMAT SELECTION" diff --git a/test/test_download.py b/test/test_download.py index 3f36869d9..8bc5658ef 100755 --- a/test/test_download.py +++ b/test/test_download.py @@ -25,12 +25,14 @@ import yt_dlp.YoutubeDL # isort: split from yt_dlp.extractor import get_info_extractor +from yt_dlp.jsinterp.common import get_included_jsi from yt_dlp.networking.exceptions import HTTPError, TransportError from yt_dlp.utils import ( DownloadError, ExtractorError, UnavailableVideoError, YoutubeDLError, + filter_dict, format_bytes, join_nonempty, ) @@ -82,6 +84,29 @@ def __str__(self): # Dynamically generate tests def generator(test_case, tname): + + # setting `jsi_matrix` to True, or `jsi_matrix_only_include`, `jsi_matrix_exclude` to non-empty list + # to trigger matrix behavior for JSI + if any(test_case.get(key) for key in [ + 'jsi_matrix', 'jsi_matrix_only_include', 'jsi_matrix_exclude', + ]): + jsi_keys = list(get_included_jsi(only_include=test_case.get('jsi_matrix_only_include'), + exclude=test_case.get('jsi_matrix_exclude'))) + + # use jsi_preference here, instead of force blocking other jsi runtimes + # exclusion, if needed, should be specified in test case to optimize testing + def generate_jsi_sub_case(jsi_key): + sub_case = filter_dict(test_case, lambda k, _: not k.startswith('jsi_matrix')) + sub_case['params'] = {**test_case.get('params', {}), 'jsi_preference': [jsi_key]} + return generator(sub_case, f'{tname}_{jsi_key}') + + def run_sub_cases(self): + for i, jsi_key in enumerate(jsi_keys): + with self.subTest(jsi_key): + print(f'Running case {tname} using JSI: {jsi_key} ({i + 1}/{len(jsi_keys)})') + generate_jsi_sub_case(jsi_key)(self) + return run_sub_cases + def test_template(self): if self.COMPLETED_TESTS.get(tname): return diff --git a/test/test_jsi_external.py b/test/test_jsi_external.py new file mode 100644 index 000000000..1a7793e48 --- /dev/null +++ 
b/test/test_jsi_external.py @@ -0,0 +1,206 @@ +#!/usr/bin/env python3 + +from __future__ import annotations +import os +import dataclasses +import datetime +import time +import sys +import unittest +import http.cookiejar +import functools +import typing + + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +from test.helper import FakeYDL +from yt_dlp.cookies import YoutubeDLCookieJar +from yt_dlp.jsinterp.common import get_included_jsi +from yt_dlp.jsinterp._helper import prepare_wasm_jsmodule + +if typing.TYPE_CHECKING: + from yt_dlp.jsinterp.common import JSI + + +@dataclasses.dataclass +class NetscapeFields: + name: str + value: str + domain: str + path: str + secure: bool + expires: int | None + + def to_cookie(self): + return http.cookiejar.Cookie( + 0, self.name, self.value, + None, False, + self.domain, True, self.domain.startswith('.'), + self.path, True, + self.secure, self.expires, False, + None, None, {}, + ) + + def expire_str(self): + return datetime.datetime.fromtimestamp( + self.expires, datetime.timezone.utc).strftime('%a, %d %b %Y %H:%M:%S GMT') + + def __eq__(self, other: NetscapeFields | http.cookiejar.Cookie): + return all(getattr(self, attr) == getattr(other, attr) for attr in ['name', 'value', 'domain', 'path', 'secure', 'expires']) + + +def use_jsi_rumtimes(exclude=[]): + def inner(func: typing.Callable[[unittest.TestCase, type[JSI]], None]): + @functools.wraps(func) + def wrapper(self: unittest.TestCase): + for key, jsi in get_included_jsi(exclude=exclude).items(): + def wrapped_jsi_with_unavaliable_auto_skip(*args, **kwargs): + if getattr(jsi, 'TEST_DATA_PLUGIN', False): + self.skipTest('Testdata plugin') + instance = jsi(*args, **kwargs) + if not instance.is_available(): + self.skipTest(f'{key} is not available') + return instance + + with self.subTest(key): + func(self, wrapped_jsi_with_unavaliable_auto_skip) + return wrapper + return inner + + +class TestExternalJSI(unittest.TestCase): + _TESTDATA_DIR 
= os.path.join(os.path.dirname(os.path.abspath(__file__)), 'testdata', 'jsi_external') + maxDiff = 2000 + + def setUp(self): + self.ydl = FakeYDL() + + @use_jsi_rumtimes() + def test_execute(self, jsi_cls: type[JSI]): + jsi = jsi_cls(self.ydl, '', 10) + self.assertEqual(jsi.execute('console.log("Hello, world!");'), 'Hello, world!') + + @use_jsi_rumtimes() + def test_user_agent(self, jsi_cls: type[JSI]): + ua = self.ydl.params['http_headers']['User-Agent'] + + jsi = jsi_cls(self.ydl, '', 10) + self.assertEqual(jsi.execute('console.log(navigator.userAgent);'), ua) + self.assertNotEqual(jsi.execute('console.log(JSON.stringify(navigator.webdriver));'), 'true') + + jsi = jsi_cls(self.ydl, '', 10, user_agent='test/ua') + self.assertEqual(jsi.execute('console.log(navigator.userAgent);'), 'test/ua') + + @use_jsi_rumtimes() + def test_location(self, jsi_cls: type[JSI]): + jsi = jsi_cls(self.ydl, 'https://example.com/123/456', 10) + self.assertEqual(jsi.execute('console.log(JSON.stringify([location.href, location.hostname]));'), + '["https://example.com/123/456","example.com"]') + + @use_jsi_rumtimes(exclude=['Deno']) + def test_execute_dom_parse(self, jsi_cls: type[JSI]): + jsi = jsi_cls(self.ydl, '', 10) + self.assertEqual(jsi.execute( + 'console.log(document.getElementById("test-div").innerHTML);', + html='
Hello, world!
'), + 'Hello, world!') + + @use_jsi_rumtimes(exclude=['Deno']) + def test_execute_dom_script(self, jsi_cls: type[JSI]): + jsi = jsi_cls(self.ydl, '', 10) + self.assertEqual(jsi.execute( + 'console.log(document.getElementById("test-div").innerHTML);', + html='''Hello, world! +
+ + + '''), + 'Hello, world!') + + @use_jsi_rumtimes(exclude=['Deno']) + def test_dom_location(self, jsi_cls: type[JSI]): + jsi = jsi_cls(self.ydl, 'https://example.com/123/456', 10) + self.assertEqual(jsi.execute( + 'console.log(document.getElementById("test-div").innerHTML);', + html=''' +
Hello, world!
'''), + 'example.com') + + @use_jsi_rumtimes(exclude=['Deno']) + def test_execute_cookiejar(self, jsi_cls: type[JSI]): + cookiejar = YoutubeDLCookieJar() + ref_cookiejar = YoutubeDLCookieJar() + + def _assert_expected_execute(cookie_str, ref_cookie_str): + self.assertEqual(set(cookie_str.split('; ')), set(ref_cookie_str.split('; '))) + for cookie in cookiejar: + ref_cookie = next((c for c in ref_cookiejar if c.name == cookie.name + and c.domain == cookie.domain), None) + self.assertEqual(repr(cookie), repr(ref_cookie)) + + for test_cookie in [ + NetscapeFields('test1', 'test1', '.example.com', '/', False, int(time.time()) + 1000), + NetscapeFields('test2', 'test2', '.example.com', '/', True, int(time.time()) + 1000), + NetscapeFields('test3', 'test3', '.example.com', '/123', False, int(time.time()) + 1000), + NetscapeFields('test4', 'test4', '.example.com', '/456', False, int(time.time()) + 1000), + NetscapeFields('test5', 'test5', '.example.com', '/123', True, int(time.time()) + 1000), + NetscapeFields('test6', 'test6', '.example.com', '/456', True, int(time.time()) + 1000), + NetscapeFields('test1', 'other1', '.other.com', '/', False, int(time.time()) + 1000), + NetscapeFields('test2', 'other2', '.other.com', '/', False, int(time.time()) + 1000), + NetscapeFields('test7', 'other7', '.other.com', '/', False, int(time.time()) + 1000), + ]: + cookiejar.set_cookie(test_cookie.to_cookie()) + ref_cookiejar.set_cookie(test_cookie.to_cookie()) + + # test identity without modification from js + jsi = jsi_cls(self.ydl, 'http://example.com/123/456', 10) + _assert_expected_execute(jsi.execute( + 'console.log(document.cookie);', cookiejar=cookiejar), + 'test1=test1; test3=test3') + + # test modification of existing cookie from js + new_cookie_1 = NetscapeFields('test1', 'new1', '.example.com', '/', True, int(time.time()) + 900) + new_cookie_2 = NetscapeFields('test2', 'new2', '.example.com', '/', True, int(time.time()) + 900) + 
ref_cookiejar.set_cookie(new_cookie_1.to_cookie()) + ref_cookiejar.set_cookie(new_cookie_2.to_cookie()) + + # change to https url to test secure-domain behavior + jsi = jsi_cls(self.ydl, 'https://example.com/123/456', 10) + _assert_expected_execute(jsi.execute( + f'''document.cookie = "test1=new1; secure; expires={new_cookie_1.expire_str()}; domain=.example.com; path=/"; + console.log(document.cookie);''', + html=f'''
Hello, world!
+ + ''', + cookiejar=cookiejar), + 'test1=new1; test2=new2; test3=test3; test5=test5') + + @use_jsi_rumtimes(exclude=['PhantomJS']) + def test_wasm(self, jsi_cls: type[JSI]): + with open(os.path.join(self._TESTDATA_DIR, 'hello_wasm.js')) as f: + js_mod = f.read() + with open(os.path.join(self._TESTDATA_DIR, 'hello_wasm_bg.wasm'), 'rb') as f: + wasm = f.read() + + js_base = prepare_wasm_jsmodule(js_mod, wasm) + + js_code = js_base + '''; + console.log(add(1, 2)); + greet('world'); + ''' + + jsi = jsi_cls(self.ydl, '', 10) + self.assertEqual(jsi.execute(js_code), '3\nHello, world!') + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 2e3cdc2a5..ef3c68170 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -9,7 +9,7 @@ import math -from yt_dlp.jsinterp import JS_Undefined, JSInterpreter, js_number_to_string +from yt_dlp.jsinterp.native import JS_Undefined, JSInterpreter, js_number_to_string class NaN: diff --git a/test/test_plugins.py b/test/test_plugins.py index 195726b18..8299bfaff 100644 --- a/test/test_plugins.py +++ b/test/test_plugins.py @@ -22,9 +22,11 @@ from yt_dlp.globals import ( extractors, postprocessors, + jsi_runtimes, plugin_dirs, plugin_ies, plugin_pps, + plugin_jsis, all_plugins_loaded, plugin_specs, ) @@ -44,16 +46,24 @@ plugin_destination=plugin_pps, ) +JSI_PLUGIN_SPEC = PluginSpec( + module_name='jsinterp', + suffix='JSI', + destination=jsi_runtimes, + plugin_destination=plugin_jsis, +) + def reset_plugins(): plugin_ies.value = {} plugin_pps.value = {} + plugin_jsis.value = {} plugin_dirs.value = ['default'] plugin_specs.value = {} all_plugins_loaded.value = False # Clearing override plugins is probably difficult for module_name in tuple(sys.modules): - for plugin_type in ('extractor', 'postprocessor'): + for plugin_type in ('extractor', 'postprocessor', 'jsinterp'): if module_name.startswith(f'{PACKAGE_NAME}.{plugin_type}.'): del sys.modules[module_name] @@ -108,6 
+118,17 @@ def test_postprocessor_classes(self): self.assertIn(f'{PACKAGE_NAME}.postprocessor.normal', sys.modules.keys()) self.assertIn('NormalPluginPP', plugin_pps.value) + def test_jsi_runtime_classes(self): + plugins_jsi = load_plugins(JSI_PLUGIN_SPEC) + self.assertIn('NormalPluginJSI', plugins_jsi.keys()) + self.assertIn(f'{PACKAGE_NAME}.jsinterp.normal', sys.modules.keys()) + self.assertIn('NormalPluginJSI', plugin_jsis.value) + + self.assertNotIn('OverrideDenoJSI', plugins_jsi.keys()) + self.assertNotIn('OverrideDenoJSI', plugin_jsis.value) + self.assertNotIn('_UnderscoreOverrideDenoJSI', plugins_jsi.keys()) + self.assertNotIn('_UnderscoreOverrideDenoJSI', plugin_jsis.value) + def test_importing_zipped_module(self): zip_path = TEST_DATA_DIR / 'zipped_plugins.zip' shutil.make_archive(str(zip_path)[:-4], 'zip', str(zip_path)[:-4]) @@ -125,6 +146,9 @@ def test_importing_zipped_module(self): plugins_pp = load_plugins(POSTPROCESSOR_PLUGIN_SPEC) self.assertIn('ZippedPluginPP', plugins_pp.keys()) + plugins_jsi = load_plugins(JSI_PLUGIN_SPEC) + self.assertIn('ZippedPluginJSI', plugins_jsi.keys()) + finally: sys.path.remove(str(zip_path)) os.remove(zip_path) @@ -134,13 +158,14 @@ def test_reloading_plugins(self): reload_plugins_path = TEST_DATA_DIR / 'reload_plugins' load_plugins(EXTRACTOR_PLUGIN_SPEC) load_plugins(POSTPROCESSOR_PLUGIN_SPEC) + load_plugins(JSI_PLUGIN_SPEC) # Remove default folder and add reload_plugin path sys.path.remove(str(TEST_DATA_DIR)) sys.path.append(str(reload_plugins_path)) importlib.invalidate_caches() try: - for plugin_type in ('extractor', 'postprocessor'): + for plugin_type in ('extractor', 'postprocessor', 'jsinterp'): package = importlib.import_module(f'{PACKAGE_NAME}.{plugin_type}') self.assertIn(reload_plugins_path / PACKAGE_NAME / plugin_type, map(Path, package.__path__)) @@ -161,6 +186,14 @@ def test_reloading_plugins(self): postprocessors.value['NormalPluginPP'].REPLACED, msg='Reloading has not replaced original postprocessor 
plugin globally') + plugins_jsi = load_plugins(JSI_PLUGIN_SPEC) + self.assertIn('NormalPluginJSI', plugins_jsi.keys()) + self.assertTrue(plugins_jsi['NormalPluginJSI'].REPLACED, + msg='Reloading has not replaced original postprocessor plugin') + self.assertTrue( + jsi_runtimes.value['NormalPluginJSI'].REPLACED, + msg='Reloading has not replaced original postprocessor plugin globally') + finally: sys.path.remove(str(reload_plugins_path)) sys.path.append(str(TEST_DATA_DIR)) @@ -181,6 +214,24 @@ def test_extractor_override_plugin(self): from yt_dlp.extractor.generic import GenericIE self.assertEqual(GenericIE.IE_NAME, 'generic+override+underscore-override') + def test_jsi_override_plugin(self): + load_plugins(JSI_PLUGIN_SPEC) + + from yt_dlp.jsinterp._deno import DenoJSI + + # test that jsi_runtimes is updated with override jsi + self.assertTrue(DenoJSI is jsi_runtimes.value['Deno']) + self.assertEqual(jsi_runtimes.value['Deno'].TEST_FIELD, 'override') + self.assertEqual(jsi_runtimes.value['Deno'].SECONDARY_TEST_FIELD, 'underscore-override') + + self.assertEqual(jsi_runtimes.value['Deno'].JSI_NAME, 'Deno+override+underscore-override') + importlib.invalidate_caches() + # test that loading a second time doesn't wrap a second time + load_plugins(EXTRACTOR_PLUGIN_SPEC) + from yt_dlp.jsinterp._deno import DenoJSI + self.assertTrue(DenoJSI is jsi_runtimes.value['Deno']) + self.assertEqual(jsi_runtimes.value['Deno'].JSI_NAME, 'Deno+override+underscore-override') + def test_load_all_plugin_types(self): # no plugin specs registered @@ -188,24 +239,29 @@ def test_load_all_plugin_types(self): self.assertNotIn(f'{PACKAGE_NAME}.extractor.normal', sys.modules.keys()) self.assertNotIn(f'{PACKAGE_NAME}.postprocessor.normal', sys.modules.keys()) + self.assertNotIn(f'{PACKAGE_NAME}.jsinterp.normal', sys.modules.keys()) register_plugin_spec(EXTRACTOR_PLUGIN_SPEC) register_plugin_spec(POSTPROCESSOR_PLUGIN_SPEC) + register_plugin_spec(JSI_PLUGIN_SPEC) load_all_plugins() 
self.assertTrue(all_plugins_loaded.value) self.assertIn(f'{PACKAGE_NAME}.extractor.normal', sys.modules.keys()) self.assertIn(f'{PACKAGE_NAME}.postprocessor.normal', sys.modules.keys()) + self.assertIn(f'{PACKAGE_NAME}.jsinterp.normal', sys.modules.keys()) def test_no_plugin_dirs(self): register_plugin_spec(EXTRACTOR_PLUGIN_SPEC) register_plugin_spec(POSTPROCESSOR_PLUGIN_SPEC) + register_plugin_spec(JSI_PLUGIN_SPEC) plugin_dirs.value = [] load_all_plugins() self.assertNotIn(f'{PACKAGE_NAME}.extractor.normal', sys.modules.keys()) self.assertNotIn(f'{PACKAGE_NAME}.postprocessor.normal', sys.modules.keys()) + self.assertNotIn(f'{PACKAGE_NAME}.jsinterp.normal', sys.modules.keys()) def test_set_plugin_dirs(self): custom_plugin_dir = str(TEST_DATA_DIR / 'plugin_packages') @@ -236,9 +292,11 @@ def test_append_plugin_dirs(self): def test_get_plugin_spec(self): register_plugin_spec(EXTRACTOR_PLUGIN_SPEC) register_plugin_spec(POSTPROCESSOR_PLUGIN_SPEC) + register_plugin_spec(JSI_PLUGIN_SPEC) self.assertEqual(plugin_specs.value.get('extractor'), EXTRACTOR_PLUGIN_SPEC) self.assertEqual(plugin_specs.value.get('postprocessor'), POSTPROCESSOR_PLUGIN_SPEC) + self.assertEqual(plugin_specs.value.get('jsinterp'), JSI_PLUGIN_SPEC) self.assertIsNone(plugin_specs.value.get('invalid')) diff --git a/test/testdata/jsi_external/hello_wasm.js b/test/testdata/jsi_external/hello_wasm.js new file mode 100644 index 000000000..1a3a31c46 --- /dev/null +++ b/test/testdata/jsi_external/hello_wasm.js @@ -0,0 +1,234 @@ +// wasm-pack build --target web +/* lib.rs +use wasm_bindgen::prelude::*; + +#[wasm_bindgen] +extern "C" { + pub fn eval(s: &str); +} + +#[wasm_bindgen] +pub fn greet(name: &str) { + eval(&format!("console.log('Hello, {}!')", name)); +} + +#[wasm_bindgen] +pub fn add(left: i32, right: i32) -> i32 { + left + right +} +*/ + +let wasm; + +const cachedTextDecoder = (typeof TextDecoder !== 'undefined' ? 
new TextDecoder('utf-8', { ignoreBOM: true, fatal: true }) : { decode: () => { throw Error('TextDecoder not available') } } ); + +if (typeof TextDecoder !== 'undefined') { cachedTextDecoder.decode(); }; + +let cachedUint8ArrayMemory0 = null; + +function getUint8ArrayMemory0() { + if (cachedUint8ArrayMemory0 === null || cachedUint8ArrayMemory0.byteLength === 0) { + cachedUint8ArrayMemory0 = new Uint8Array(wasm.memory.buffer); + } + return cachedUint8ArrayMemory0; +} + +function getStringFromWasm0(ptr, len) { + ptr = ptr >>> 0; + return cachedTextDecoder.decode(getUint8ArrayMemory0().subarray(ptr, ptr + len)); +} + +let WASM_VECTOR_LEN = 0; + +const cachedTextEncoder = (typeof TextEncoder !== 'undefined' ? new TextEncoder('utf-8') : { encode: () => { throw Error('TextEncoder not available') } } ); + +const encodeString = (typeof cachedTextEncoder.encodeInto === 'function' + ? function (arg, view) { + return cachedTextEncoder.encodeInto(arg, view); +} + : function (arg, view) { + const buf = cachedTextEncoder.encode(arg); + view.set(buf); + return { + read: arg.length, + written: buf.length + }; +}); + +function passStringToWasm0(arg, malloc, realloc) { + + if (realloc === undefined) { + const buf = cachedTextEncoder.encode(arg); + const ptr = malloc(buf.length, 1) >>> 0; + getUint8ArrayMemory0().subarray(ptr, ptr + buf.length).set(buf); + WASM_VECTOR_LEN = buf.length; + return ptr; + } + + let len = arg.length; + let ptr = malloc(len, 1) >>> 0; + + const mem = getUint8ArrayMemory0(); + + let offset = 0; + + for (; offset < len; offset++) { + const code = arg.charCodeAt(offset); + if (code > 0x7F) break; + mem[ptr + offset] = code; + } + + if (offset !== len) { + if (offset !== 0) { + arg = arg.slice(offset); + } + ptr = realloc(ptr, len, len = offset + arg.length * 3, 1) >>> 0; + const view = getUint8ArrayMemory0().subarray(ptr + offset, ptr + len); + const ret = encodeString(arg, view); + + offset += ret.written; + ptr = realloc(ptr, len, offset, 1) >>> 0; + } + + 
WASM_VECTOR_LEN = offset; + return ptr; +} +/** + * @param {string} name + */ +export function greet(name) { + const ptr0 = passStringToWasm0(name, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + const len0 = WASM_VECTOR_LEN; + wasm.greet(ptr0, len0); +} + +/** + * @param {number} left + * @param {number} right + * @returns {number} + */ +export function add(left, right) { + const ret = wasm.add(left, right); + return ret; +} + +async function __wbg_load(module, imports) { + if (typeof Response === 'function' && module instanceof Response) { + if (typeof WebAssembly.instantiateStreaming === 'function') { + try { + return await WebAssembly.instantiateStreaming(module, imports); + + } catch (e) { + if (module.headers.get('Content-Type') != 'application/wasm') { + console.warn("`WebAssembly.instantiateStreaming` failed because your server does not serve Wasm with `application/wasm` MIME type. Falling back to `WebAssembly.instantiate` which is slower. Original error:\n", e); + + } else { + throw e; + } + } + } + + const bytes = await module.arrayBuffer(); + return await WebAssembly.instantiate(bytes, imports); + + } else { + const instance = await WebAssembly.instantiate(module, imports); + + if (instance instanceof WebAssembly.Instance) { + return { instance, module }; + + } else { + return instance; + } + } +} + +function __wbg_get_imports() { + const imports = {}; + imports.wbg = {}; + imports.wbg.__wbg_eval_d1c6d8ede79fdfce = function(arg0, arg1) { + eval(getStringFromWasm0(arg0, arg1)); + }; + imports.wbg.__wbindgen_init_externref_table = function() { + const table = wasm.__wbindgen_export_0; + const offset = table.grow(4); + table.set(0, undefined); + table.set(offset + 0, undefined); + table.set(offset + 1, null); + table.set(offset + 2, true); + table.set(offset + 3, false); + ; + }; + + return imports; +} + +function __wbg_init_memory(imports, memory) { + +} + +function __wbg_finalize_init(instance, module) { + wasm = instance.exports; + 
__wbg_init.__wbindgen_wasm_module = module; + cachedUint8ArrayMemory0 = null; + + + wasm.__wbindgen_start(); + return wasm; +} + +function initSync(module) { + if (wasm !== undefined) return wasm; + + + if (typeof module !== 'undefined') { + if (Object.getPrototypeOf(module) === Object.prototype) { + ({module} = module) + } else { + console.warn('using deprecated parameters for `initSync()`; pass a single object instead') + } + } + + const imports = __wbg_get_imports(); + + __wbg_init_memory(imports); + + if (!(module instanceof WebAssembly.Module)) { + module = new WebAssembly.Module(module); + } + + const instance = new WebAssembly.Instance(module, imports); + + return __wbg_finalize_init(instance, module); +} + +async function __wbg_init(module_or_path) { + if (wasm !== undefined) return wasm; + + + if (typeof module_or_path !== 'undefined') { + if (Object.getPrototypeOf(module_or_path) === Object.prototype) { + ({module_or_path} = module_or_path) + } else { + console.warn('using deprecated parameters for the initialization function; pass a single object instead') + } + } + + if (typeof module_or_path === 'undefined') { + module_or_path = new URL('hello_wasm_bg.wasm', import.meta.url); + } + const imports = __wbg_get_imports(); + + if (typeof module_or_path === 'string' || (typeof Request === 'function' && module_or_path instanceof Request) || (typeof URL === 'function' && module_or_path instanceof URL)) { + module_or_path = fetch(module_or_path); + } + + __wbg_init_memory(imports); + + const { instance, module } = await __wbg_load(await module_or_path, imports); + + return __wbg_finalize_init(instance, module); +} + +export { initSync }; +export default __wbg_init; diff --git a/test/testdata/jsi_external/hello_wasm_bg.wasm b/test/testdata/jsi_external/hello_wasm_bg.wasm new file mode 100644 index 000000000..d8f32c44c Binary files /dev/null and b/test/testdata/jsi_external/hello_wasm_bg.wasm differ diff --git 
a/test/testdata/reload_plugins/yt_dlp_plugins/jsinterp/normal.py b/test/testdata/reload_plugins/yt_dlp_plugins/jsinterp/normal.py new file mode 100644 index 000000000..936555830 --- /dev/null +++ b/test/testdata/reload_plugins/yt_dlp_plugins/jsinterp/normal.py @@ -0,0 +1,5 @@ +from yt_dlp.jsinterp.common import JSI + + +class NormalPluginJSI(JSI): + REPLACED = True diff --git a/test/testdata/yt_dlp_plugins/jsinterp/normal.py b/test/testdata/yt_dlp_plugins/jsinterp/normal.py new file mode 100644 index 000000000..329f1a8df --- /dev/null +++ b/test/testdata/yt_dlp_plugins/jsinterp/normal.py @@ -0,0 +1,6 @@ +from yt_dlp.jsinterp.common import JSI + + +class NormalPluginJSI(JSI): + TEST_DATA_PLUGIN = True + REPLACED = False diff --git a/test/testdata/yt_dlp_plugins/jsinterp/override.py b/test/testdata/yt_dlp_plugins/jsinterp/override.py new file mode 100644 index 000000000..a55836427 --- /dev/null +++ b/test/testdata/yt_dlp_plugins/jsinterp/override.py @@ -0,0 +1,5 @@ +from yt_dlp.jsinterp._deno import DenoJSI + + +class OverrideDenoJSI(DenoJSI, plugin_name='override'): + TEST_FIELD = 'override' diff --git a/test/testdata/yt_dlp_plugins/jsinterp/overridetwo.py b/test/testdata/yt_dlp_plugins/jsinterp/overridetwo.py new file mode 100644 index 000000000..63e6a721d --- /dev/null +++ b/test/testdata/yt_dlp_plugins/jsinterp/overridetwo.py @@ -0,0 +1,5 @@ +from yt_dlp.jsinterp._deno import DenoJSI + + +class _UnderscoreOverrideDenoJSI(DenoJSI, plugin_name='underscore-override'): + SECONDARY_TEST_FIELD = 'underscore-override' diff --git a/test/testdata/zipped_plugins/yt_dlp_plugins/jsinterp/zipped.py b/test/testdata/zipped_plugins/yt_dlp_plugins/jsinterp/zipped.py new file mode 100644 index 000000000..cb081c33e --- /dev/null +++ b/test/testdata/zipped_plugins/yt_dlp_plugins/jsinterp/zipped.py @@ -0,0 +1,5 @@ +from yt_dlp.jsinterp.common import JSI + + +class ZippedPluginJSI(JSI): + pass diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 309489672..33c1c46d7 100644 --- 
a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -32,13 +32,15 @@ from .downloader.rtmp import rtmpdump_version from .extractor import gen_extractor_classes, get_info_extractor, import_extractors from .extractor.common import UnsupportedURLIE -from .extractor.openload import PhantomJSwrapper +from .jsinterp import PhantomJSwrapper from .globals import ( IN_CLI, LAZY_EXTRACTORS, plugin_ies, plugin_ies_overrides, plugin_pps, + plugin_jsis, + plugin_jsis_overrides, all_plugins_loaded, plugin_dirs, ) @@ -445,6 +447,8 @@ class YoutubeDL: Actual sleep time will be a random float from range [sleep_interval; max_sleep_interval]. sleep_interval_subtitles: Number of seconds to sleep before each subtitle download + jsi_preference: Preferred JS interpreters to use during extraction. Can be + given as comma-separated values. listformats: Print an overview of available video formats and exit. list_thumbnails: Print a table of all thumbnails and exit. match_filter: A function that gets called for every video with the signature @@ -4097,13 +4101,17 @@ def get_encoding(stream): write_debug(f'Proxy map: {self.proxies}') write_debug(f'Request Handlers: {", ".join(rh.RH_NAME for rh in self._request_director.handlers.values())}') - for plugin_type, plugins in (('Extractor', plugin_ies), ('Post-Processor', plugin_pps)): + for plugin_type, plugins in (('Extractor', plugin_ies), ('Post-Processor', plugin_pps), + ('JSI-Runtime', plugin_jsis)): display_list = [ klass.__name__ if klass.__name__ == name else f'{klass.__name__} as {name}' for name, klass in plugins.value.items()] if plugin_type == 'Extractor': display_list.extend(f'{plugins[-1].IE_NAME.partition("+")[2]} ({parent.__name__})' for parent, plugins in plugin_ies_overrides.value.items()) + elif plugin_type == 'JSI-Runtime': + display_list.extend(f'{plugins[-1].JSI_NAME.partition("+")[2]} ({parent.__name__})' + for parent, plugins in plugin_jsis_overrides.value.items()) if not display_list: continue write_debug(f'{plugin_type} 
Plugins: {", ".join(sorted(display_list))}') diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 714d9ad5c..2459409ab 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -946,6 +946,7 @@ def parse_options(argv=None): 'sleep_interval': opts.sleep_interval, 'max_sleep_interval': opts.max_sleep_interval, 'sleep_interval_subtitles': opts.sleep_interval_subtitles, + 'jsi_preference': opts.jsi_preference, 'external_downloader': opts.external_downloader, 'download_ranges': opts.download_ranges, 'force_keyframes_at_cuts': opts.force_keyframes_at_cuts, diff --git a/yt_dlp/extractor/douyutv.py b/yt_dlp/extractor/douyutv.py index 68ace240c..f2abde781 100644 --- a/yt_dlp/extractor/douyutv.py +++ b/yt_dlp/extractor/douyutv.py @@ -4,7 +4,7 @@ import uuid from .common import InfoExtractor -from .openload import PhantomJSwrapper +from ..jsinterp import PhantomJSwrapper from ..utils import ( ExtractorError, UserNotLive, diff --git a/yt_dlp/extractor/iqiyi.py b/yt_dlp/extractor/iqiyi.py index 735b44637..81b18e3e4 100644 --- a/yt_dlp/extractor/iqiyi.py +++ b/yt_dlp/extractor/iqiyi.py @@ -5,7 +5,7 @@ import urllib.parse from .common import InfoExtractor -from .openload import PhantomJSwrapper +from ..jsinterp import JSIWrapper from ..utils import ( ExtractorError, clean_html, @@ -398,6 +398,27 @@ class IqIE(InfoExtractor): IE_DESC = 'International version of iQiyi' _VALID_URL = r'https?://(?:www\.)?iq\.com/play/(?:[\w%-]*-)?(?P\w+)' _TESTS = [{ + 'url': 'https://www.iq.com/play/sangmin-dinneaw-episode-1-xmk7546rfw', + 'md5': '63fcb4b7d4863472fe0a9be75d9e9d60', + 'info_dict': { + 'ext': 'mp4', + 'id': 'xmk7546rfw', + 'title': '尚岷与丁尼奥 第1集', + 'description': 'md5:e8fe4a8da25f4b8c86bc5506b1c3faaa', + 'duration': 3092, + 'timestamp': 1735520401, + 'upload_date': '20241230', + 'episode_number': 1, + 'episode': 'Episode 1', + 'series': 'Sangmin Dinneaw', + 'age_limit': 18, + 'average_rating': float, + 'categories': [], + 'cast': ['Sangmin Choi', 'Ratana Aiamsaart'], + }, + 
'expected_warnings': ['format is restricted'], + 'jsi_matrix': True, + }, { 'url': 'https://www.iq.com/play/one-piece-episode-1000-1ma1i6ferf4', 'md5': '2d7caf6eeca8a32b407094b33b757d39', 'info_dict': { @@ -418,6 +439,7 @@ class IqIE(InfoExtractor): 'format': '500', }, 'expected_warnings': ['format is restricted'], + 'skip': 'geo-restricted', }, { # VIP-restricted video 'url': 'https://www.iq.com/play/mermaid-in-the-fog-2021-gbdpx13bs4', @@ -449,7 +471,6 @@ class IqIE(InfoExtractor): } _DASH_JS = ''' - console.log(page.evaluate(function() { var tvid = "%(tvid)s"; var vid = "%(vid)s"; var src = "%(src)s"; var uid = "%(uid)s"; var dfp = "%(dfp)s"; var mode = "%(mode)s"; var lang = "%(lang)s"; var bid_list = %(bid_list)s; var ut_list = %(ut_list)s; var tm = new Date().getTime(); @@ -515,9 +536,7 @@ class IqIE(InfoExtractor): var dash_path = '/dash?' + enc_params.join('&'); dash_path += '&vf=' + cmd5x(dash_path); dash_paths[bid] = dash_path; }); - return JSON.stringify(dash_paths); - })); - saveAndExit(); + console.log(JSON.stringify(dash_paths)); ''' def _extract_vms_player_js(self, webpage, video_id): @@ -597,22 +616,22 @@ def _real_extract(self, url): else: ut_list = ['0'] + jsi = JSIWrapper(self, url, timeout=120) + # bid 0 as an initial format checker - dash_paths = self._parse_json(PhantomJSwrapper(self, timeout=120_000).get( - url, note2='Executing signature code (this may take a couple minutes)', - html='', video_id=video_id, jscode=self._DASH_JS % { - 'tvid': video_info['tvId'], - 'vid': video_info['vid'], - 'src': traverse_obj(next_props, ('initialProps', 'pageProps', 'ptid'), - expected_type=str, default='04022001010011000000'), - 'uid': uid, - 'dfp': self._get_cookie('dfp', ''), - 'mode': self._get_cookie('mod', 'intl'), - 'lang': self._get_cookie('lang', 'en_us'), - 'bid_list': '[' + ','.join(['0', *self._BID_TAGS.keys()]) + ']', - 'ut_list': '[' + ','.join(ut_list) + ']', - 'cmd5x_func': self._extract_cmd5x_function(webpage, video_id), - })[1].strip(), 
video_id) + dash_paths = self._parse_json(jsi.execute(self._DASH_JS % { + 'tvid': video_info['tvId'], + 'vid': video_info['vid'], + 'src': traverse_obj(next_props, ('initialProps', 'pageProps', 'ptid'), + expected_type=str, default='04022001010011000000'), + 'uid': uid, + 'dfp': self._get_cookie('dfp', ''), + 'mode': self._get_cookie('mod', 'intl'), + 'lang': self._get_cookie('lang', 'en_us'), + 'bid_list': '[' + ','.join(['0', *self._BID_TAGS.keys()]) + ']', + 'ut_list': '[' + ','.join(ut_list) + ']', + 'cmd5x_func': self._extract_cmd5x_function(webpage, video_id), + }, video_id, html=''), video_id) formats, subtitles = [], {} initial_format_data = self._download_json( diff --git a/yt_dlp/extractor/openload.py b/yt_dlp/extractor/openload.py deleted file mode 100644 index 2d56252b1..000000000 --- a/yt_dlp/extractor/openload.py +++ /dev/null @@ -1,243 +0,0 @@ -import collections -import contextlib -import json -import os -import subprocess -import tempfile -import urllib.parse - -from ..utils import ( - ExtractorError, - Popen, - check_executable, - format_field, - get_exe_version, - is_outdated_version, - shell_quote, -) - - -def cookie_to_dict(cookie): - cookie_dict = { - 'name': cookie.name, - 'value': cookie.value, - } - if cookie.port_specified: - cookie_dict['port'] = cookie.port - if cookie.domain_specified: - cookie_dict['domain'] = cookie.domain - if cookie.path_specified: - cookie_dict['path'] = cookie.path - if cookie.expires is not None: - cookie_dict['expires'] = cookie.expires - if cookie.secure is not None: - cookie_dict['secure'] = cookie.secure - if cookie.discard is not None: - cookie_dict['discard'] = cookie.discard - with contextlib.suppress(TypeError): - if (cookie.has_nonstandard_attr('httpOnly') - or cookie.has_nonstandard_attr('httponly') - or cookie.has_nonstandard_attr('HttpOnly')): - cookie_dict['httponly'] = True - return cookie_dict - - -def cookie_jar_to_list(cookie_jar): - return [cookie_to_dict(cookie) for cookie in cookie_jar] - - 
-class PhantomJSwrapper: - """PhantomJS wrapper class - - This class is experimental. - """ - - INSTALL_HINT = 'Please download it from https://phantomjs.org/download.html' - - _BASE_JS = R''' - phantom.onError = function(msg, trace) {{ - var msgStack = ['PHANTOM ERROR: ' + msg]; - if(trace && trace.length) {{ - msgStack.push('TRACE:'); - trace.forEach(function(t) {{ - msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line - + (t.function ? ' (in function ' + t.function +')' : '')); - }}); - }} - console.error(msgStack.join('\n')); - phantom.exit(1); - }}; - ''' - - _TEMPLATE = R''' - var page = require('webpage').create(); - var fs = require('fs'); - var read = {{ mode: 'r', charset: 'utf-8' }}; - var write = {{ mode: 'w', charset: 'utf-8' }}; - JSON.parse(fs.read("{cookies}", read)).forEach(function(x) {{ - phantom.addCookie(x); - }}); - page.settings.resourceTimeout = {timeout}; - page.settings.userAgent = "{ua}"; - page.onLoadStarted = function() {{ - page.evaluate(function() {{ - delete window._phantom; - delete window.callPhantom; - }}); - }}; - var saveAndExit = function() {{ - fs.write("{html}", page.content, write); - fs.write("{cookies}", JSON.stringify(phantom.cookies), write); - phantom.exit(); - }}; - page.onLoadFinished = function(status) {{ - if(page.url === "") {{ - page.setContent(fs.read("{html}", read), "{url}"); - }} - else {{ - {jscode} - }} - }}; - page.open(""); - ''' - - _TMP_FILE_NAMES = ['script', 'html', 'cookies'] - - @staticmethod - def _version(): - return get_exe_version('phantomjs', version_re=r'([0-9.]+)') - - def __init__(self, extractor, required_version=None, timeout=10000): - self._TMP_FILES = {} - - self.exe = check_executable('phantomjs', ['-v']) - if not self.exe: - raise ExtractorError(f'PhantomJS not found, {self.INSTALL_HINT}', expected=True) - - self.extractor = extractor - - if required_version: - version = self._version() - if is_outdated_version(version, required_version): - 
self.extractor._downloader.report_warning( - 'Your copy of PhantomJS is outdated, update it to version ' - f'{required_version} or newer if you encounter any errors.') - - for name in self._TMP_FILE_NAMES: - tmp = tempfile.NamedTemporaryFile(delete=False) - tmp.close() - self._TMP_FILES[name] = tmp - - self.options = collections.ChainMap({ - 'timeout': timeout, - }, { - x: self._TMP_FILES[x].name.replace('\\', '\\\\').replace('"', '\\"') - for x in self._TMP_FILE_NAMES - }) - - def __del__(self): - for name in self._TMP_FILE_NAMES: - with contextlib.suppress(OSError, KeyError): - os.remove(self._TMP_FILES[name].name) - - def _save_cookies(self, url): - cookies = cookie_jar_to_list(self.extractor.cookiejar) - for cookie in cookies: - if 'path' not in cookie: - cookie['path'] = '/' - if 'domain' not in cookie: - cookie['domain'] = urllib.parse.urlparse(url).netloc - with open(self._TMP_FILES['cookies'].name, 'wb') as f: - f.write(json.dumps(cookies).encode()) - - def _load_cookies(self): - with open(self._TMP_FILES['cookies'].name, 'rb') as f: - cookies = json.loads(f.read().decode('utf-8')) - for cookie in cookies: - if cookie['httponly'] is True: - cookie['rest'] = {'httpOnly': None} - if 'expiry' in cookie: - cookie['expire_time'] = cookie['expiry'] - self.extractor._set_cookie(**cookie) - - def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers={}, jscode='saveAndExit();'): - """ - Downloads webpage (if needed) and executes JS - - Params: - url: website url - html: optional, html code of website - video_id: video id - note: optional, displayed when downloading webpage - note2: optional, displayed when executing JS - headers: custom http headers - jscode: code to be executed when page is loaded - - Returns tuple with: - * downloaded website (after JS execution) - * anything you print with `console.log` (but not inside `page.execute`!) - - In most cases you don't need to add any `jscode`. 
- It is executed in `page.onLoadFinished`. - `saveAndExit();` is mandatory, use it instead of `phantom.exit()` - It is possible to wait for some element on the webpage, e.g. - var check = function() { - var elementFound = page.evaluate(function() { - return document.querySelector('#b.done') !== null; - }); - if(elementFound) - saveAndExit(); - else - window.setTimeout(check, 500); - } - - page.evaluate(function(){ - document.querySelector('#a').click(); - }); - check(); - """ - if 'saveAndExit();' not in jscode: - raise ExtractorError('`saveAndExit();` not found in `jscode`') - if not html: - html = self.extractor._download_webpage(url, video_id, note=note, headers=headers) - with open(self._TMP_FILES['html'].name, 'wb') as f: - f.write(html.encode()) - - self._save_cookies(url) - - user_agent = headers.get('User-Agent') or self.extractor.get_param('http_headers')['User-Agent'] - jscode = self._TEMPLATE.format_map(self.options.new_child({ - 'url': url, - 'ua': user_agent.replace('"', '\\"'), - 'jscode': jscode, - })) - - stdout = self.execute(jscode, video_id, note=note2) - - with open(self._TMP_FILES['html'].name, 'rb') as f: - html = f.read().decode('utf-8') - self._load_cookies() - - return html, stdout - - def execute(self, jscode, video_id=None, *, note='Executing JS'): - """Execute JS and return stdout""" - if 'phantom.exit();' not in jscode: - jscode += ';\nphantom.exit();' - jscode = self._BASE_JS + jscode - - with open(self._TMP_FILES['script'].name, 'w', encoding='utf-8') as f: - f.write(jscode) - self.extractor.to_screen(f'{format_field(video_id, None, "%s: ")}{note}') - - cmd = [self.exe, '--ssl-protocol=any', self._TMP_FILES['script'].name] - self.extractor.write_debug(f'PhantomJS command line: {shell_quote(cmd)}') - try: - stdout, stderr, returncode = Popen.run(cmd, timeout=self.options['timeout'] / 1000, - text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - except Exception as e: - raise ExtractorError(f'{note} failed: Unable to run 
PhantomJS binary', cause=e) - if returncode: - raise ExtractorError(f'{note} failed with returncode {returncode}:\n{stderr.strip()}') - - return stdout diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py index e1e9777e8..a6180d4b4 100644 --- a/yt_dlp/extractor/pornhub.py +++ b/yt_dlp/extractor/pornhub.py @@ -5,7 +5,7 @@ import re from .common import InfoExtractor -from .openload import PhantomJSwrapper +from ..jsinterp import PhantomJSwrapper from ..networking import Request from ..networking.exceptions import HTTPError from ..utils import ( diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index 65182b971..9a683ae8f 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -6,7 +6,7 @@ from .common import InfoExtractor from .periscope import PeriscopeBaseIE, PeriscopeIE -from ..jsinterp import js_number_to_string +from ..jsinterp.native import js_number_to_string from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index 55ebdce1b..148efcb3a 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -25,8 +25,7 @@ ) from .pot._director import initialize_pot_director from .pot.provider import PoTokenContext, PoTokenRequest -from ..openload import PhantomJSwrapper -from ...jsinterp import JSInterpreter +from ...jsinterp import JSInterpreter, PhantomJSwrapper from ...networking.exceptions import HTTPError from ...utils import ( NO_DEFAULT, diff --git a/yt_dlp/globals.py b/yt_dlp/globals.py index 0cf276cc9..917b1fa44 100644 --- a/yt_dlp/globals.py +++ b/yt_dlp/globals.py @@ -15,6 +15,7 @@ def __repr__(self, /): postprocessors = Indirect({}) extractors = Indirect({}) +jsi_runtimes = Indirect({}) # Plugins all_plugins_loaded = Indirect(False) @@ -23,7 +24,9 @@ def __repr__(self, /): plugin_ies = Indirect({}) plugin_pps = Indirect({}) +plugin_jsis = Indirect({}) 
# flake8: noqa: F401
from .native import JSInterpreter
from .common import _JSI_PREFERENCES, JSIWrapper
from ._phantomjs import PhantomJSJSI, PhantomJSwrapper
from ._deno import DenoJSI, DenoJSDomJSI
from ..globals import jsi_runtimes, plugin_jsis
from ..plugins import PluginSpec, register_plugin_spec

# Register every class imported above whose name ends in "JSI" as a runtime,
# keyed by its class name.
jsi_runtimes.value.update({
    name: value
    for name, value in globals().items()
    if name.endswith('JSI')
})

# Plugin modules named `jsinterp` contribute classes with the `JSI` suffix;
# they are collected into the same `jsi_runtimes` registry.
register_plugin_spec(PluginSpec(
    module_name='jsinterp',
    suffix='JSI',
    destination=jsi_runtimes,
    plugin_destination=plugin_jsis,
))

# `__all__` must contain the *names* of the exported objects as strings;
# listing the objects themselves makes `from yt_dlp.jsinterp import *` raise TypeError.
__all__ = [
    'JSInterpreter',
    'PhantomJSwrapper',
    '_JSI_PREFERENCES',
    'JSIWrapper',
]
[*self._DENO_FLAGS, *flags] + self._init_script = self._INIT_SCRIPT if init_script is None else init_script + + @property + def _override_navigator_js(self): + return override_navigator_js(self.user_agent) + + def _run_deno(self, cmd): + self.write_debug(f'Deno command line: {shell_quote(cmd)}') + try: + stdout, stderr, returncode = Popen.run( + cmd, timeout=self.timeout, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + except Exception as e: + raise ExtractorError('Unable to run Deno binary', cause=e) + if returncode: + raise ExtractorError(f'Failed with returncode {returncode}:\n{stderr}') + elif stderr: + self.report_warning(f'JS console error msg:\n{stderr.strip()}') + return stdout.strip() + + def execute(self, jscode, video_id=None, note='Executing JS in Deno'): + self.report_note(video_id, note) + location_args = ['--location', self._url] if self._url else [] + with TempFileWrapper(f'{self._init_script};\n{self._override_navigator_js}\n{jscode}', suffix='.js') as js_file: + cmd = [self.exe, 'run', *self._flags, *location_args, js_file.name] + return self._run_deno(cmd) + + +class DenoJSDomJSI(DenoJSI): + _BASE_PREFERENCE = 4 + _DENO_FLAGS = ['--cached-only', '--no-prompt', '--no-check'] + _JSDOM_VERSION = None + _JSDOM_URL = 'https://esm.sh/v135/jsdom' # force use esm v135, see esm-dev/esm.sh #1034 + + @staticmethod + def serialize_cookie(cookiejar: YoutubeDLCookieJar | None, url: str): + """serialize netscape-compatible fields from cookiejar for tough-cookie loading""" + # JSDOM use tough-cookie as its CookieJar https://github.com/jsdom/jsdom/blob/main/lib/api.js + # tough-cookie use Cookie.fromJSON and Cookie.toJSON for cookie serialization + # https://github.com/salesforce/tough-cookie/blob/master/lib/cookie/cookie.ts + if not cookiejar: + return json.dumps({'cookies': []}) + cookies: list[http.cookiejar.Cookie] = list(cookiejar.get_cookies_for_url(url)) + return json.dumps({'cookies': [{ + 'key': cookie.name, + 'value': cookie.value, + # 
leading dot of domain must be removed, otherwise will fail to match + 'domain': cookie.domain.lstrip('.') or urllib.parse.urlparse(url).hostname, + 'expires': int_or_none(cookie.expires, invscale=1000), + 'hostOnly': not cookie.domain_initial_dot, + 'secure': bool(cookie.secure), + 'path': cookie.path, + } for cookie in cookies if cookie.value]}) + + @staticmethod + def apply_cookies(cookiejar: YoutubeDLCookieJar | None, cookies: list[dict]): + """apply cookies from serialized tough-cookie""" + # see serialize_cookie + if not cookiejar: + return + for cookie_dict in cookies: + if not all(cookie_dict.get(k) for k in ('key', 'value', 'domain')): + continue + if cookie_dict.get('hostOnly'): + cookie_dict['domain'] = cookie_dict['domain'].lstrip('.') + else: + cookie_dict['domain'] = '.' + cookie_dict['domain'].lstrip('.') + + cookiejar.set_cookie(http.cookiejar.Cookie( + 0, cookie_dict['key'], cookie_dict['value'], + None, False, + cookie_dict['domain'], True, not cookie_dict.get('hostOnly'), + cookie_dict.get('path', '/'), True, + bool(cookie_dict.get('secure')), + unified_timestamp(cookie_dict.get('expires')), + False, None, None, {})) + + def _ensure_jsdom(self): + if self._JSDOM_VERSION: + return + # `--allow-import` is unsupported in v1, and esm.sh:443 is default allowed remote host for v2 + result = self._run_deno([self.exe, 'info', self._JSDOM_URL]) + version_line = next((line for line in result.splitlines() if self._JSDOM_URL in line), '') + if m := re.search(r'@([\d\.]+)', version_line): + self._JSDOM_VERSION = m[1] + + def report_version(self): + super().report_version() + self._ensure_jsdom() + self.write_debug(f'JSDOM lib version {self._JSDOM_VERSION}') + + def execute(self, jscode, video_id=None, note='Executing JS in Deno with jsdom', html='', cookiejar=None): + self.report_note(video_id, note) + self._ensure_jsdom() + + if cookiejar and not self._url: + self.report_warning('No valid url scope provided, cookiejar is not applied') + cookiejar = None + + 
html, inline_scripts = extract_script_tags(html) + wrapper_scripts = '\n'.join(['try { %s } catch (e) {}' % script for script in inline_scripts]) + + callback_varname = f'__callback_{random_string()}' + script = f'''{self._init_script}; + import jsdom from "{self._JSDOM_URL}"; + let {callback_varname} = (() => {{ + const jar = jsdom.CookieJar.deserializeSync({json.dumps(self.serialize_cookie(cookiejar, self._url))}); + const dom = new jsdom.JSDOM({json.dumps(str(html))}, {{ + {'url: %s,' % json.dumps(str(self._url)) if self._url else ''} + cookieJar: jar, + pretendToBeVisual: true, + }}); + Object.keys(dom.window).filter(key => !['atob', 'btoa', 'crypto', 'location'].includes(key)) + .filter(key => !(window.location? [] : ['sessionStorage', 'localStorage']).includes(key)) + .forEach((key) => {{ + try {{window[key] = dom.window[key]}} catch (e) {{ console.error(e) }} + }}); + {self._override_navigator_js}; + + window.screen = {{ + availWidth: 1920, + availHeight: 1040, + width: 1920, + height: 1080, + colorDepth: 24, + isExtended: true, + onchange: null, + orientation: {{angle: 0, type: 'landscape-primary', onchange: null}}, + pixelDepth: 24, + }} + Object.defineProperty(document.body, 'clientWidth', {{value: 1903}}); + Object.defineProperty(document.body, 'clientHeight', {{value: 2000}}); + document.domain = location?.hostname; + + delete window.jsdom; + const origLog = console.log; + console.log = () => {{}}; + console.info = () => {{}}; + return () => {{ + const stdout = []; + console.log = (...msg) => stdout.push(msg.map(m => '' + m).join(' ')); + return () => {{ origLog(JSON.stringify({{ + stdout: stdout.join('\\n'), cookies: jar.serializeSync().cookies}})); }} + }} + }})(); + {wrapper_scripts} + {callback_varname} = {callback_varname}(); // begin to capture console.log + try {{ + {jscode} + }} finally {{ + {callback_varname}(); + }} + ''' + + # https://github.com/prebuild/node-gyp-build/blob/6822ec5/node-gyp-build.js#L196-L198 + # This jsdom dependency raises 
class TempFileWrapper:
    """
    Wrapper for NamedTemporaryFile, auto closes file after io and deletes file upon wrapper object gc

    The file is created, filled and closed in the constructor. Every later
    access re-opens it by name, so only the path (`name`) is long-lived.
    Can also be used as a context manager, in which case the file is removed on exit.

    @param {str | bytes | None} content: content to write to file upon creation
    @param {bool} text: whether to open file in text mode
    @param {str} encoding: encoding to use for text mode
    @param {str | None} suffix: suffix for filename of temporary file
    """

    def __init__(self, content: str | bytes | None = None, text: bool = True,
                 encoding='utf-8', suffix: str | None = None):
        # binary files must be opened without an encoding
        self.encoding = None if not text else encoding
        self.text = text
        self._file = tempfile.NamedTemporaryFile('w' if text else 'wb', encoding=self.encoding,
                                                 suffix=suffix, delete=False)
        try:
            if content:
                self._file.write(content)
        finally:
            # never keep the creation handle open, even if the initial write fails
            self._file.close()

    @property
    def name(self):
        """Path of the temporary file"""
        return self._file.name

    @contextlib.contextmanager
    def opened_file(self, mode, *, seek=None, seek_whence=0):
        """re-open file with given mode (binary flag is added for non-text wrappers), optionally seek first"""
        mode = mode if (self.text or 'b' in mode) else mode + 'b'
        with open(self._file.name, mode, encoding=self.encoding) as f:
            if seek is not None:
                # seek on the freshly opened handle; `self._file` is already closed
                f.seek(seek, seek_whence)
            yield f

    def write(self, s, seek=None, seek_whence=0):
        """re-open file in write mode and write, optionally seek to position first"""
        with self.opened_file('w', seek=seek, seek_whence=seek_whence) as f:
            return f.write(s)

    def append_write(self, s, seek=None, seek_whence=0):
        """re-open file in append mode and write, optionally seek to position first"""
        with self.opened_file('a', seek=seek, seek_whence=seek_whence) as f:
            return f.write(s)

    def read(self, n=-1, seek=None, seek_whence=0):
        """re-open file and read, optionally seek to position first"""
        with self.opened_file('r', seek=seek, seek_whence=seek_whence) as f:
            return f.read(n)

    def cleanup(self):
        """remove the file from disk; safe to call repeatedly"""
        # AttributeError: `__init__` failed before `_file` was assigned
        with contextlib.suppress(OSError, AttributeError):
            os.remove(self._file.name)

    def __del__(self):
        self.cleanup()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.cleanup()


def random_string(length: int = 10) -> str:
    """Return `length` random ASCII letters (not suitable for security purposes)"""
    return ''.join(random.choices(string.ascii_letters, k=length))


def override_navigator_js(user_agent: str) -> str:
    """Generate js snippet to override navigator properties based on user_agent string"""
    product, _, version = user_agent.partition('/')
    navigator_props = {
        'userAgent': user_agent,
        'language': 'en-US',
        'languages': ['en-US'],
        'webdriver': False,
        'cookieEnabled': True,
        'appCodeName': product,
        'appName': 'Netscape',
        # without a "/" in the UA the whole string is used, same as `split('/', 1)[-1]`
        'appVersion': version if '/' in user_agent else user_agent,
        'platform': 'Win32',
        'product': 'Gecko',
        'productSub': '20030107',
        'vendor': 'Google Inc.',
        'vendorSub': '',
        'onLine': True,
    }
    return '\n'.join(
        'Object.defineProperty(navigator, "%s", { value: %s, configurable: true });' % (k, json.dumps(v))
        for k, v in navigator_props.items())


_SCRIPT_OPEN_RE = re.compile(r'<script[^>]*>', re.DOTALL | re.IGNORECASE)
_SCRIPT_CLOSE_RE = re.compile(r'</script\s*>', re.IGNORECASE)


def extract_script_tags(html: str) -> tuple[str, list[str]]:
    """
    Split inline scripts out of an html document

    Every `<script ...>body</script>` element with a non-empty body is removed
    from the html and its body collected, in document order.
    Elements with an empty body (e.g. external `src` scripts) and unterminated
    opening tags are left in the html untouched.

    @returns (html without inline scripts, list of inline script bodies)
    """
    kept_parts = []
    inline_scripts = []
    cursor = 0  # end of the last removed element

    for opening in _SCRIPT_OPEN_RE.finditer(html):
        if opening.start() < cursor:
            # "<script" text inside a script body that was already consumed
            continue
        closing = _SCRIPT_CLOSE_RE.search(html, opening.end())
        if not closing or closing.start() == opening.end():
            continue
        # Build the result from slices of the *original* string so that offsets
        # never go stale, unlike deleting spans from the string one by one.
        kept_parts.append(html[cursor:opening.start()])
        inline_scripts.append(html[opening.end():closing.start()])
        cursor = closing.end()

    kept_parts.append(html[cursor:])
    return ''.join(kept_parts), inline_scripts


def prepare_wasm_jsmodule(js_mod: str, wasm: bytes) -> str:
    """
    Sanitize js wrapper module generated by rust wasm-pack for wasm init
    Removes export and import.meta, and inlines wasm binary as Uint8Array
    See test/test_data/jsi_external/hello_wasm.js for example

    @param {str} js_mod: js wrapper module generated by rust wasm-pack
    @param {bytes} wasm: wasm binary
    """

    # drop `export` / `export default` keywords, keeping the whitespace or brace that followed
    js_mod = re.sub(r'export(?:\s+default)?([\s{])', r'\1', js_mod)
    js_mod = js_mod.replace('import.meta', '{}')

    # the wasm bytes are inlined as a js array literal of integers
    return js_mod + ''';
    await (async () => {
        const t = __wbg_get_imports();
        __wbg_init_memory(t);
        const {module, instance} = await WebAssembly.instantiate(Uint8Array.from(%s), t);
        __wbg_finalize_init(instance, module);
    })();
    ''' % list(wasm)
+ + (t.function ? ' (in function ' + t.function +')' : '')); + }}); + }} + console.error(msgStack.join('\n')); + phantom.exit(1); + }}; + ''' + + _TEMPLATE = R''' + var page = require('webpage').create(); + var fs = require('fs'); + var read = {{ mode: 'r', charset: 'utf-8' }}; + var write = {{ mode: 'w', charset: 'utf-8' }}; + page.settings.resourceTimeout = {timeout}; + page.settings.userAgent = {ua}; + page.onLoadStarted = function() {{ + page.evaluate(function() {{ + delete window._phantom; + delete window.callPhantom; + }}); + }}; + var saveAndExit = function() {{ + fs.write({html_fn}, page.content, write); + fs.write({cookies_fn}, JSON.stringify(phantom.cookies), write); + phantom.exit(); + }}; + var loaded = false; + page.onLoadFinished = function(status) {{ + if(page.url === "" && !loaded) {{ + page.setContent(fs.read({html_fn}, read), {url}); + loaded = true; + }} + else {{ + JSON.parse(fs.read({cookies_fn}, read)).forEach(function(x) {{ + phantom.addCookie(x); + }}); + {jscode} + }} + }}; + page.open(""); + ''' + + def _save_cookies(self, url, cookiejar: YoutubeDLCookieJar | None): + def _cookie_to_dict(cookie: http.cookiejar.Cookie): + cookie_dict = { + 'name': cookie.name, + 'value': cookie.value, + 'port': cookie.port, + 'domain': cookie.domain, + 'path': cookie.path or '/', + 'expires': int_or_none(cookie.expires, invscale=1000), + 'secure': cookie.secure, + 'discard': cookie.discard, + } + if not cookie_dict['domain']: + cookie_dict['domain'] = urllib.parse.urlparse(url).hostname + cookie_dict['port'] = urllib.parse.urlparse(url).port + with contextlib.suppress(TypeError): + if (cookie.has_nonstandard_attr('httpOnly') + or cookie.has_nonstandard_attr('httponly') + or cookie.has_nonstandard_attr('HttpOnly')): + cookie_dict['httponly'] = True + return filter_dict(cookie_dict) + + cookies = cookiejar.get_cookies_for_url(url) if cookiejar else [] + return json.dumps([_cookie_to_dict(cookie) for cookie in cookies]) + + def _load_cookies(self, 
def _execute_html(self, jscode: str, url: str, html: str, cookiejar, video_id=None, note='Executing JS on webpage'):
    """
    Load `html` as a page at `url` in PhantomJS and run `jscode` once it has loaded

    Inline scripts are stripped from the html and re-run inside `page.evaluate`,
    each wrapped in a try/catch, before `jscode` is executed.
    The html and the cookies are exchanged with PhantomJS through temporary files
    which are removed again before returning, also when execution fails.

    @param jscode: code run in the PhantomJS context, must contain `saveAndExit();`
    @param cookiejar: cookies to expose to the page and to update afterwards;
                      ignored (with a warning) when no `url` is given
    @returns tuple of (page content saved by `saveAndExit`, stdout of PhantomJS)
    @raises ExtractorError: if `jscode` does not contain `saveAndExit();`
    """
    if 'saveAndExit();' not in jscode:
        raise ExtractorError('`saveAndExit();` not found in `jscode`')

    if cookiejar and not url:
        self.report_warning('No valid url scope provided, cookiejar is not applied')
        cookiejar = None

    html, inline_scripts = extract_script_tags(html)
    wrapped_scripts = '\n'.join(
        'page.evaluate(function() { try { %s } catch (e) {} });' % inline for inline in inline_scripts)

    # context managers guarantee the temp files are deleted even if PhantomJS fails,
    # instead of relying on garbage collection of the wrappers
    with TempFileWrapper(html, suffix='.html') as html_file, \
            TempFileWrapper(self._save_cookies(url, cookiejar), suffix='.json') as cookie_file:
        # values are JSON-encoded so they can be embedded as js literals in the template
        script = self._TEMPLATE.format_map({
            'url': json.dumps(str(url)),
            'ua': json.dumps(str(self.user_agent)),
            'jscode': f'{wrapped_scripts}\n{jscode}',
            'html_fn': json.dumps(html_file.name),
            'cookies_fn': json.dumps(cookie_file.name),
            'timeout': int(self.timeout * 1000),  # template expects milliseconds
        })

        stdout = self._execute(script, video_id, note=note)
        # `saveAndExit` wrote the resulting cookies and page content back to the files
        self._load_cookies(cookie_file.read(), cookiejar)
        new_html = html_file.read()

    return new_html, stdout
+ """ + INSTALL_HINT = 'Please download PhantomJS from https://phantomjs.org/download.html' + + @classmethod + def _version(cls): + return PhantomJSJSI.exe_version + + def __init__(self, extractor: InfoExtractor, required_version=None, timeout=10000): + self._jsi = PhantomJSJSI(extractor._downloader, '', timeout / 1000, {}) + + if not self._jsi.is_available(): + raise ExtractorError(f'PhantomJS not found, {self.INSTALL_HINT}', expected=True) + + self.extractor = extractor + + if required_version: + if is_outdated_version(self._jsi.exe_version, required_version): + self._jsi.report_warning( + 'Your copy of PhantomJS is outdated, update it to version ' + f'{required_version} or newer if you encounter any errors.') + + def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers={}, jscode='saveAndExit();'): + """ + Downloads webpage (if needed) and executes JS + + Params: + url: website url + html: optional, html code of website + video_id: video id + note: optional, displayed when downloading webpage + note2: optional, displayed when executing JS + headers: custom http headers + jscode: code to be executed when page is loaded + + Returns tuple with: + * downloaded website (after JS execution) + * anything you print with `console.log` (but not inside `page.execute`!) + + In most cases you don't need to add any `jscode`. + It is executed in `page.onLoadFinished`. + `saveAndExit();` is mandatory, use it instead of `phantom.exit()` + It is possible to wait for some element on the webpage, e.g. 
def get_all_handlers() -> dict[str, type[JSI]]:
    """Map `JSI_KEY` to class for every registered JSI runtime"""
    return {jsi.JSI_KEY: jsi for jsi in jsi_runtimes.value.values()}


def to_jsi_keys(jsi_or_keys: typing.Iterable[str | type[JSI] | JSI]) -> list[str]:
    """Normalize a mix of key strings and JSI classes/instances into a list of key strings"""
    return [jok if isinstance(jok, str) else jok.JSI_KEY for jok in jsi_or_keys]


def get_included_jsi(only_include=None, exclude=None):
    """
    Filter the registered JSI runtimes

    @param only_include: if non-empty, only runtimes listed here (keys, classes or instances) are kept
    @param exclude: runtimes listed here (keys, classes or instances) are dropped
    @returns dict of `JSI_KEY` to class
    """
    # Resolve the key sets once up front: cheaper than re-deriving them for every
    # handler, and it also works when a one-shot iterable is passed.
    included = set(to_jsi_keys(only_include)) if only_include else None
    excluded = set(to_jsi_keys(exclude)) if exclude else set()
    return {
        key: value for key, value in get_all_handlers().items()
        if (included is None or key in included) and key not in excluded
    }


def order_to_pref(jsi_order: typing.Iterable[str | type[JSI] | JSI], multiplier: int) -> JSIPreference:
    """
    convert a list of jsi keys into a preference function

    The last entry scores `multiplier`, the one before it `2 * multiplier` and so on,
    so that the first entry scores highest. Runtimes not in the list score 0.
    """
    jsi_order = reversed(to_jsi_keys(jsi_order))
    pref_score = {jsi_cls: (i + 1) * multiplier for i, jsi_cls in enumerate(jsi_order)}

    def _pref(jsi: JSI, *args):
        # extra positional args (method name, call args) are accepted but not used
        return pref_score.get(jsi.JSI_KEY, 0)
    return _pref
def _load_jsi_keys_from_option(self, option_key):
    """
    Read a list of JSI keys from the downloader params and drop unknown ones

    A warning is reported (once per message) for every key that does not name a
    registered JSI. The list stored in the params is left untouched; a new list
    containing only the valid keys, in their original order, is returned.

    @param option_key: name of the downloader param holding the keys, e.g. `jsi_preference`
    """
    # `or []` also covers the option being present but set to None
    requested = self._downloader.params.get(option_key) or []
    valid_handlers = get_all_handlers()

    jsi_keys = []
    for key in requested:
        if key in valid_handlers:
            jsi_keys.append(key)
        else:
            self.report_warning(f'{option_key}: `{key}` is not a valid JSI', only_once=True)
    return jsi_keys
only_include, exclude): + self.write_debug(f'Loaded JSI runtimes: {get_all_handlers()}') + handler_classes = filter_dict( + get_included_jsi(only_include, exclude), + lambda _, v: v.supports_extractor(self._ie_key)) + self.write_debug(f'Select JSI {"for " + self._ie_key if self._ie_key else ""}: {to_jsi_keys(handler_classes)}, ' + f'included: {to_jsi_keys(only_include) or "all"}, excluded: {to_jsi_keys(exclude)}') + return handler_classes + + def write_debug(self, message, only_once=False): + return self._downloader.write_debug(f'[JSIDirector] {message}', only_once=only_once) + + def report_warning(self, message, only_once=False): + return self._downloader.report_warning(f'[JSIDirector] {message}', only_once=only_once) + + def _get_handlers(self, method_name: str, *args, **kwargs) -> list[JSI]: + def _supports_method_with_params(jsi: JSI): + if not callable(method := getattr(jsi, method_name, None)): + return False + method_params = inspect.signature(method).parameters + return all(key in method_params for key in kwargs) + + handlers = [h for h in self._handler_dict.values() if _supports_method_with_params(h)] + self.write_debug(f'Choosing handlers for method `{method_name}` with kwargs {list(kwargs)}' + f': {to_jsi_keys(handlers)}') + + if not handlers: + raise ExtractorError(f'No JSI supports method `{method_name}` with kwargs {list(kwargs)}, ' + f'included handlers: {to_jsi_keys(self._handler_dict.values())}') + + preferences = { + handler.JSI_KEY: sum(pref_func(handler, method_name, args, kwargs) for pref_func in self.preferences) + for handler in handlers + } + self.write_debug('JSI preferences for `{}` request: {}'.format( + method_name, ', '.join(f'{key}={pref}' for key, pref in preferences.items()))) + + return sorted(handlers, key=lambda h: preferences[h.JSI_KEY], reverse=True) + + def _dispatch_request(self, method_name: str, *args, **kwargs): + handlers = self._get_handlers(method_name, *args, **kwargs) + + unavailable: list[str] = [] + exceptions: 
list[tuple[JSI, Exception]] = [] + + for handler in handlers: + if not handler.is_available(): + if self._is_test: + raise ExtractorError(f'{handler.JSI_NAME} is not available for testing, ' + f'add "{handler.JSI_KEY}" in `exclude` if it should not be used') + self.write_debug(f'{handler.JSI_KEY} is not available') + unavailable.append(handler.JSI_NAME) + continue + + try: + self.write_debug(f'Dispatching `{method_name}` task to {handler.JSI_NAME}') + handler.report_version() + return getattr(handler, method_name)(*args, **kwargs) + except ExtractorError as e: + if self._is_test: + raise ExtractorError(f'{handler.JSI_NAME} got error while evaluating js, ' + f'add "{handler.JSI_KEY}" in `exclude` if it should not be used') + exceptions.append((handler, e)) + self.write_debug(f'{handler.JSI_NAME} encountered error, fallback to next handler: {e}') + + if not exceptions: + msg = f'No available JSI installed, please install one of: {", ".join(unavailable)}' + else: + msg = f'Failed to perform {method_name}, total {len(exceptions)} errors' + if unavailable: + msg = f'{msg}. 
You may try installing one of unavailable JSI: {", ".join(unavailable)}' + raise ExtractorError(msg) + + def execute(self, jscode: str, video_id: str | None, note: str | None = None, + html: str | None = None, cookiejar: YoutubeDLCookieJar | None = None) -> str: + """ + Execute JS code and return stdout from console.log + + @param jscode: JS code to execute + @param video_id + @param note + @param html: html to load as document + @param cookiejar: cookiejar to read and set cookies, pass `InfoExtractor.cookiejar` if you want to read and write cookies + """ + return self._dispatch_request('execute', jscode, video_id, **filter_dict({ + 'note': note, 'html': html, 'cookiejar': cookiejar})) + + +class JSI(abc.ABC): + _BASE_PREFERENCE: int = 0 + + def __init__(self, downloader: YoutubeDL, url: str, timeout: float | int, user_agent=None): + self._downloader = downloader + self._url = url + self.timeout = timeout + self.user_agent: str = user_agent or self._downloader.params['http_headers']['User-Agent'] + + @classmethod + def __init_subclass__(cls, *, plugin_name=None, **kwargs): + if plugin_name: + mro = inspect.getmro(cls) + next_mro_class = super_class = mro[mro.index(cls) + 1] + + while getattr(super_class, '__wrapped__', None): + super_class = super_class.__wrapped__ + + if not any(override.PLUGIN_NAME == plugin_name for override in plugin_jsis_overrides.value[super_class]): + cls.__wrapped__ = next_mro_class + cls.PLUGIN_NAME, cls.JSI_KEY = plugin_name, next_mro_class.JSI_KEY + cls.JSI_NAME = f'{next_mro_class.JSI_NAME}+{plugin_name}' + + setattr(sys.modules[super_class.__module__], super_class.__name__, cls) + # additional update jsi_runtime because jsis are not further loaded like extractors + jsi_runtimes.value[super_class.JSI_KEY] = cls + plugin_jsis_overrides.value[super_class].append(cls) + return super().__init_subclass__(**kwargs) + + @abc.abstractmethod + def is_available(self) -> bool: + raise NotImplementedError + + def write_debug(self, msg, *args, 
def register_jsi_preference(*handlers: type[JSI]):
    """
    Decorator factory that registers a JSI preference function.

    The decorated function is wrapped so that it is only consulted for
    instances of `handlers` (or for every handler when `handlers` is empty);
    for any other handler the wrapper returns 0. The wrapper is added to
    `_JSI_PREFERENCES` and replaces the decorated function.

    @param handlers: JSI classes the preference applies to; empty means all
    """
    # Imported locally so this block does not depend on the module import list
    import functools

    assert all(issubclass(handler, JSI) for handler in handlers), f'{handlers} must all be a subclass of JSI'

    def outer(pref_func: JSIPreference) -> JSIPreference:
        # Keep the name and docstring of the preference function on the wrapper
        @functools.wraps(pref_func)
        def inner(handler: JSI, *args):
            if not handlers or isinstance(handler, handlers):
                return pref_func(handler, *args)
            return 0
        _JSI_PREFERENCES.add(inner)
        return inner
    return outer
method_name: str, *args, **kwargs) -> int: + ... diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp/native.py similarity index 99% rename from yt_dlp/jsinterp.py rename to yt_dlp/jsinterp/native.py index 45aeffa22..2812d28c1 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp/native.py @@ -6,7 +6,7 @@ import operator import re -from .utils import ( +from ..utils import ( NO_DEFAULT, ExtractorError, function_with_repr, diff --git a/yt_dlp/options.py b/yt_dlp/options.py index b4d3d4d66..f347696a1 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -1192,6 +1192,11 @@ def _preset_alias_callback(option, opt_str, value, parser): '--sleep-subtitles', metavar='SECONDS', dest='sleep_interval_subtitles', default=0, type=int, help='Number of seconds to sleep before each subtitle download') + workarounds.add_option( + '--jsi-preference', + metavar='JSI', dest='jsi_preference', default=[], type='str', action='callback', + callback=_list_from_options_callback, + help='Preferred JS interpreters to use during extraction. Can be given as comma-separated values.') verbosity = optparse.OptionGroup(parser, 'Verbosity and Simulation Options') verbosity.add_option(