@dataclasses.dataclass
class NetscapeFields:
    """Plain description of a cookie, used to build and compare test cookies.

    Instances can be turned into ``http.cookiejar.Cookie`` objects with
    ``to_cookie`` and compared against them with ``==``.
    """

    name: str
    value: str
    domain: str
    path: str
    secure: bool
    expires: int | None  # POSIX timestamp; ``expire_str`` requires it to be set

    def to_cookie(self):
        """Return an ``http.cookiejar.Cookie`` carrying these fields.

        Keyword arguments are used because ``Cookie`` takes sixteen required
        parameters and a positional call is easy to get out of order.
        """
        return http.cookiejar.Cookie(
            version=0,
            name=self.name,
            value=self.value,
            port=None,
            port_specified=False,
            domain=self.domain,
            domain_specified=True,
            domain_initial_dot=self.domain.startswith('.'),
            path=self.path,
            path_specified=True,
            secure=self.secure,
            expires=self.expires,
            discard=False,
            comment=None,
            comment_url=None,
            rest={},
        )

    def expire_str(self):
        """Return ``expires`` as a UTC date string, e.g. ``Tue, 14 Nov 2023 22:13:20 GMT``.

        ``expires`` must not be ``None``.
        """
        expiry = datetime.datetime.fromtimestamp(self.expires, datetime.timezone.utc)
        return expiry.strftime('%a, %d %b %Y %H:%M:%S GMT')

    def __eq__(self, other: object):
        """Compare field by field with another object exposing the same attributes.

        Returns ``NotImplemented`` when ``other`` lacks any compared attribute,
        so comparing with unrelated objects yields ``False`` instead of raising
        ``AttributeError``.
        """
        # Derive the compared names from the dataclass itself so the list
        # cannot drift from the declared fields.
        attr_names = [field.name for field in dataclasses.fields(self)]
        if not all(hasattr(other, attr) for attr in attr_names):
            return NotImplemented
        return all(getattr(self, attr) == getattr(other, attr) for attr in attr_names)
'console.log(document.getElementById("test-div").innerHTML);', location='https://example.com', @@ -69,11 +102,67 @@ def test_execute_dom_script_with_error(self): '''), 'Hello, world!') + def assert_cookiejar_equal(self, cookiejar: http.cookiejar.CookieJar, ref_cookiejar: http.cookiejar.CookieJar): + for cookie in cookiejar: + ref_cookie = next((c for c in ref_cookiejar if c.name == cookie.name and c.domain == cookie.domain), None) + self.assertEqual(repr(cookie), repr(ref_cookie)) + + def assert_cookie_str_equal(self, cookie_str, ref_cookie_str): + print([cookie_str, ref_cookie_str]) + self.assertEqual(set(cookie_str.split('; ')), set(ref_cookie_str.split('; '))) + + def test_execute_cookiejar(self): + if 'cookies' not in self.jsi._SUPPORTED_FEATURES: + self.skipTest('Cookies not supported') + cookiejar = YoutubeDLCookieJar() + ref_cookiejar = YoutubeDLCookieJar() + for test_cookie in [ + NetscapeFields('test1', 'test1', '.example.com', '/', False, int(time.time()) + 1000), + NetscapeFields('test2', 'test2', '.example.com', '/', True, int(time.time()) + 1000), + NetscapeFields('test3', 'test3', '.example.com', '/123', False, int(time.time()) + 1000), + NetscapeFields('test4', 'test4', '.example.com', '/456', False, int(time.time()) + 1000), + NetscapeFields('test5', 'test5', '.example.com', '/123', True, int(time.time()) + 1000), + NetscapeFields('test6', 'test6', '.example.com', '/456', True, int(time.time()) + 1000), + NetscapeFields('test1', 'other1', '.other.com', '/', False, int(time.time()) + 1000), + NetscapeFields('test2', 'other2', '.other.com', '/', False, int(time.time()) + 1000), + NetscapeFields('test7', 'other7', '.other.com', '/', False, int(time.time()) + 1000), + ]: + cookiejar.set_cookie(test_cookie.to_cookie()) + ref_cookiejar.set_cookie(test_cookie.to_cookie()) + + # test identity without modification from js + self.assert_cookie_str_equal(self.jsi.execute( + 'console.log(document.cookie);', + location='http://example.com/123/456', + html='
Hello, world!
', + cookiejar=cookiejar), + 'test1=test1; test3=test3') + self.assert_cookiejar_equal(cookiejar, ref_cookiejar) + + # test modification of existing cookie from js + new_cookie_1 = NetscapeFields('test1', 'new1', '.example.com', '/', True, int(time.time()) + 900) + new_cookie_2 = NetscapeFields('test2', 'new2', '.example.com', '/', True, int(time.time()) + 900) + ref_cookiejar.set_cookie(new_cookie_1.to_cookie()) + ref_cookiejar.set_cookie(new_cookie_2.to_cookie()) + self.assert_cookie_str_equal(self.jsi.execute( + f'''document.cookie = "test1=new1; secure; expires={new_cookie_1.expire_str()}; domain=.example.com; path=/"; + console.log(document.cookie);''', + location='https://example.com/123/456', + html=f'''
Hello, world!
+ + ''', + cookiejar=cookiejar), + 'test1=new1; test2=new2; test3=test3; test5=test5') + self.assert_cookiejar_equal(cookiejar, ref_cookiejar) + class TestDeno(Base.TestExternalJSI): _JSI_CLASS = DenoJSI diff --git a/yt_dlp/jsinterp/_deno.py b/yt_dlp/jsinterp/_deno.py index a87c104f7..9a3083c19 100644 --- a/yt_dlp/jsinterp/_deno.py +++ b/yt_dlp/jsinterp/_deno.py @@ -2,7 +2,6 @@ import http.cookiejar import json -import re import subprocess import typing import urllib.parse @@ -16,7 +15,7 @@ shell_quote, unified_timestamp, ) -from ._helper import TempFileWrapper, random_string +from ._helper import TempFileWrapper, random_string, override_navigator_js, extract_script_tags from .common import ExternalJSI, register_jsi @@ -36,15 +35,7 @@ def __init__(self, *args, flags=[], replace_flags=False, init_script=None, **kwa @property def _override_navigator_js(self): - return '\n'.join([ - 'Object.defineProperty(navigator, "%s", { value: %s, configurable: true });' % (k, json.dumps(v)) - for k, v in { - 'userAgent': self.user_agent, - 'language': 'en-US', - 'languages': ['en-US'], - 'webdriver': False, - }.items() - ]) + return override_navigator_js(self.user_agent) def _run_deno(self, cmd): self.write_debug(f'Deno command line: {shell_quote(cmd)}') @@ -137,21 +128,13 @@ def _ensure_jsdom(self): self._run_deno(cmd) self._JSDOM_IMPORT_CHECKED = True - def _parse_script_tags(self, html: str): - for match_start in re.finditer(r']*>', html, re.DOTALL): - end = html.find('', match_start.end()) - if end > match_start.end(): - yield html[match_start.end():end] - def execute(self, jscode, video_id=None, note='Executing JS in Deno', location='', html='', cookiejar=None): self.report_note(video_id, note) self._ensure_jsdom() callback_varname = f'__callback_{random_string()}' - inline_scripts = '\n'.join([ - 'try { %s } catch (e) {}' % script - for script in self._parse_script_tags(html) - ]) + html, inline_scripts = extract_script_tags(html) + wrapper_scripts = '\n'.join(['try { %s } 
catch (e) {}' % script for script in inline_scripts]) script = f'''{self._init_script}; {self._override_navigator_js}; @@ -164,27 +147,33 @@ def execute(self, jscode, video_id=None, note='Executing JS in Deno', location=' }}); Object.keys(dom.window).forEach((key) => {{try {{window[key] = dom.window[key]}} catch (e) {{}}}}); delete window.jsdom; + const origLog = console.log; + console.log = () => {{}}; + console.info = () => {{}}; return () => {{ const stdout = []; - const origLog = console.log; console.log = (...msg) => stdout.push(msg.map(m => m.toString()).join(' ')); return () => {{ origLog(JSON.stringify({{ stdout: stdout.join('\\n'), cookies: jar.serializeSync().cookies}})); }} }} }})(); - await (async () => {{ - {inline_scripts} - }})(); - {callback_varname} = {callback_varname}(); - await (async () => {{ + {wrapper_scripts} + {callback_varname} = {callback_varname}(); // begin to capture console.log + try {{ {jscode} - }})().finally({callback_varname}); + }} finally {{ + {callback_varname}(); + }} ''' location_args = ['--location', location] if location else [] with TempFileWrapper(script, suffix='.js') as js_file: cmd = [self.exe, 'run', *self._flags, *location_args, js_file.name] - data = json.loads(self._run_deno(cmd)) + result = self._run_deno(cmd) + try: + data = json.loads(result) + except json.JSONDecodeError as e: + raise ExtractorError(f'Failed to parse JSON output from Deno: {result}', cause=e) self.apply_cookies(cookiejar, data['cookies']) return data['stdout'] diff --git a/yt_dlp/jsinterp/_helper.py b/yt_dlp/jsinterp/_helper.py index 22525018d..dcf27deb4 100644 --- a/yt_dlp/jsinterp/_helper.py +++ b/yt_dlp/jsinterp/_helper.py @@ -1,7 +1,9 @@ from __future__ import annotations import contextlib +import json import os import random +import re import string import tempfile @@ -69,3 +71,31 @@ def __exit__(self, exc_type, exc_value, traceback): def random_string(length: int = 10) -> str: return ''.join(random.choices(string.ascii_letters, k=length)) 
def override_navigator_js(user_agent: str) -> str:
    """Build JS statements that redefine selected ``navigator`` properties.

    One ``Object.defineProperty`` statement is emitted per property
    (``userAgent``, ``language``, ``languages``, ``webdriver``), joined by
    newlines. Values are serialized with ``json.dumps`` so they are valid
    JavaScript literals.
    """
    overrides = {
        'userAgent': user_agent,
        'language': 'en-US',
        'languages': ['en-US'],
        'webdriver': False,
    }
    return '\n'.join(
        f'Object.defineProperty(navigator, "{prop}", {{ value: {json.dumps(value)}, configurable: true }});'
        for prop, value in overrides.items()
    )


def extract_script_tags(html: str) -> tuple[str, list[str]]:
    """Split inline ``<script>`` elements out of an HTML string.

    Returns a tuple ``(html_without_scripts, inline_scripts)`` where
    ``inline_scripts`` holds the bodies of the removed elements in document
    order. Script elements with an empty body (e.g. ``<script src=...>``)
    and unterminated ``<script>`` tags are left in the HTML untouched.
    """
    closing_tag = '</script>'
    kept_parts = []
    inline_scripts = []
    cursor = 0  # index in ``html`` up to which the output has been produced

    for opening in re.finditer(r'<script[^>]*>', html):
        if opening.start() < cursor:
            # This "<script" text lies inside a script body already extracted.
            continue
        body_start = opening.end()
        body_end = html.find(closing_tag, body_start)
        if body_end <= body_start:
            # No closing tag (-1) or empty body: keep the element in place.
            continue
        # Assemble the output in one pass. Deleting spans from ``html`` one
        # by one would invalidate the offsets of every later span.
        kept_parts.append(html[cursor:opening.start()])
        inline_scripts.append(html[body_start:body_end])
        cursor = body_end + len(closing_tag)

    kept_parts.append(html[cursor:])
    return ''.join(kept_parts), inline_scripts
'timeout': int(self.timeout * 1000), }) - stdout = self._execute(jscode, video_id, note=note) + stdout = self._execute(script, video_id, note=note) self._load_cookies(cookie_file.read(), cookiejar) new_html = html_file.read()