Merge branch 'yt-dlp:master' into ciscolive

2026-02-07 22:47:24 +00:00 · 2025-06-13 15:36:21 +09:00
parent 3f3dbb6843 1722c55400
commit 66273ed7c0
7 changed files with 645 additions and 24 deletions
--- a/yt_dlp/extractor/brightcove.py
+++ b/yt_dlp/extractor/brightcove.py
@@ -495,8 +495,6 @@ class BrightcoveLegacyIE(InfoExtractor):

 class BrightcoveNewBaseIE(AdobePassIE):
    def _parse_brightcove_metadata(self, json_data, video_id, headers={}):
-        title = json_data['name'].strip()
-
        formats, subtitles = [], {}
        sources = json_data.get('sources') or []
        for source in sources:
@@ -600,16 +598,18 @@ class BrightcoveNewBaseIE(AdobePassIE):

        return {
            'id': video_id,
-            'title': title,
-            'description': clean_html(json_data.get('description')),
            'thumbnails': thumbnails,
            'duration': duration,
-            'timestamp': parse_iso8601(json_data.get('published_at')),
-            'uploader_id': json_data.get('account_id'),
            'formats': formats,
            'subtitles': subtitles,
-            'tags': json_data.get('tags', []),
            'is_live': is_live,
+            **traverse_obj(json_data, {
+                'title': ('name', {clean_html}),
+                'description': ('description', {clean_html}),
+                'tags': ('tags', ..., {str}, filter, all, filter),
+                'timestamp': ('published_at', {parse_iso8601}),
+                'uploader_id': ('account_id', {str}),
+            }),
        }


@@ -645,10 +645,7 @@ class BrightcoveNewIE(BrightcoveNewBaseIE):
            'uploader_id': '4036320279001',
            'formats': 'mincount:39',
        },
-        'params': {
-            # m3u8 download
-            'skip_download': True,
-        },
+        'skip': '404 Not Found',
    }, {
        # playlist stream
        'url': 'https://players.brightcove.net/1752604059001/S13cJdUBz_default/index.html?playlistId=5718313430001',
@@ -709,7 +706,6 @@ class BrightcoveNewIE(BrightcoveNewBaseIE):
                'ext': 'mp4',
                'title': 'TGD_01-032_5',
                'thumbnail': r're:^https?://.*\.jpg$',
-                'tags': [],
                'timestamp': 1646078943,
                'uploader_id': '1569565978001',
                'upload_date': '20220228',
@@ -721,7 +717,6 @@ class BrightcoveNewIE(BrightcoveNewBaseIE):
                'ext': 'mp4',
                'title': 'TGD 01-087 (Airs 05.25.22)_Segment 5',
                'thumbnail': r're:^https?://.*\.jpg$',
-                'tags': [],
                'timestamp': 1651604591,
                'uploader_id': '1569565978001',
                'upload_date': '20220503',
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -101,6 +101,7 @@ from ..utils import (
    xpath_with_ns,
 )
 from ..utils._utils import _request_dump_filename
+from ..utils.jslib import devalue


 class InfoExtractor:
@@ -1795,6 +1796,63 @@ class InfoExtractor:
        ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
        return traverse_obj(ret, traverse) or {}

+    def _resolve_nuxt_array(self, array, video_id, *, fatal=True, default=NO_DEFAULT):
+        """
+        Resolve a Nuxt rich JSON payload array into plain Python data
+
+        Passing `default` implies `fatal=False`. Problems are raised as
+        ExtractorError when fatal; otherwise they are reported as a warning
+        (no `default` given) or, for resolution errors, as a debug message
+        (`default` given). A falsy result is replaced by `default`, or by an
+        empty dict if no `default` was given.
+        """
+        # Ref: https://github.com/nuxt/nuxt/commit/9e503be0f2a24f4df72a3ccab2db4d3e63511f57
+        #      https://github.com/nuxt/nuxt/pull/19205
+        has_default = default is not NO_DEFAULT
+        if has_default:
+            fatal = False
+        fallback = default if has_default else {}
+
+        if not (isinstance(array, list) and array):
+            message = 'Unable to resolve Nuxt JSON data: invalid input'
+            if fatal:
+                raise ExtractorError(message, video_id=video_id)
+            if not has_default:
+                self.report_warning(message, video_id=video_id)
+            return fallback
+
+        def passthrough(data):
+            return data
+
+        # Wrapper types resolve to their inner value as-is,
+        # while the "empty" variants carry a JSON-encoded string
+        problems = devalue.parse_iter(array, revivers={
+            **dict.fromkeys(
+                ('NuxtError', 'ShallowRef', 'ShallowReactive', 'Ref', 'Reactive'), passthrough),
+            **dict.fromkeys(('EmptyShallowRef', 'EmptyRef'), json.loads),
+        })
+
+        while True:
+            try:
+                problem = next(problems)
+            except StopIteration as finished:
+                # The generator's return value is the resolved payload
+                return finished.value or fallback
+
+            message = f'Error resolving Nuxt JSON: {problem}'
+            if fatal:
+                raise ExtractorError(message, video_id=video_id)
+            if has_default:
+                self.write_debug(f'{video_id}: {message}', only_once=True)
+            else:
+                self.report_warning(message, video_id=video_id, only_once=True)
+
+    def _search_nuxt_json(self, webpage, video_id, *, fatal=True, default=NO_DEFAULT):
+        """
+        Find the `__NUXT_DATA__` script payload in `webpage` and resolve it
+
+        Returns `default` (or an empty dict if no `default` was given)
+        when no payload array could be found.
+        """
+        has_default = default is not NO_DEFAULT
+
+        # `None` is used as the search default so that a missing payload is falsy
+        payload = self._search_json(
+            r'<script\b[^>]+\bid="__NUXT_DATA__"[^>]*>', webpage,
+            'Nuxt JSON data', video_id, contains_pattern=r'\[(?s:.+)\]',
+            fatal=fatal, default=None if has_default else NO_DEFAULT)
+
+        if payload:
+            return self._resolve_nuxt_array(payload, video_id, fatal=fatal, default=default)
+
+        return default if has_default else {}
+
    @staticmethod
    def _hidden_inputs(html):
        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
--- a/yt_dlp/extractor/hypergryph.py
+++ b/yt_dlp/extractor/hypergryph.py
@@ -1,32 +1,66 @@
 from .common import InfoExtractor
-from ..utils import js_to_json, traverse_obj
+from ..utils import (
+    ExtractorError,
+    clean_html,
+    url_or_none,
+)
+from ..utils.traversal import subs_list_to_dict, traverse_obj


 class MonsterSirenHypergryphMusicIE(InfoExtractor):
+    # Extracts a single track by its numeric ID, using the site's JSON API
+    # (see `_API_BASE`) for both the song and its album details
+    IE_NAME = 'monstersiren'
+    IE_DESC = '塞壬唱片'
+    _API_BASE = 'https://monster-siren.hypergryph.com/api'
    _VALID_URL = r'https?://monster-siren\.hypergryph\.com/music/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://monster-siren.hypergryph.com/music/514562',
        'info_dict': {
            'id': '514562',
            'ext': 'wav',
-            'artists': ['塞壬唱片-MSR'],
-            'album': 'Flame Shadow',
            'title': 'Flame Shadow',
+            'album': 'Flame Shadow',
+            'artists': ['塞壬唱片-MSR'],
+            'description': 'md5:19e2acfcd1b65b41b29e8079ab948053',
+            'thumbnail': r're:https?://web\.hycdn\.cn/siren/pic/.+\.jpg',
+        },
+    }, {
+        'url': 'https://monster-siren.hypergryph.com/music/514518',
+        'info_dict': {
+            'id': '514518',
+            'ext': 'wav',
+            'title': 'Heavenly Me (Instrumental)',
+            'album': 'Heavenly Me',
+            'artists': ['塞壬唱片-MSR', 'AIYUE blessed : 理名'],
+            'description': 'md5:ce790b41c932d1ad72eb791d1d8ae598',
+            'thumbnail': r're:https?://web\.hycdn\.cn/siren/pic/.+\.jpg',
        },
    }]

    def _real_extract(self, url):
        audio_id = self._match_id(url)
-        webpage = self._download_webpage(url, audio_id)
-        json_data = self._search_json(
-            r'window\.g_initialProps\s*=', webpage, 'data', audio_id, transform_source=js_to_json)
+        # Song metadata is requested from the API instead of being parsed out of the web page
+        song = self._download_json(f'{self._API_BASE}/song/{audio_id}', audio_id)
+        # Any `code` other than 0 is treated as an error response; the error is
+        # only marked as expected when the API supplied a non-empty message
+        if traverse_obj(song, 'code') != 0:
+            msg = traverse_obj(song, ('msg', {str}, filter))
+            raise ExtractorError(
+                msg or 'API returned an error response', expected=bool(msg))
+
+        # Album details are optional: they are only requested (with fatal=False)
+        # when the song has an `albumCid`, and `album` stays None otherwise
+        album = None
+        if album_id := traverse_obj(song, ('data', 'albumCid', {str})):
+            album = self._download_json(
+                f'{self._API_BASE}/album/{album_id}/detail', album_id, fatal=False)

        return {
            'id': audio_id,
-            'title': traverse_obj(json_data, ('player', 'songDetail', 'name')),
-            'url': traverse_obj(json_data, ('player', 'songDetail', 'sourceUrl')),
-            'ext': 'wav',
            'vcodec': 'none',
-            'artists': traverse_obj(json_data, ('player', 'songDetail', 'artists', ...)),
-            'album': traverse_obj(json_data, ('musicPlay', 'albumDetail', 'name')),
+            # Fields read from the song response's `data` object
+            **traverse_obj(song, ('data', {
+                'title': ('name', {str}),
+                'artists': ('artists', ..., {str}),
+                'subtitles': ({'url': 'lyricUrl'}, all, {subs_list_to_dict(lang='en')}),
+                'url': ('sourceUrl', {url_or_none}),
+            })),
+            # Fields read from the album response's `data` object
+            **traverse_obj(album, ('data', {
+                'album': ('name', {str}),
+                'description': ('intro', {clean_html}),
+                'thumbnail': ('coverUrl', {url_or_none}),
+            })),
        }
--- a/yt_dlp/utils/jslib/__init__.py
+++ b/yt_dlp/utils/jslib/__init__.py
@@ -0,0 +1 @@
+# Utility functions for handling web input based on commonly used JavaScript libraries
--- a/yt_dlp/utils/jslib/devalue.py
+++ b/yt_dlp/utils/jslib/devalue.py
@@ -0,0 +1,167 @@
+from __future__ import annotations
+
+import array
+import base64
+import datetime as dt
+import math
+import re
+
+from .._utils import parse_iso8601
+
+# Set by hand instead of being imported from `typing`, so the imports below
+# never run: they are only needed by annotations, which are not evaluated
+# thanks to `from __future__ import annotations`.
+# NOTE(review): presumably static type checkers still treat this name as true - confirm
+TYPE_CHECKING = False
+if TYPE_CHECKING:
+    import collections.abc
+    import typing
+
+    T = typing.TypeVar('T')
+
+
+# Maps JavaScript typed array names to the `array.array` typecode used to decode them.
+# The 64-bit types use 'q'/'Q' (C `long long`, at least 8 bytes) rather than
+# 'l'/'L' (C `long`), since `long` is only 4 bytes wide on some platforms (e.g. Windows).
+# NOTE(review): 'i'/'I' (C `int`) are assumed to be 4 bytes wide - true on common platforms
+_ARRAY_TYPE_LOOKUP = {
+    'Int8Array': 'b',
+    'Uint8Array': 'B',
+    'Uint8ClampedArray': 'B',
+    'Int16Array': 'h',
+    'Uint16Array': 'H',
+    'Int32Array': 'i',
+    'Uint32Array': 'I',
+    'Float32Array': 'f',
+    'Float64Array': 'd',
+    'BigInt64Array': 'q',
+    'BigUint64Array': 'Q',
+    'ArrayBuffer': 'B',
+}
+
+
+def parse_iter(parsed: typing.Any, /, *, revivers: dict[str, collections.abc.Callable[[list], typing.Any]] | None = None):
+    """
+    Resolve data serialized by the `devalue` JavaScript library
+
+    This is a generator. Recoverable problems are yielded as exception instances
+    (they are not raised), and the resolved data is the generator's return value,
+    i.e. the `value` of the final `StopIteration`. See `parse` for a wrapper
+    that raises the first problem instead.
+
+    @param parsed       Already decoded payload: either one of the special
+                        negative integers below, or a non-empty list whose first
+                        item is the root value. Integers inside of the list are
+                        indices of other items of the same list.
+    @param revivers     Mapping of custom type name to a callable. An entry of the
+                        form `[name, index]` resolves to `callable(<value at index>)`.
+    @raises ValueError  On the first iteration, if `parsed` is neither a valid
+                        special integer nor a non-empty list
+    """
+    # based on https://github.com/Rich-Harris/devalue/blob/f3fd2aa93d79f21746555671f955a897335edb1b/src/parse.js
+    # Cache of already resolved indices, seeded with the special negative values
+    # NOTE(review): -1 and -2 presumably stand for `undefined` and array holes upstream
+    resolved = {
+        -1: None,
+        -2: None,
+        -3: math.nan,
+        -4: math.inf,
+        -5: -math.inf,
+        -6: -0.0,
+    }
+
+    if isinstance(parsed, int) and not isinstance(parsed, bool):
+        if parsed not in resolved or parsed == -2:
+            raise ValueError('invalid integer input')
+        return resolved[parsed]
+    elif not isinstance(parsed, list):
+        raise ValueError('expected int or list as input')
+    elif not parsed:
+        raise ValueError('expected a non-empty list as input')
+
+    if revivers is None:
+        revivers = {}
+    return_value = [None]
+    # Work items are `(container, key, source)`: resolve index `source`
+    # and store the result in `container[key]`
+    stack: list[tuple] = [(return_value, 0, 0)]
+
+    while stack:
+        target, index, source = stack.pop()
+        if isinstance(source, tuple):
+            # Deferred reviver call; `source` is the index of the tagged entry itself,
+            # so repeated references reuse the revived value instead of reviving it again
+            name, source, reviver = source
+            try:
+                resolved[source] = target[index] = reviver(target[index])
+            except Exception as error:
+                yield TypeError(f'failed to parse {source} as {name!r}: {error}')
+                resolved[source] = target[index] = None
+            continue
+
+        # anything but an integer can neither be looked up nor compared below
+        if not isinstance(source, int):
+            yield TypeError(f'invalid index: {source!r}')
+            continue
+
+        if source in resolved:
+            target[index] = resolved[source]
+            continue
+
+        # guard against Python negative indexing
+        if source < 0:
+            yield IndexError(f'invalid index: {source!r}')
+            continue
+
+        try:
+            value = parsed[source]
+        except IndexError as error:
+            yield error
+            continue
+
+        if isinstance(value, list):
+            if value and isinstance(value[0], str):
+                # These types read `value[1]`; report a missing payload instead of crashing
+                if len(value) < 2 and (
+                        value[0] in ('Date', 'RegExp', 'Object', 'BigInt')
+                        or value[0] in revivers or value[0] in _ARRAY_TYPE_LOOKUP):
+                    yield TypeError(f'missing payload at {source}: {value[0]!r}')
+                    target[index] = resolved[source] = None
+                    continue
+
+                # TODO: implement zips `strict=True`
+                if reviver := revivers.get(value[0]):
+                    if not isinstance(value[1], int):
+                        yield TypeError(f'{value[0]!r} must point to an index (index: {source})')
+                        continue
+                    if value[1] == source:
+                        # XXX: avoid infinite loop
+                        # NOTE(review): longer cycles consisting only of revived entries are not detected
+                        yield IndexError(f'{value[0]!r} cannot point to itself (index: {source})')
+                        continue
+                    # inverse order: resolve index, revive value
+                    stack.append((target, index, (value[0], source, reviver)))
+                    stack.append((target, index, value[1]))
+                    continue
+
+                elif value[0] == 'Date':
+                    try:
+                        result = dt.datetime.fromtimestamp(parse_iso8601(value[1]), tz=dt.timezone.utc)
+                    except Exception:
+                        yield ValueError(f'invalid date: {value[1]!r}')
+                        result = None
+
+                elif value[0] == 'Set':
+                    result = [None] * (len(value) - 1)
+                    for offset, new_source in enumerate(value[1:]):
+                        stack.append((result, offset, new_source))
+
+                elif value[0] == 'Map':
+                    # flat `key, value, key, value, ...` sequence -> list of `[key, value]` pairs
+                    result = []
+                    for key, new_source in zip(*(iter(value[1:]),) * 2):
+                        pair = [None, None]
+                        stack.append((pair, 0, key))
+                        stack.append((pair, 1, new_source))
+                        result.append(pair)
+
+                elif value[0] == 'RegExp':
+                    # XXX: use jsinterp to translate regex flags
+                    #      currently ignores `value[2]`
+                    try:
+                        result = re.compile(value[1])
+                    except (re.error, TypeError) as error:
+                        yield ValueError(f'invalid regex at {source}: {error}')
+                        result = None
+
+                elif value[0] == 'Object':
+                    result = value[1]
+
+                elif value[0] == 'BigInt':
+                    try:
+                        result = int(value[1])
+                    except (ValueError, TypeError):
+                        yield ValueError(f'invalid bigint: {value[1]!r}')
+                        result = None
+
+                elif value[0] == 'null':
+                    # flat `key, index, key, index, ...` sequence; keys are used as-is
+                    result = {}
+                    for key, new_source in zip(*(iter(value[1:]),) * 2):
+                        stack.append((result, key, new_source))
+
+                elif value[0] in _ARRAY_TYPE_LOOKUP:
+                    typecode = _ARRAY_TYPE_LOOKUP[value[0]]
+                    try:
+                        # fails on invalid base64 and on data that does not
+                        # divide evenly into items of the requested type
+                        data = base64.b64decode(value[1])
+                        result = array.array(typecode, data).tolist()
+                    except (ValueError, TypeError) as error:
+                        yield ValueError(f'invalid {value[0]} data at {source}: {error}')
+                        result = None
+
+                else:
+                    yield TypeError(f'invalid type at {source}: {value[0]!r}')
+                    result = None
+            else:
+                result = len(value) * [None]
+                for offset, new_source in enumerate(value):
+                    stack.append((result, offset, new_source))
+
+        elif isinstance(value, dict):
+            result = {}
+            for key, new_source in value.items():
+                stack.append((result, key, new_source))
+
+        else:
+            result = value
+
+        # Containers are cached before their items are resolved,
+        # which lets cyclic references terminate
+        target[index] = resolved[source] = result
+
+    return return_value[0]
+
+
+def parse(parsed: typing.Any, /, *, revivers: dict[str, collections.abc.Callable[[typing.Any], typing.Any]] | None = None):
+    """
+    Strict variant of `parse_iter`
+
+    Returns the resolved data, or raises the first problem `parse_iter` yields.
+    """
+    problems = parse_iter(parsed, revivers=revivers)
+    try:
+        first_problem = next(problems)
+    except StopIteration as finished:
+        # no problems were reported; the resolved data is the generator's return value
+        return finished.value
+    raise first_problem
				`@@ -0,0 +1 @@`
				`# Utility functions for handling web input based on commonly used JavaScript libraries`