[ie/niconico:live] Fix extractor and downloader (#13158)

Authored by: doe1080
2025-06-27 17:08:32 +00:00 · 2025-06-27 02:45:03 +09:00 · 2025-06-27 02:45:03 +09:00 · 06c1a8cdff
commit 06c1a8cdff
parent 99b85ac102
3 changed files with 66 additions and 69 deletions
--- a/yt_dlp/downloader/niconico.py
+++ b/yt_dlp/downloader/niconico.py
@ -5,47 +5,46 @@
 from .common import FileDownloader
 from .external import FFmpegFD
 from ..networking import Request
-from ..utils import DownloadError, str_or_none, try_get
+from ..networking.websocket import WebSocketResponse
 from ..utils import DownloadError, str_or_none, truncate_string
 from ..utils.traversal import traverse_obj
 class NiconicoLiveFD(FileDownloader):
    """ Downloads niconico live without being stopped """
    def real_download(self, filename, info_dict):
-        video_id = info_dict['video_id']
+        video_id = info_dict['id']
-        ws_url = info_dict['url']
+        opts = info_dict['downloader_options']
-        ws_extractor = info_dict['ws']
+        quality, ws_extractor, ws_url = opts['max_quality'], opts['ws'], opts['ws_url']
        ws_origin_host = info_dict['origin']
        live_quality = info_dict.get('live_quality', 'high')
        live_latency = info_dict.get('live_latency', 'high')
        dl = FFmpegFD(self.ydl, self.params or {})
        new_info_dict = info_dict.copy()
-        new_info_dict.update({
+        new_info_dict['protocol'] = 'm3u8'
            'protocol': 'm3u8',
        })
        def communicate_ws(reconnect):
-            if reconnect:
+            # Support --load-info-json as if it is a reconnect attempt
-                ws = self.ydl.urlopen(Request(ws_url, headers={'Origin': f'https://{ws_origin_host}'}))
+            if reconnect or not isinstance(ws_extractor, WebSocketResponse):
                ws = self.ydl.urlopen(Request(
                    ws_url, headers={'Origin': 'https://live.nicovideo.jp'}))
                if self.ydl.params.get('verbose', False):
-                    self.to_screen('[debug] Sending startWatching request')
+                    self.write_debug('Sending startWatching request')
                ws.send(json.dumps({
                    'type': 'startWatching',
                    'data': {
                        'reconnect': True,
                        'room': {
                            'commentable': True,
                            'protocol': 'webSocket',
                        },
                        'stream': {
                            'quality': live_quality,
                            'protocol': 'hls+fmp4',
                            'latency': live_latency,
                            'accessRightMethod': 'single_cookie',
                            'chasePlay': False,
                            'latency': 'high',
                            'protocol': 'hls',
                            'quality': quality,
                        },
                        'room': {
                            'protocol': 'webSocket',
                            'commentable': True,
                        },
                        'reconnect': True,
                    },
                    'type': 'startWatching',
                }))
            else:
                ws = ws_extractor
@ -58,7 +57,6 @@ def communicate_ws(reconnect):
                    if not data or not isinstance(data, dict):
                        continue
                    if data.get('type') == 'ping':
                        # pong back
                        ws.send(r'{"type":"pong"}')
                        ws.send(r'{"type":"keepSeat"}')
                    elif data.get('type') == 'disconnect':
@ -66,12 +64,10 @@ def communicate_ws(reconnect):
                        return True
                    elif data.get('type') == 'error':
                        self.write_debug(data)
-                        message = try_get(data, lambda x: x['body']['code'], str) or recv
+                        message = traverse_obj(data, ('body', 'code', {str_or_none}), default=recv)
                        return DownloadError(message)
                    elif self.ydl.params.get('verbose', False):
-                        if len(recv) > 100:
+                        self.write_debug(f'Server response: {truncate_string(recv, 100)}')
                            recv = recv[:100] + '...'
                        self.to_screen(f'[debug] Server said: {recv}')
        def ws_main():
            reconnect = False
@ -81,7 +77,8 @@ def ws_main():
                    if ret is True:
                        return
                except BaseException as e:
-                    self.to_screen('[{}] {}: Connection error occured, reconnecting after 10 seconds: {}'.format('niconico:live', video_id, str_or_none(e)))
+                    self.to_screen(
                        f'[niconico:live] {video_id}: Connection error occured, reconnecting after 10 seconds: {e}')
                    time.sleep(10)
                    continue
                finally:
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@ -263,6 +263,9 @@ class InfoExtractor:
                                 * http_chunk_size Chunk size for HTTP downloads
                                 * ffmpeg_args     Extra arguments for ffmpeg downloader (input)
                                 * ffmpeg_args_out Extra arguments for ffmpeg downloader (output)
                                 * ws              (NiconicoLiveFD only) WebSocketResponse
                                 * ws_url          (NiconicoLiveFD only) WebSocket URL
                                 * max_quality     (NiconicoLiveFD only) Max stream quality string
                    * is_dash_periods  Whether the format is a result of merging
                                 multiple DASH periods.
                    RTMP formats can also have the additional fields: page_url,
--- a/yt_dlp/extractor/niconico.py
+++ b/yt_dlp/extractor/niconico.py
@ -4,16 +4,15 @@
 import json
 import re
 import time
 import urllib.parse
 from .common import InfoExtractor, SearchInfoExtractor
 from ..networking import Request
 from ..networking.exceptions import HTTPError
 from ..utils import (
    ExtractorError,
    OnDemandPagedList,
    clean_html,
    determine_ext,
    extract_attributes,
    float_or_none,
    int_or_none,
    parse_bitrate,
@ -22,9 +21,8 @@
    parse_qs,
    parse_resolution,
    qualities,
    remove_start,
    str_or_none,
-    unescapeHTML,
+    truncate_string,
    unified_timestamp,
    update_url_query,
    url_basename,
@ -32,7 +30,11 @@
    urlencode_postdata,
    urljoin,
 )
-from ..utils.traversal import find_element, require, traverse_obj
+from ..utils.traversal import (
    find_element,
    require,
    traverse_obj,
 )
 class NiconicoBaseIE(InfoExtractor):
@ -806,41 +808,39 @@ class NiconicoLiveIE(NiconicoBaseIE):
    def _real_extract(self, url):
        video_id = self._match_id(url)
-        webpage, urlh = self._download_webpage_handle(f'https://live.nicovideo.jp/watch/{video_id}', video_id)
+        webpage = self._download_webpage(url, video_id, expected_status=404)
        if err_msg := traverse_obj(webpage, ({find_element(cls='message')}, {clean_html})):
            raise ExtractorError(err_msg, expected=True)
-        embedded_data = self._parse_json(unescapeHTML(self._search_regex(
+        embedded_data = traverse_obj(webpage, (
-            r'<script\s+id="embedded-data"\s*data-props="(.+?)"', webpage, 'embedded data')), video_id)
+            {find_element(tag='script', id='embedded-data', html=True)},
-
+            {extract_attributes}, 'data-props', {json.loads}))
-        ws_url = traverse_obj(embedded_data, ('site', 'relive', 'webSocketUrl'))
+        frontend_id = traverse_obj(embedded_data, ('site', 'frontendId', {str_or_none}), default='9')
        if not ws_url:
            raise ExtractorError('The live hasn\'t started yet or already ended.', expected=True)
        ws_url = update_url_query(ws_url, {
            'frontend_id': traverse_obj(embedded_data, ('site', 'frontendId')) or '9',
        })
        hostname = remove_start(urllib.parse.urlparse(urlh.url).hostname, 'sp.')
        ws_url = traverse_obj(embedded_data, (
            'site', 'relive', 'webSocketUrl', {url_or_none}, {require('websocket URL')}))
        ws_url = update_url_query(ws_url, {'frontend_id': frontend_id})
        ws = self._request_webpage(
-            Request(ws_url, headers={'Origin': f'https://{hostname}'}),
+            ws_url, video_id, 'Connecting to WebSocket server',
-            video_id=video_id, note='Connecting to WebSocket server')
+            headers={'Origin': 'https://live.nicovideo.jp'})
        self.write_debug('Sending HLS server request')
        ws.send(json.dumps({
            'type': 'startWatching',
            'data': {
                'reconnect': False,
                'room': {
                    'commentable': True,
                    'protocol': 'webSocket',
                },
                'stream': {
                    'quality': 'abr',
                    'protocol': 'hls',
                    'latency': 'high',
                    'accessRightMethod': 'single_cookie',
                    'chasePlay': False,
                    'latency': 'high',
                    'protocol': 'hls',
                    'quality': 'abr',
                },
                'room': {
                    'protocol': 'webSocket',
                    'commentable': True,
                },
                'reconnect': False,
            },
            'type': 'startWatching',
        }))
        while True:
@ -860,17 +860,15 @@ def _real_extract(self, url):
                raise ExtractorError('Disconnected at middle of extraction')
            elif data.get('type') == 'error':
                self.write_debug(recv)
-                message = traverse_obj(data, ('body', 'code')) or recv
+                message = traverse_obj(data, ('body', 'code', {str_or_none}), default=recv)
                raise ExtractorError(message)
            elif self.get_param('verbose', False):
-                if len(recv) > 100:
+                self.write_debug(f'Server response: {truncate_string(recv, 100)}')
                    recv = recv[:100] + '...'
                self.write_debug(f'Server said: {recv}')
        title = traverse_obj(embedded_data, ('program', 'title')) or self._html_search_meta(
            ('og:title', 'twitter:title'), webpage, 'live title', fatal=False)
-        raw_thumbs = traverse_obj(embedded_data, ('program', 'thumbnail')) or {}
+        raw_thumbs = traverse_obj(embedded_data, ('program', 'thumbnail', {dict})) or {}
        thumbnails = []
        for name, value in raw_thumbs.items():
            if not isinstance(value, dict):
@ -897,31 +895,30 @@ def _real_extract(self, url):
                cookie['domain'], cookie['name'], cookie['value'],
                expire_time=unified_timestamp(cookie.get('expires')), path=cookie['path'], secure=cookie['secure'])
        fmt_common = {
            'live_latency': 'high',
            'origin': hostname,
            'protocol': 'niconico_live',
            'video_id': video_id,
            'ws': ws,
        }
        q_iter = (q for q in qualities[1:] if not q.startswith('audio_'))  # ignore initial 'abr'
        a_map = {96: 'audio_low', 192: 'audio_high'}
        formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True)
        for fmt in formats:
            fmt['protocol'] = 'niconico_live'
            if fmt.get('acodec') == 'none':
                fmt['format_id'] = next(q_iter, fmt['format_id'])
            elif fmt.get('vcodec') == 'none':
                abr = parse_bitrate(fmt['url'].lower())
                fmt.update({
                    'abr': abr,
                    'acodec': 'mp4a.40.2',
                    'format_id': a_map.get(abr, fmt['format_id']),
                })
            fmt.update(fmt_common)
        return {
            'id': video_id,
            'title': title,
            'downloader_options': {
                'max_quality': traverse_obj(embedded_data, ('program', 'stream', 'maxQuality', {str})) or 'normal',
                'ws': ws,
                'ws_url': ws_url,
            },
            **traverse_obj(embedded_data, {
                'view_count': ('program', 'statistics', 'watchCount'),
                'comment_count': ('program', 'statistics', 'commentCount'),