From d22436e5dc7c6808d931e27cbb967b1b2a33c17c Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 29 Dec 2025 15:46:29 -0600 Subject: [PATCH] [ie/youtube] Support comment subthreads (#15419) * Support newly rolled out comment "subthreads" * Fix comments extraction: all replies were being missed * Add a `max-depth` element to the `max_comments` extractor-arg * Fully remove the deprecated `max_comment_depth` extractor-arg Closes #15303 Authored by: bashonly --- README.md | 5 +- yt_dlp/extractor/youtube/_base.py | 2 +- yt_dlp/extractor/youtube/_video.py | 75 +++++++++++++++++++++++++----- 3 files changed, 68 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 10a2290c5d..41868cb4a9 100644 --- a/README.md +++ b/README.md @@ -1859,8 +1859,9 @@ The following extractors use this feature: * `player_js_variant`: The player javascript variant to use for n/sig deciphering. The known variants are: `main`, `tcc`, `tce`, `es5`, `es6`, `tv`, `tv_es6`, `phone`, `tablet`. The default is `main`, and the others are for debugging purposes. You can use `actual` to go with what is prescribed by the site * `player_js_version`: The player javascript version to use for n/sig deciphering, in the format of `signature_timestamp@hash` (e.g. `20348@0004de42`). The default is to use what is prescribed by the site, and can be selected with `actual` * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) -* `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all` - * E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total +* `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread,max-depth`. Default is `all,all,all,all,all` + * A `max-depth` value of `1` will discard all replies, regardless of the `max-replies` or `max-replies-per-thread` values given + * E.g. `all,all,1000,10,2` will get a maximum of 1000 replies total, with up to 10 replies per thread, and only 2 levels of depth (i.e. top-level comments plus their immediate replies). `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total * `formats`: Change the types of formats to return. `dashy` (convert HTTP to DASH), `duplicate` (identical content but different URLs or protocol; includes `dashy`), `incomplete` (cannot be downloaded completely - live dash and post-live m3u8), `missing_pot` (include formats that require a PO Token but are missing one) * `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others * `innertube_key`: Innertube API key to use for all API requests. By default, no API key is used diff --git a/yt_dlp/extractor/youtube/_base.py b/yt_dlp/extractor/youtube/_base.py index 114eee821b..26925090a7 100644 --- a/yt_dlp/extractor/youtube/_base.py +++ b/yt_dlp/extractor/youtube/_base.py @@ -1065,7 +1065,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return next_continuation return traverse_obj(renderer, ( - ('contents', 'items', 'rows'), ..., 'continuationItemRenderer', + ('contents', 'items', 'rows', 'subThreads'), ..., 'continuationItemRenderer', ('continuationEndpoint', ('button', 'buttonRenderer', 'command')), ), get_all=False, expected_type=cls._extract_continuation_ep_data) diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index edf71c9faa..6961829fe1 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -1660,6 +1660,41 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'live_status': 'not_live', }, 'params': {'skip_download': True}, + }, { + # Threaded comments with 4 levels of depth + 'url': 'https://www.youtube.com/watch?v=f6HNySwZV4c', + 'info_dict': { + 'id': 'f6HNySwZV4c', + 'ext': 'mp4', + 'title': 'dlptestvideo2', + 'description': '', + 'media_type': 'video', + 'uploader': 'cole-dlp-test-acc', + 'uploader_id': '@coletdjnz', + 'uploader_url': 'https://www.youtube.com/@coletdjnz', + 'channel': 'cole-dlp-test-acc', + 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'view_count': int, + 'like_count': int, + 'age_limit': 0, + 'duration': 5, + 'thumbnail': 'https://i.ytimg.com/vi/f6HNySwZV4c/maxresdefault.jpg', + 'categories': ['People & Blogs'], + 'tags': [], + 'timestamp': 1709856007, + 'upload_date': '20240308', + 'release_timestamp': 1709856007, + 'release_date': '20240308', + 'playable_in_embed': True, + 'availability': 'public', + 'live_status': 'not_live', + 'comment_count': 15, + }, + 'params': { + 'skip_download': True, + 'getcomments': True, + }, }] _WEBPAGE_TESTS = [{ # @@ -2437,6 +2472,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def extract_thread(contents, entity_payloads): if not parent: tracker['current_page_thread'] = 0 + + if max_depth < tracker['current_depth']: + return + for content in contents: if not parent and tracker['total_parent_comments'] >= max_parents: yield @@ -2480,6 +2519,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'Detected YouTube comments looping. Stopping comment extraction ' f'{"for this thread" if parent else ""} as we probably cannot get any more.') yield + break # Safeguard for recursive call in subthreads code path below else: tracker['seen_comment_ids'].add(comment['id']) @@ -2492,12 +2532,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor): comment_thread_renderer, lambda x: x['replies']['commentRepliesRenderer'], dict) if comment_replies_renderer: + subthreads = traverse_obj(comment_replies_renderer, ( + 'subThreads', lambda _, v: v['commentThreadRenderer'])) + # Recursively extract from `commentThreadRenderer`s in `subThreads` + if subthreads: + tracker['current_depth'] += 1 + for entry in extract_thread(subthreads, entity_payloads): + if entry: + yield entry + tracker['current_depth'] -= 1 + # All of the subThreads' `continuationItemRenderer`s were within the nested + # `commentThreadRenderer`s and are now exhausted, so avoid unnecessary recursion below + continue + tracker['current_page_thread'] += 1 + tracker['current_depth'] += 1 + # Recursively extract from `continuationItemRenderer`s in `subThreads` comment_entries_iter = self._comment_entries( comment_replies_renderer, ytcfg, video_id, - parent=comment.get('id'), tracker=tracker) + parent=comment_id, tracker=tracker) yield from itertools.islice(comment_entries_iter, min( max_replies_per_thread, max(0, max_replies - tracker['total_reply_comments']))) + tracker['current_depth'] -= 1 # Keeps track of counts across recursive calls if not tracker: @@ -2509,19 +2565,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'total_reply_comments': 0, 'seen_comment_ids': set(), 'pinned_comment_ids': set(), + 'current_depth': 1, } - # TODO: Deprecated - # YouTube comments have a max depth of 2 - max_depth = int_or_none(get_single_config_arg('max_comment_depth')) - if max_depth: - self._downloader.deprecated_feature('[youtube] max_comment_depth extractor argument is deprecated. ' - 'Set max replies in the max-comments extractor argument instead') - if max_depth == 1 and parent: - return + _max_comments, max_parents, max_replies, max_replies_per_thread, max_depth, *_ = ( + int_or_none(p, default=sys.maxsize) for p in self._configuration_arg('max_comments') + [''] * 5) - _max_comments, max_parents, max_replies, max_replies_per_thread, *_ = ( - int_or_none(p, default=sys.maxsize) for p in self._configuration_arg('max_comments') + [''] * 4) + if max_depth < tracker['current_depth']: + return continuation = self._extract_continuation(root_continuation_data) @@ -2550,6 +2601,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): note_prefix = ' Downloading comment API JSON reply thread %d %s' % ( tracker['current_page_thread'], comment_prog_str) else: + # TODO: `parent` is only truthy in this code path with YT's legacy (non-threaded) comment view note_prefix = '{}Downloading comment{} API JSON page {} {}'.format( ' ' if parent else '', ' replies' if parent else '', page_num, comment_prog_str) @@ -2566,6 +2618,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix, check_get_keys=check_get_keys) except ExtractorError as e: + # TODO: This code path is not reached since eb5bdbfa70126c7d5355cc0954b63720522e462c # Ignore incomplete data error for replies if retries didn't work. # This is to allow any other parent comments and comment threads to be downloaded. # See: https://github.com/yt-dlp/yt-dlp/issues/4669