From d22436e5dc7c6808d931e27cbb967b1b2a33c17c Mon Sep 17 00:00:00 2001
From: bashonly <88596187+bashonly@users.noreply.github.com>
Date: Mon, 29 Dec 2025 15:46:29 -0600
Subject: [PATCH] [ie/youtube] Support comment subthreads (#15419)

* Support newly rolled out comment "subthreads"
* Fix comments extraction: all replies were being missed
* Add a `max-depth` element to the `max_comments` extractor-arg
* Fully remove the deprecated `max_comment_depth` extractor-arg

Closes #15303
Authored by: bashonly
---
 README.md                          |  5 +-
 yt_dlp/extractor/youtube/_base.py  |  2 +-
 yt_dlp/extractor/youtube/_video.py | 75 +++++++++++++++++++++++++-----
 3 files changed, 68 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index 10a2290c5d..41868cb4a9 100644
--- a/README.md
+++ b/README.md
@@ -1859,8 +1859,9 @@ The following extractors use this feature:
 * `player_js_variant`: The player javascript variant to use for n/sig deciphering. The known variants are: `main`, `tcc`, `tce`, `es5`, `es6`, `tv`, `tv_es6`, `phone`, `tablet`. The default is `main`, and the others are for debugging purposes. You can use `actual` to go with what is prescribed by the site
 * `player_js_version`: The player javascript version to use for n/sig deciphering, in the format of `signature_timestamp@hash` (e.g. `20348@0004de42`). The default is to use what is prescribed by the site, and can be selected with `actual`
 * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side)
-* `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all`
-    * E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total
+* `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread,max-depth`. Default is `all,all,all,all,all`
+    * A `max-depth` value of `1` will discard all replies, regardless of the `max-replies` or `max-replies-per-thread` values given
+    * E.g. `all,all,1000,10,2` will get a maximum of 1000 replies total, with up to 10 replies per thread, and only 2 levels of depth (i.e. top-level comments plus their immediate replies). `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total
 * `formats`: Change the types of formats to return. `dashy` (convert HTTP to DASH), `duplicate` (identical content but different URLs or protocol; includes `dashy`), `incomplete` (cannot be downloaded completely - live dash and post-live m3u8), `missing_pot` (include formats that require a PO Token but are missing one)
 * `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others
 * `innertube_key`: Innertube API key to use for all API requests. By default, no API key is used
diff --git a/yt_dlp/extractor/youtube/_base.py b/yt_dlp/extractor/youtube/_base.py
index 114eee821b..26925090a7 100644
--- a/yt_dlp/extractor/youtube/_base.py
+++ b/yt_dlp/extractor/youtube/_base.py
@@ -1065,7 +1065,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
             return next_continuation
 
         return traverse_obj(renderer, (
-            ('contents', 'items', 'rows'), ..., 'continuationItemRenderer',
+            ('contents', 'items', 'rows', 'subThreads'), ..., 'continuationItemRenderer',
             ('continuationEndpoint', ('button', 'buttonRenderer', 'command')),
         ), get_all=False, expected_type=cls._extract_continuation_ep_data)
 
diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py
index edf71c9faa..6961829fe1 100644
--- a/yt_dlp/extractor/youtube/_video.py
+++ b/yt_dlp/extractor/youtube/_video.py
@@ -1660,6 +1660,41 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'live_status': 'not_live',
         },
         'params': {'skip_download': True},
+    }, {
+        # Threaded comments with 4 levels of depth
+        'url': 'https://www.youtube.com/watch?v=f6HNySwZV4c',
+        'info_dict': {
+            'id': 'f6HNySwZV4c',
+            'ext': 'mp4',
+            'title': 'dlptestvideo2',
+            'description': '',
+            'media_type': 'video',
+            'uploader': 'cole-dlp-test-acc',
+            'uploader_id': '@coletdjnz',
+            'uploader_url': 'https://www.youtube.com/@coletdjnz',
+            'channel': 'cole-dlp-test-acc',
+            'channel_id': 'UCiu-3thuViMebBjw_5nWYrA',
+            'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
+            'view_count': int,
+            'like_count': int,
+            'age_limit': 0,
+            'duration': 5,
+            'thumbnail': 'https://i.ytimg.com/vi/f6HNySwZV4c/maxresdefault.jpg',
+            'categories': ['People & Blogs'],
+            'tags': [],
+            'timestamp': 1709856007,
+            'upload_date': '20240308',
+            'release_timestamp': 1709856007,
+            'release_date': '20240308',
+            'playable_in_embed': True,
+            'availability': 'public',
+            'live_status': 'not_live',
+            'comment_count': 15,
+        },
+        'params': {
+            'skip_download': True,
+            'getcomments': True,
+        },
     }]
     _WEBPAGE_TESTS = [{
         # <object>
@@ -2437,6 +2472,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         def extract_thread(contents, entity_payloads):
             if not parent:
                 tracker['current_page_thread'] = 0
+
+            if max_depth < tracker['current_depth']:
+                return
+
             for content in contents:
                 if not parent and tracker['total_parent_comments'] >= max_parents:
                     yield
@@ -2480,6 +2519,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                         'Detected YouTube comments looping. Stopping comment extraction '
                         f'{"for this thread" if parent else ""} as we probably cannot get any more.')
                     yield
+                    break  # Safeguard for recursive call in subthreads code path below
                 else:
                     tracker['seen_comment_ids'].add(comment['id'])
 
@@ -2492,12 +2532,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     comment_thread_renderer, lambda x: x['replies']['commentRepliesRenderer'], dict)
 
                 if comment_replies_renderer:
+                    subthreads = traverse_obj(comment_replies_renderer, (
+                        'subThreads', lambda _, v: v['commentThreadRenderer']))
+                    # Recursively extract from `commentThreadRenderer`s in `subThreads`
+                    if subthreads:
+                        tracker['current_depth'] += 1
+                        for entry in extract_thread(subthreads, entity_payloads):
+                            if entry:
+                                yield entry
+                        tracker['current_depth'] -= 1
+                        # All of the subThreads' `continuationItemRenderer`s were within the nested
+                        # `commentThreadRenderer`s and are now exhausted, so avoid unnecessary recursion below
+                        continue
+
                     tracker['current_page_thread'] += 1
+                    tracker['current_depth'] += 1
+                    # Recursively extract from `continuationItemRenderer`s in `subThreads`
                     comment_entries_iter = self._comment_entries(
                         comment_replies_renderer, ytcfg, video_id,
-                        parent=comment.get('id'), tracker=tracker)
+                        parent=comment_id, tracker=tracker)
                     yield from itertools.islice(comment_entries_iter, min(
                         max_replies_per_thread, max(0, max_replies - tracker['total_reply_comments'])))
+                    tracker['current_depth'] -= 1
 
         # Keeps track of counts across recursive calls
         if not tracker:
@@ -2509,19 +2565,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'total_reply_comments': 0,
                 'seen_comment_ids': set(),
                 'pinned_comment_ids': set(),
+                'current_depth': 1,
             }
 
-        # TODO: Deprecated
-        # YouTube comments have a max depth of 2
-        max_depth = int_or_none(get_single_config_arg('max_comment_depth'))
-        if max_depth:
-            self._downloader.deprecated_feature('[youtube] max_comment_depth extractor argument is deprecated. '
-                                                'Set max replies in the max-comments extractor argument instead')
-        if max_depth == 1 and parent:
-            return
+        _max_comments, max_parents, max_replies, max_replies_per_thread, max_depth, *_ = (
+            int_or_none(p, default=sys.maxsize) for p in self._configuration_arg('max_comments') + [''] * 5)
 
-        _max_comments, max_parents, max_replies, max_replies_per_thread, *_ = (
-            int_or_none(p, default=sys.maxsize) for p in self._configuration_arg('max_comments') + [''] * 4)
+        if max_depth < tracker['current_depth']:
+            return
 
         continuation = self._extract_continuation(root_continuation_data)
 
@@ -2550,6 +2601,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     note_prefix = '    Downloading comment API JSON reply thread %d %s' % (
                         tracker['current_page_thread'], comment_prog_str)
             else:
+                # TODO: `parent` is only truthy in this code path with YT's legacy (non-threaded) comment view
                 note_prefix = '{}Downloading comment{} API JSON page {} {}'.format(
                     '       ' if parent else '', ' replies' if parent else '',
                     page_num, comment_prog_str)
@@ -2566,6 +2618,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix,
                     check_get_keys=check_get_keys)
             except ExtractorError as e:
+                # TODO: This code path is not reached since eb5bdbfa70126c7d5355cc0954b63720522e462c
                 # Ignore incomplete data error for replies if retries didn't work.
                 # This is to allow any other parent comments and comment threads to be downloaded.
                 # See: https://github.com/yt-dlp/yt-dlp/issues/4669