From b2b90f6697344a3c8129e88cd1beba370f292057 Mon Sep 17 00:00:00 2001
From: Matteo Abis <1423701+Enucatl@users.noreply.github.com>
Date: Fri, 20 Jun 2025 23:16:15 +0200
Subject: [PATCH 1/4] add threads extractor

---
 yt_dlp/extractor/_extractors.py |   4 +
 yt_dlp/extractor/threads.py     | 278 ++++++++++++++++++++++++++++++++
 2 files changed, 282 insertions(+)
 create mode 100644 yt_dlp/extractor/threads.py

diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index 34c98b537..2ba966565 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -2116,6 +2116,10 @@
     ThisVidMemberIE,
     ThisVidPlaylistIE,
 )
+from .threads import (
+    ThreadsIE,
+    ThreadsIOSIE,
+)
 from .threeqsdn import ThreeQSDNIE
 from .threespeak import (
     ThreeSpeakIE,
diff --git a/yt_dlp/extractor/threads.py b/yt_dlp/extractor/threads.py
new file mode 100644
index 000000000..bc007583f
--- /dev/null
+++ b/yt_dlp/extractor/threads.py
@@ -0,0 +1,278 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    remove_end,
+    strip_or_none,
+    traverse_obj,
+)
+
+
+class ThreadsIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?threads\.(?:net|com)/(?:@[^/]+/)?(?:post|t)/(?P<id>[^/?#&]+)'
+    _NETRC_MACHINE = 'threads'
+    _TESTS = [
+        {
+            'note': 'Post with single video, with username and post',
+            'url': 'https://www.threads.com/@zuck/post/DHV7vTivqWD',
+            'info_dict': {
+                'channel': 'zuck',
+                'channel_is_verified': True,
+                'channel_url': 'https://www.threads.com/@zuck',
+                'description': 'Me finding out Llama hit 1 BILLION downloads.',
+                'ext': 'mp4',
+                'id': 'DHV7vTivqWD',
+                'like_count': int,
+                'thumbnail': str,
+                'timestamp': 1742305717,
+                'title': 'Me finding out Llama hit 1 BILLION downloads.',
+                'upload_date': '20250318',
+                'uploader': 'zuck',
+                'uploader_id': '63055343223',
+                'uploader_url': 'https://www.threads.com/@zuck',
+            },
+        },
+        {
+            'note': 'Post with single video, without username and with t',
+            'url': 'https://www.threads.com/t/DHV7vTivqWD',
+            'info_dict': {
+                'channel': 'zuck',
+                'channel_is_verified': True,
+                'channel_url': 'https://www.threads.com/@zuck',
+                'description': 'Me finding out Llama hit 1 BILLION downloads.',
+                'ext': 'mp4',
+                'id': 'DHV7vTivqWD',
+                'like_count': int,
+                'thumbnail': str,
+                'timestamp': 1742305717,
+                'title': 'Me finding out Llama hit 1 BILLION downloads.',
+                'upload_date': '20250318',
+                'uploader': 'zuck',
+                'uploader_id': '63055343223',
+                'uploader_url': 'https://www.threads.com/@zuck',
+            },
+        },
+        {
+            'note': 'Post with carousel of 2 images and 1 video',
+            'url': 'https://www.threads.com/@zuck/post/DJDhoQfxb43',
+            'info_dict': {
+                'channel': 'zuck',
+                'channel_is_verified': True,
+                'channel_url': 'https://www.threads.com/@zuck',
+                'description': 'md5:9146c2c42fd53aba9090f61ccfd64fc8',
+                'id': 'DJDhoQfxb43',
+                'like_count': int,
+                'timestamp': 1745982529,
+                'title': 'md5:9146c2c42fd53aba9090f61ccfd64fc8',
+                'upload_date': '20250430',
+                'uploader': 'zuck',
+                'uploader_id': '63055343223',
+                'uploader_url': 'https://www.threads.com/@zuck',
+            },
+            'playlist_count': 3,
+        },
+        {
+            'note': 'Post with 1 image',
+            'url': 'https://www.threads.com/@zuck/post/DI3mC0GxkYA',
+            'info_dict': {
+                'channel': 'zuck',
+                'channel_is_verified': True,
+                'channel_url': 'https://www.threads.com/@zuck',
+                'description': 'md5:e292006574f5deb5552c1ad677cee8dd',
+                'ext': 'webp',
+                'id': 'DI3mC0GxkYA',
+                'like_count': int,
+                'timestamp': 1745582191,
+                'title': 'md5:e292006574f5deb5552c1ad677cee8dd',
+                'upload_date': '20250425',
+                'uploader': 'zuck',
+                'uploader_id': '63055343223',
+                'uploader_url': 'https://www.threads.com/@zuck',
+            },
+        },
+        {
+            'note': 'Private Post',
+            'url': 'https://www.threads.com/@enucatl/post/DLIrVcmPuFA7g5tn9OzPjsA-R8qU2HPJv_FzCo0',
+            'info_dict': {
+                'channel': 'enucatl',
+                'channel_is_verified': False,
+                'channel_url': 'https://www.threads.com/@enucatl',
+                'description': '',
+                'id': 'DLIrVcmPuFA7g5tn9OzPjsA-R8qU2HPJv_FzCo0',
+                'like_count': int,
+                'timestamp': 1745582191,
+                'title': '',
+                'upload_date': '20250620',
+                'uploader': 'enucatl',
+                'uploader_id': '63055343223',
+                'uploader_url': 'https://www.threads.com/@enucatl',
+            },
+            'skip': 'private account, requires authentication',
+        },
+    ]
+
+    def _perform_login(self, username, password):
+        # We are not implementing direct login. Cookies are preferred.
+        self.raise_login_required(
+            'Login with username/password is not supported. '
+            'Use --cookies or --cookies-from-browser to provide authentication.',
+            method='cookies',
+        )
+
+    def _real_extract(self, url):
+        post_id = self._match_id(url)
+        webpage = self._download_webpage(url, post_id, note='Downloading post page')
+
+        json_data = None
+        # Match single scripts
+        for script in re.findall(r'<script[^>]*>(.*?)</script>', webpage, re.DOTALL):
+            # Heuristic check: if the script doesn't contain "RelayPrefetchedStreamCache" and the post_id,
+            # it's definitely not the one we want. Skip it quickly.
+            if 'RelayPrefetchedStreamCache' not in script or post_id not in script:
+                continue
+
+            # This script is a candidate. Try to parse it.
+            # We use fatal=False because we expect some candidates to fail parsing.
+            candidate_json = self._search_json(r'"result":', script, 'result data', post_id, fatal=False)
+
+            if not candidate_json:
+                continue
+
+            post_data = traverse_obj(
+                candidate_json,
+                (
+                    'data',
+                    'data',
+                    'edges',
+                ),
+            )
+
+            if post_data is not None:
+                json_data = post_data
+                break
+
+        if not json_data:
+            self.raise_no_formats(
+                'Could not extract post data. The post may be private or deleted. You may need to log in.',
+                expected=True,
+            )
+
+        main_post = None
+        for node in json_data:
+            for item in traverse_obj(node, ('node', 'thread_items'), default=[]):
+                post_candidate = item.get('post')
+                if traverse_obj(post_candidate, 'code') == post_id:
+                    main_post = post_candidate
+                    break
+            if main_post:
+                break
+
+        if not main_post:
+            self.raise_no_formats('Could not find post data matching the post ID.', expected=True)
+
+        # This metadata applies to the whole post (the playlist).
+        uploader = traverse_obj(main_post, ('user', 'username'))
+        title = (
+            strip_or_none(remove_end(self._html_extract_title(webpage), '• Threads'))
+            or traverse_obj(main_post, ('caption', 'text'))
+            or f'Post by {uploader}'
+        )
+
+        playlist_metadata = {
+            'id': post_id,
+            'title': title,
+            'description': self._og_search_description(webpage) or traverse_obj(main_post, ('caption', 'text')),
+            'uploader': uploader,
+            'uploader_id': traverse_obj(main_post, ('user', 'pk')),
+            'uploader_url': f'https://www.threads.com/@{uploader}',
+            'channel': uploader,
+            'channel_url': f'https://www.threads.com/@{uploader}',
+            'channel_is_verified': traverse_obj(main_post, ('user', 'is_verified')),
+            'timestamp': int_or_none(main_post.get('taken_at')),
+            'like_count': int_or_none(main_post.get('like_count')),
+        }
+
+        media_list = main_post.get('carousel_media') or [main_post]
+        playlist_entries = []
+
+        for i, media in enumerate(media_list):
+            entry_id = f'{post_id}_{i + 1}' if len(media_list) > 1 else post_id
+
+            # --- VIDEO ---
+            if media.get('video_versions'):
+                formats = []
+                for video in media.get('video_versions'):
+                    formats.append({
+                        # 'format_id' is optional, yt-dlp can generate it
+                        'url': video.get('url'),
+                        'width': int_or_none(video.get('width')),
+                        'height': int_or_none(video.get('height')),
+                    })
+
+                # Create the entry for this video
+                playlist_entries.append({
+                    'id': entry_id,
+                    'title': title,  # The title is shared by all entries
+                    'formats': formats,
+                    'thumbnail': traverse_obj(media, ('image_versions2', 'candidates', 0, 'url')),
+                })
+                continue  # Move on to the next media item
+
+            # --- IMAGE ---
+            image_candidates = traverse_obj(media, ('image_versions2', 'candidates'))
+            if image_candidates:
+                best_image = image_candidates[0]
+                playlist_entries.append({
+                    'id': entry_id,
+                    'title': title,
+                    'url': best_image.get('url'),
+                    'ext': determine_ext(best_image.get('url'), 'jpg'),
+                    'width': int_or_none(best_image.get('width')),
+                    'height': int_or_none(best_image.get('height')),
+                    'vcodec': 'none',  # This tells yt-dlp it's an image
+                })
+
+        if not playlist_entries:
+            self.raise_no_formats('This post contains no downloadable video or images.', expected=True)
+
+        if len(playlist_entries) == 1:
+            return {**playlist_entries[0], **playlist_metadata}
+
+        return self.playlist_result(playlist_entries, **playlist_metadata)
+
+
+class ThreadsIOSIE(InfoExtractor):
+    IE_DESC = 'iOS barcelona:// URL'
+    _VALID_URL = r'barcelona://media\?shortcode=(?P<id>[^/?#&]+)'
+    _TESTS = [
+        {
+            'url': 'barcelona://media?shortcode=C6fDehepo5D',
+            'info_dict': {
+                'channel': 'saopaulofc',
+                'channel_is_verified': bool,
+                'channel_url': 'https://www.threads.com/@saopaulofc',
+                'description': 'md5:0c36a7e67e1517459bc0334dba932164',
+                'ext': 'mp4',
+                'id': 'C6fDehepo5D',
+                'like_count': int,
+                'thumbnail': r're:^https?://.*\.jpg',
+                'timestamp': 1714694014,
+                'title': 'md5:be7fe42330e2e78e969ca30254535d0b',
+                'upload_date': '20240502',
+                'uploader': 'saopaulofc',
+                'uploader_id': '63360239523',
+                'uploader_url': 'https://www.threads.com/@saopaulofc',
+            },
+            'add_ie': ['Threads'],
+        },
+    ]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        # Threads ignores the username in the URL and redirects to the right post,
+        # so we use the /t/ form and don't need to look the username up
+        return self.url_result(f'https://www.threads.net/t/{video_id}', ThreadsIE, video_id)

From 5386879dd59fa9b83131f42332da2df273ac0a7c Mon Sep 17 00:00:00 2001
From: Matteo Abis <1423701+Enucatl@users.noreply.github.com>
Date: Sat, 21 Jun 2025 08:32:15 +0200
Subject: [PATCH 2/4] allow matching scripts independent of case

---
 yt_dlp/extractor/threads.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/yt_dlp/extractor/threads.py b/yt_dlp/extractor/threads.py
index bc007583f..0aa5ca6dc 100644
--- a/yt_dlp/extractor/threads.py
+++ b/yt_dlp/extractor/threads.py
@@ -127,7 +127,7 @@ def _real_extract(self, url):
         json_data = None
         # Match single scripts
-        for script in re.findall(r'<script[^>]*>(.*?)</script>', webpage, re.DOTALL):
+        for script in re.findall(r'<script[^>]*>(.*?)</script>', webpage, re.DOTALL | re.IGNORECASE):
             # Heuristic check: if the script doesn't contain "RelayPrefetchedStreamCache" and the post_id,
             # it's definitely not the one we want. Skip it quickly.
             if 'RelayPrefetchedStreamCache' not in script or post_id not in script:
                 continue

From ee8133b07779ee15f116daed4a81ee49eb7238ab Mon Sep 17 00:00:00 2001
From: Matteo Abis <1423701+Enucatl@users.noreply.github.com>
Date: Sat, 21 Jun 2025 09:44:21 +0200
Subject: [PATCH 3/4] codeql fixes

---
 yt_dlp/extractor/threads.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/yt_dlp/extractor/threads.py b/yt_dlp/extractor/threads.py
index 0aa5ca6dc..3f01c0f34 100644
--- a/yt_dlp/extractor/threads.py
+++ b/yt_dlp/extractor/threads.py
@@ -100,6 +100,7 @@ class ThreadsIE(InfoExtractor):
                 'channel_is_verified': False,
                 'channel_url': 'https://www.threads.com/@enucatl',
                 'description': '',
+                'ext': 'mp4',
                 'id': 'DLIrVcmPuFA7g5tn9OzPjsA-R8qU2HPJv_FzCo0',
                 'like_count': int,
                 'timestamp': 1745582191,
@@ -126,13 +127,15 @@ def _real_extract(self, url):
         webpage = self._download_webpage(url, post_id, note='Downloading post page')

         json_data = None
-        # Match single scripts
-        for script in re.findall(r'<script[^>]*>(.*?)</script>', webpage, re.DOTALL | re.IGNORECASE):
-            # Heuristic check: if the script doesn't contain "RelayPrefetchedStreamCache" and the post_id,
-            # it's definitely not the one we want. Skip it quickly.
-            if 'RelayPrefetchedStreamCache' not in script or post_id not in script:
-                continue
+        json_scripts = re.findall(
+            r'