From ee8133b07779ee15f116daed4a81ee49eb7238ab Mon Sep 17 00:00:00 2001
From: Matteo Abis <1423701+Enucatl@users.noreply.github.com>
Date: Sat, 21 Jun 2025 09:44:21 +0200
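Subject: [PATCH] threads: address CodeQL findings, prefer post caption

Only scan <script type="application/json" ... data-sjs> blocks when
looking for the RelayPrefetchedStreamCache payload, and accept optional
whitespace inside the closing script tag, instead of matching every
<script>...</script> pair on the page.

Prefer the post caption text over the HTML <title> and the
og:description when building the playlist title and description; the
page-derived values are now only used as fallbacks.

Add 'ext': 'mp4' to the metadata dict in ThreadsIE touched by the first
hunk.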

---
 yt_dlp/extractor/threads.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)
diff --git a/yt_dlp/extractor/threads.py b/yt_dlp/extractor/threads.py
index 0aa5ca6dc..3f01c0f34 100644
--- a/yt_dlp/extractor/threads.py
+++ b/yt_dlp/extractor/threads.py
@@ -100,6 +100,7 @@ class ThreadsIE(InfoExtractor):
                 'channel_is_verified': False,
                 'channel_url': 'https://www.threads.com/@enucatl',
                 'description': '',
+                'ext': 'mp4',
                 'id': 'DLIrVcmPuFA7g5tn9OzPjsA-R8qU2HPJv_FzCo0',
                 'like_count': int,
                 'timestamp': 1745582191,
@@ -126,13 +127,15 @@ def _real_extract(self, url):
         webpage = self._download_webpage(url, post_id, note='Downloading post page')
 
         json_data = None
-        # Match single scripts
-        for script in re.findall(r'<script[^>]*>(.*?)</script>', webpage, re.DOTALL | re.IGNORECASE):
-            # Heuristic check: if the script doesn't contain "RelayPrefetchedStreamCache" and the post_id,
-            # it's definitely not the one we want. Skip it quickly.
-            if 'RelayPrefetchedStreamCache' not in script or post_id not in script:
-                continue
 
+        json_scripts = re.findall(
+            r'<script type="application/json"[^>]*?\sdata-sjs[^>]*?>(.*?)<\s*/script\s*>',
+            webpage,
+            re.DOTALL | re.IGNORECASE,
+        )
+        for script in json_scripts:
+            if post_id not in script or 'RelayPrefetchedStreamCache' not in script:
+                continue
             # This script is a candidate. Try to parse it.
             # We use fatal=False because we expect some candidates to fail parsing.
             candidate_json = self._search_json(r'"result":', script, 'result data', post_id, fatal=False)
@@ -174,16 +177,17 @@ def _real_extract(self, url):
 
         # This metadata applies to the whole post (the playlist).
         uploader = traverse_obj(main_post, ('user', 'username'))
+        caption = traverse_obj(main_post, ('caption', 'text'))
         title = (
-            strip_or_none(remove_end(self._html_extract_title(webpage), '• Threads'))
-            or traverse_obj(main_post, ('caption', 'text'))
+            caption
+            or strip_or_none(remove_end(self._html_extract_title(webpage), '• Threads'))
             or f'Post by {uploader}'
         )
 
         playlist_metadata = {
             'id': post_id,
             'title': title,
-            'description': self._og_search_description(webpage) or traverse_obj(main_post, ('caption', 'text')),
+            'description': caption or self._og_search_description(webpage),
             'uploader': uploader,
             'uploader_id': traverse_obj(main_post, ('user', 'pk')),
             'uploader_url': f'https://www.threads.com/@{uploader}',