From b2b90f6697344a3c8129e88cd1beba370f292057 Mon Sep 17 00:00:00 2001
From: Matteo Abis <1423701+Enucatl@users.noreply.github.com>
Date: Fri, 20 Jun 2025 23:16:15 +0200
Subject: [PATCH 1/4] add threads extractor

---
 yt_dlp/extractor/_extractors.py |   4 +
 yt_dlp/extractor/threads.py     | 278 ++++++++++++++++++++++++++++++++
 2 files changed, 282 insertions(+)
 create mode 100644 yt_dlp/extractor/threads.py

diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index 34c98b537..2ba966565 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -2116,6 +2116,10 @@
     ThisVidMemberIE,
     ThisVidPlaylistIE,
 )
+from .threads import (
+    ThreadsIE,
+    ThreadsIOSIE,
+)
 from .threeqsdn import ThreeQSDNIE
 from .threespeak import (
     ThreeSpeakIE,
diff --git a/yt_dlp/extractor/threads.py b/yt_dlp/extractor/threads.py
new file mode 100644
index 000000000..bc007583f
--- /dev/null
+++ b/yt_dlp/extractor/threads.py
@@ -0,0 +1,278 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    remove_end,
+    strip_or_none,
+    traverse_obj,
+)
+
+
+class ThreadsIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?threads\.(?:net|com)/(?:@[^/]+/)?(?:post|t)/(?P<id>[^/?#&]+)'
+    _NETRC_MACHINE = 'threads'
+    _TESTS = [
+        {
+            'note': 'Post with single video, with username and post',
+            'url': 'https://www.threads.com/@zuck/post/DHV7vTivqWD',
+            'info_dict': {
+                'channel': 'zuck',
+                'channel_is_verified': True,
+                'channel_url': 'https://www.threads.com/@zuck',
+                'description': 'Me finding out Llama hit 1 BILLION downloads.',
+                'ext': 'mp4',
+                'id': 'DHV7vTivqWD',
+                'like_count': int,
+                'thumbnail': str,
+                'timestamp': 1742305717,
+                'title': 'Me finding out Llama hit 1 BILLION downloads.',
+                'upload_date': '20250318',
+                'uploader': 'zuck',
+                'uploader_id': '63055343223',
+                'uploader_url': 'https://www.threads.com/@zuck',
+            },
+        },
+        {
+            'note': 'Post with single video, without username and with t',
+            'url': 'https://www.threads.com/t/DHV7vTivqWD',
+            'info_dict': {
+                'channel': 'zuck',
+                'channel_is_verified': True,
+                'channel_url': 'https://www.threads.com/@zuck',
+                'description': 'Me finding out Llama hit 1 BILLION downloads.',
+                'ext': 'mp4',
+                'id': 'DHV7vTivqWD',
+                'like_count': int,
+                'thumbnail': str,
+                'timestamp': 1742305717,
+                'title': 'Me finding out Llama hit 1 BILLION downloads.',
+                'upload_date': '20250318',
+                'uploader': 'zuck',
+                'uploader_id': '63055343223',
+                'uploader_url': 'https://www.threads.com/@zuck',
+            },
+        },
+        {
+            'note': 'Post with carousel of 2 images and 1 video',
+            'url': 'https://www.threads.com/@zuck/post/DJDhoQfxb43',
+            'info_dict': {
+                'channel': 'zuck',
+                'channel_is_verified': True,
+                'channel_url': 'https://www.threads.com/@zuck',
+                'description': 'md5:9146c2c42fd53aba9090f61ccfd64fc8',
+                'id': 'DJDhoQfxb43',
+                'like_count': int,
+                'timestamp': 1745982529,
+                'title': 'md5:9146c2c42fd53aba9090f61ccfd64fc8',
+                'upload_date': '20250430',
+                'uploader': 'zuck',
+                'uploader_id': '63055343223',
+                'uploader_url': 'https://www.threads.com/@zuck',
+            },
+            'playlist_count': 3,
+        },
+        {
+            'note': 'Post with 1 image',
+            'url': 'https://www.threads.com/@zuck/post/DI3mC0GxkYA',
+            'info_dict': {
+                'channel': 'zuck',
+                'channel_is_verified': True,
+                'channel_url': 'https://www.threads.com/@zuck',
+                'description': 'md5:e292006574f5deb5552c1ad677cee8dd',
+                'ext': 'webp',
+                'id': 'DI3mC0GxkYA',
+                'like_count': int,
+                'timestamp': 1745582191,
+                'title': 'md5:e292006574f5deb5552c1ad677cee8dd',
+                'upload_date': '20250425',
+                'uploader': 'zuck',
+                'uploader_id': '63055343223',
+                'uploader_url': 'https://www.threads.com/@zuck',
+            },
+        },
+        {
+            'note': 'Private Post',
+            'url': 'https://www.threads.com/@enucatl/post/DLIrVcmPuFA7g5tn9OzPjsA-R8qU2HPJv_FzCo0',
+            'info_dict': {
+                'channel': 'enucatl',
+                'channel_is_verified': False,
+                'channel_url': 'https://www.threads.com/@enucatl',
+                'description': '',
+                'id': 'DLIrVcmPuFA7g5tn9OzPjsA-R8qU2HPJv_FzCo0',
+                'like_count': int,
+                'timestamp': 1745582191,
+                'title': '',
+                'upload_date': '20250620',
+                'uploader': 'enucatl',
+                'uploader_id': '63055343223',
+                'uploader_url': 'https://www.threads.com/@enucatl',
+            },
+            'skip': 'private account, requires authentication',
+        },
+    ]
+
+    def _perform_login(self, username, password):
+        # We are not implementing direct login. Cookies are preferred.
+        self.raise_login_required(
+            'Login with username/password is not supported. '
+            'Use --cookies or --cookies-from-browser to provide authentication.',
+            method='cookies',
+        )
+
+    def _real_extract(self, url):
+        post_id = self._match_id(url)
+        webpage = self._download_webpage(url, post_id, note='Downloading post page')
+
+        json_data = None
+        # Match single scripts
+        for script in re.findall(r'<script[^>]*>(.*?)</script>', webpage, re.DOTALL):
+            # Heuristic check: if the script doesn't contain "RelayPrefetchedStreamCache" and the post_id,
+            # it's definitely not the one we want. Skip it quickly.
+            if 'RelayPrefetchedStreamCache' not in script or post_id not in script:
+                continue
+
+            # This script is a candidate. Try to parse it.
+            # We use fatal=False because we expect some candidates to fail parsing.
+            candidate_json = self._search_json(r'"result":', script, 'result data', post_id, fatal=False)
+
+            if not candidate_json:
+                continue
+
+            post_data = traverse_obj(
+                candidate_json,
+                (
+                    'data',
+                    'data',
+                    'edges',
+                ),
+            )
+
+            if post_data is not None:
+                json_data = post_data
+                break
+
+        if not json_data:
+            self.raise_no_formats(
+                'Could not extract post data. The post may be private or deleted. You may need to log in.',
+                expected=True,
+            )
+
+        main_post = None
+        for node in json_data:
+            for item in traverse_obj(node, ('node', 'thread_items'), default=[]):
+                post_candidate = item.get('post')
+                if traverse_obj(post_candidate, 'code') == post_id:
+                    main_post = post_candidate
+                    break
+            if main_post:
+                break
+
+        if not main_post:
+            self.raise_no_formats('Could not find post data matching the post ID.', expected=True)
+
+        # This metadata applies to the whole post (the playlist).
+        uploader = traverse_obj(main_post, ('user', 'username'))
+        title = (
+            strip_or_none(remove_end(self._html_extract_title(webpage), '• Threads'))
+            or traverse_obj(main_post, ('caption', 'text'))
+            or f'Post by {uploader}'
+        )
+
+        playlist_metadata = {
+            'id': post_id,
+            'title': title,
+            'description': self._og_search_description(webpage) or traverse_obj(main_post, ('caption', 'text')),
+            'uploader': uploader,
+            'uploader_id': traverse_obj(main_post, ('user', 'pk')),
+            'uploader_url': f'https://www.threads.com/@{uploader}',
+            'channel': uploader,
+            'channel_url': f'https://www.threads.com/@{uploader}',
+            'channel_is_verified': traverse_obj(main_post, ('user', 'is_verified')),
+            'timestamp': int_or_none(main_post.get('taken_at')),
+            'like_count': int_or_none(main_post.get('like_count')),
+        }
+
+        media_list = main_post.get('carousel_media') or [main_post]
+        playlist_entries = []
+
+        for i, media in enumerate(media_list):
+            entry_id = f'{post_id}_{i + 1}' if len(media_list) > 1 else post_id
+
+            # --- VIDEO ---
+            if media.get('video_versions'):
+                formats = []
+                for video in media.get('video_versions'):
+                    formats.append({
+                        # 'format_id' is optional, yt-dlp can generate it
+                        'url': video.get('url'),
+                        'width': int_or_none(video.get('width')),
+                        'height': int_or_none(video.get('height')),
+                    })
+
+                # Create the entry for this video
+                playlist_entries.append({
+                    'id': entry_id,
+                    'title': title,  # The title is shared by all entries
+                    'formats': formats,
+                    'thumbnail': traverse_obj(media, ('image_versions2', 'candidates', 0, 'url')),
+                })
+                continue  # Move on to the next media item
+
+            # --- IMAGE ---
+            image_candidates = traverse_obj(media, ('image_versions2', 'candidates'))
+            if image_candidates:
+                best_image = image_candidates[0]
+                playlist_entries.append({
+                    'id': entry_id,
+                    'title': title,
+                    'url': best_image.get('url'),
+                    'ext': determine_ext(best_image.get('url'), 'jpg'),
+                    'width': int_or_none(best_image.get('width')),
+                    'height': int_or_none(best_image.get('height')),
+                    'vcodec': 'none',  # This tells yt-dlp it's an image
+                })
+
+        if not playlist_entries:
+            self.raise_no_formats('This post contains no downloadable video or images.', expected=True)
+
+        if len(playlist_entries) == 1:
+            return {**playlist_entries[0], **playlist_metadata}
+
+        return self.playlist_result(playlist_entries, **playlist_metadata)
+
+
+class ThreadsIOSIE(InfoExtractor):
+    IE_DESC = 'iOS barcelona:// URL'
+    _VALID_URL = r'barcelona://media\?shortcode=(?P<id>[^/?#&]+)'
+    _TESTS = [
+        {
+            'url': 'barcelona://media?shortcode=C6fDehepo5D',
+            'info_dict': {
+                'channel': 'saopaulofc',
+                'channel_is_verified': bool,
+                'channel_url': 'https://www.threads.com/@saopaulofc',
+                'description': 'md5:0c36a7e67e1517459bc0334dba932164',
+                'ext': 'mp4',
+                'id': 'C6fDehepo5D',
+                'like_count': int,
+                'thumbnail': r're:^https?://.*\.jpg',
+                'timestamp': 1714694014,
+                'title': 'md5:be7fe42330e2e78e969ca30254535d0b',
+                'upload_date': '20240502',
+                'uploader': 'saopaulofc',
+                'uploader_id': '63360239523',
+                'uploader_url': 'https://www.threads.com/@saopaulofc',
+            },
+            'add_ie': ['Threads'],
+        },
+    ]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        # Threads ignores the username in the URL and redirects to the right post,
+        # so we use the /t/ form and don't need to look the username up
+        return self.url_result(f'https://www.threads.net/t/{video_id}', ThreadsIE, video_id)

From 5386879dd59fa9b83131f42332da2df273ac0a7c Mon Sep 17 00:00:00 2001
From: Matteo Abis <1423701+Enucatl@users.noreply.github.com>
Date: Sat, 21 Jun 2025 08:32:15 +0200
Subject: [PATCH 2/4] allow matching scripts independent of case

---
 yt_dlp/extractor/threads.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/yt_dlp/extractor/threads.py b/yt_dlp/extractor/threads.py
index bc007583f..0aa5ca6dc 100644
--- a/yt_dlp/extractor/threads.py
+++ b/yt_dlp/extractor/threads.py
@@ -127,7 +127,7 @@ def _real_extract(self, url):
         json_data = None
         # Match single scripts
-        for script in re.findall(r'<script[^>]*>(.*?)</script>', webpage, re.DOTALL):
+        for script in re.findall(r'<script[^>]*>(.*?)</script>', webpage, re.DOTALL | re.IGNORECASE):
             # Heuristic check: if the script doesn't contain "RelayPrefetchedStreamCache" and the post_id,
             # it's definitely not the one we want. Skip it quickly.
             if 'RelayPrefetchedStreamCache' not in script or post_id not in script:
                 continue

From ee8133b07779ee15f116daed4a81ee49eb7238ab Mon Sep 17 00:00:00 2001
From: Matteo Abis <1423701+Enucatl@users.noreply.github.com>
Date: Sat, 21 Jun 2025 09:44:21 +0200
Subject: [PATCH 3/4] codeql fixes

---
 yt_dlp/extractor/threads.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/yt_dlp/extractor/threads.py b/yt_dlp/extractor/threads.py
index 0aa5ca6dc..3f01c0f34 100644
--- a/yt_dlp/extractor/threads.py
+++ b/yt_dlp/extractor/threads.py
@@ -100,6 +100,7 @@ class ThreadsIE(InfoExtractor):
                 'channel_is_verified': False,
                 'channel_url': 'https://www.threads.com/@enucatl',
                 'description': '',
+                'ext': 'mp4',
                 'id': 'DLIrVcmPuFA7g5tn9OzPjsA-R8qU2HPJv_FzCo0',
                 'like_count': int,
                 'timestamp': 1745582191,
@@ -126,13 +127,15 @@ def _real_extract(self, url):
         webpage = self._download_webpage(url, post_id, note='Downloading post page')

         json_data = None
-        # Match single scripts
-        for script in re.findall(r'<script[^>]*>(.*?)</script>', webpage, re.DOTALL | re.IGNORECASE):
-            # Heuristic check: if the script doesn't contain "RelayPrefetchedStreamCache" and the post_id,
-            # it's definitely not the one we want. Skip it quickly.
-            if 'RelayPrefetchedStreamCache' not in script or post_id not in script:
-                continue
+        json_scripts = re.findall(
+            r'