From f2ee2a46fc2a4efb6ed58ee9e67c506c6b72b843 Mon Sep 17 00:00:00 2001
From: norepro <30921834+norepro@users.noreply.github.com>
Date: Fri, 12 Dec 2025 12:52:09 -0800
Subject: [PATCH] [ie/pornhub] Optimize metadata extraction (#15231)

Closes #14621

Authored by: norepro
---
 yt_dlp/extractor/pornhub.py | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py
index e1e9777e8e..cdfa3f1b0f 100644
--- a/yt_dlp/extractor/pornhub.py
+++ b/yt_dlp/extractor/pornhub.py
@@ -24,6 +24,7 @@ from ..utils import (
     url_or_none,
     urlencode_postdata,
 )
+from ..utils.traversal import find_elements, traverse_obj
 
 
 class PornHubBaseIE(InfoExtractor):
@@ -137,23 +138,24 @@ class PornHubIE(PornHubBaseIE):
     _EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)']
     _TESTS = [{
         'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
-        'md5': 'a6391306d050e4547f62b3f485dd9ba9',
+        'md5': '4d4a4e9178b655776f86cf89ecaf0edf',
         'info_dict': {
             'id': '648719015',
             'ext': 'mp4',
             'title': 'Seductive Indian beauty strips down and fingers her pink pussy',
-            'uploader': 'Babes',
+            'uploader': 'BABES-COM',
+            'uploader_id': '/users/babes-com',
             'upload_date': '20130628',
             'timestamp': 1372447216,
             'duration': 361,
             'view_count': int,
             'like_count': int,
-            'dislike_count': int,
             'comment_count': int,
             'age_limit': 18,
             'tags': list,
             'categories': list,
             'cast': list,
+            'thumbnail': r're:https?://.+',
         },
     }, {
         # non-ASCII title
@@ -480,13 +482,6 @@ class PornHubIE(PornHubBaseIE):
         comment_count = self._extract_count(
             r'All Comments\s*\(([\d,.]+)\)', webpage, 'comment')
 
-        def extract_list(meta_key):
-            div = self._search_regex(
-                rf'(?s)<div[^>]+\bclass=["\'].*?\b{meta_key}Wrapper[^>]*>(.+?)</div>',
-                webpage, meta_key, default=None)
-            if div:
-                return [clean_html(x).strip() for x in re.findall(r'(?s)<a[^>]+\bhref=[^>]+>.+?</a>', div)]
-
         info = self._search_json_ld(webpage, video_id, default={})
         # description provided in JSON-LD is irrelevant
         info['description'] = None
@@ -505,9 +500,11 @@ class PornHubIE(PornHubBaseIE):
             'comment_count': comment_count,
             'formats': formats,
             'age_limit': 18,
-            'tags': extract_list('tags'),
-            'categories': extract_list('categories'),
-            'cast': extract_list('pornstars'),
+            **traverse_obj(webpage, {
+                'tags': ({find_elements(attr='data-label', value='tag')}, ..., {clean_html}),
+                'categories': ({find_elements(attr='data-label', value='category')}, ..., {clean_html}),
+                'cast': ({find_elements(attr='data-label', value='pornstar')}, ..., {clean_html}),
+            }),
             'subtitles': subtitles,
         }, info)
 