From 439b931411ad4fbf49c678b0f7e5793ccbbd3d45 Mon Sep 17 00:00:00 2001 From: DrJumba <220088158+DrJumba@users.noreply.github.com> Date: Thu, 10 Jul 2025 18:50:04 +0530 Subject: [PATCH 1/2] [ie/pornhub] Add chapters support --- yt_dlp/extractor/pornhub.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py index e1e9777e8e..e4bd1d3885 100644 --- a/yt_dlp/extractor/pornhub.py +++ b/yt_dlp/extractor/pornhub.py @@ -267,6 +267,27 @@ class PornHubIE(PornHubBaseIE): def _extract_count(self, pattern, webpage, name): return str_to_int(self._search_regex(pattern, webpage, f'{name} count', default=None)) + def _extract_chapters_from_action_tags(self, action_tags, duration): + if not action_tags: + return None + + chapter_list = [] + for entry in action_tags.split(','): + if ':' not in entry: + continue + title, start_str = entry.split(':', 1) + start_time = int_or_none(start_str) + if start_time is not None: + chapter_list.append({'title': title.strip(), 'start_time': start_time}) + + for i, chapter in enumerate(chapter_list): + if i + 1 < len(chapter_list): + chapter['end_time'] = chapter_list[i + 1]['start_time'] + elif duration is not None: + chapter['end_time'] = duration + + return chapter_list or None + def _real_extract(self, url): mobj = self._match_valid_url(url) host = mobj.group('host') or 'pornhub.com' @@ -325,6 +346,7 @@ def dl_webpage(platform): }) thumbnail = flashvars.get('image_url') duration = int_or_none(flashvars.get('video_duration')) + chapters = self._extract_chapters_from_action_tags(flashvars.get('actionTags'), duration) media_definitions = flashvars.get('mediaDefinitions') if isinstance(media_definitions, list): for definition in media_definitions: @@ -339,7 +361,7 @@ def dl_webpage(platform): video_urls.append( (video_url, int_or_none(definition.get('quality')))) else: - thumbnail, duration = [None] * 2 + thumbnail, duration, chapters = [None] * 3 def extract_js_vars(webpage, pattern, default=NO_DEFAULT): assignments = self._search_regex( @@ -499,6 +521,7 @@ def extract_list(meta_key): 'title': title, 'thumbnail': thumbnail, 'duration': duration, + 'chapters': chapters, 'view_count': view_count, 'like_count': like_count, 'dislike_count': dislike_count, From 1565a11e9a1229f60b859cce375bb4a9995d9896 Mon Sep 17 00:00:00 2001 From: DrJumba <220088158+DrJumba@users.noreply.github.com> Date: Fri, 11 Jul 2025 04:43:33 +0530 Subject: [PATCH 2/2] [ie/pornhub] fix code --- yt_dlp/extractor/pornhub.py | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py index e4bd1d3885..51e637081e 100644 --- a/yt_dlp/extractor/pornhub.py +++ b/yt_dlp/extractor/pornhub.py @@ -20,6 +20,7 @@ remove_quotes, remove_start, str_to_int, + traverse_obj, update_url_query, url_or_none, urlencode_postdata, @@ -267,27 +268,6 @@ class PornHubIE(PornHubBaseIE): def _extract_count(self, pattern, webpage, name): return str_to_int(self._search_regex(pattern, webpage, f'{name} count', default=None)) - def _extract_chapters_from_action_tags(self, action_tags, duration): - if not action_tags: - return None - - chapter_list = [] - for entry in action_tags.split(','): - if ':' not in entry: - continue - title, start_str = entry.split(':', 1) - start_time = int_or_none(start_str) - if start_time is not None: - chapter_list.append({'title': title.strip(), 'start_time': start_time}) - - for i, chapter in enumerate(chapter_list): - if i + 1 < len(chapter_list): - chapter['end_time'] = chapter_list[i + 1]['start_time'] - elif duration is not None: - chapter['end_time'] = duration - - return chapter_list or None - def _real_extract(self, url): mobj = self._match_valid_url(url) host = mobj.group('host') or 'pornhub.com' @@ -346,7 +326,12 @@ def dl_webpage(platform): }) thumbnail = flashvars.get('image_url') duration = int_or_none(flashvars.get('video_duration')) - chapters = self._extract_chapters_from_action_tags(flashvars.get('actionTags'), duration) + chapters = traverse_obj(flashvars, ( + 'actionTags', {lambda x: x.split(',')}, ..., {lambda x: x.split(':', 1)}, + all, lambda _, v: int_or_none(v[1]) is not None, { + 'title': (0, {str.strip}), + 'start_time': (1, {int_or_none}), + })) or None media_definitions = flashvars.get('mediaDefinitions') if isinstance(media_definitions, list): for definition in media_definitions: