From d141da87857bb7774bec18cf4939a912dd17bb53 Mon Sep 17 00:00:00 2001 From: Tahasanul Abraham Date: Fri, 21 Mar 2025 02:02:53 +0100 Subject: [PATCH 1/7] [xvideos] Support profiles, searches, channels and favourites --- yt_dlp/extractor/xvideos.py | 259 +++++++++++++++++++++++++++++++++++- 1 file changed, 254 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/xvideos.py b/yt_dlp/extractor/xvideos.py index e7d43ba9d..fef9ec476 100644 --- a/yt_dlp/extractor/xvideos.py +++ b/yt_dlp/extractor/xvideos.py @@ -1,3 +1,4 @@ +import itertools import re import urllib.parse @@ -6,8 +7,12 @@ ExtractorError, clean_html, determine_ext, + extract_attributes, int_or_none, parse_duration, + try_get, + url_basename, + urljoin, ) @@ -15,10 +20,10 @@ class XVideosIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - (?:[^/]+\.)?xvideos2?\.com/video\.?| - (?:www\.)?xvideos\.es/video\.?| - (?:www|flashservice)\.xvideos\.com/embedframe/| - static-hw\.xvideos\.com/swf/xv-player\.swf\?.*?\bid_video= + (?:[^/]+\.)?xvideos(?:\d+)?\.com/video\.?| + (?:www\.)?xvideos(?:\d+)?\.es/video\.?| + (?:www|flashservice)\.xvideos(?:\d+)?\.com/embedframe/| + static-hw\.xvideos(?:\d+)?\.com/swf/xv-player\.swf\?.*?\bid_video= ) (?P[0-9a-z]+) ''' @@ -173,7 +178,7 @@ def _real_extract(self, url): class XVideosQuickiesIE(InfoExtractor): IE_NAME = 'xvideos:quickies' - _VALID_URL = r'https?://(?P(?:[^/?#]+\.)?xvideos2?\.com)/(?:profiles/|amateur-channels/)?[^/?#]+#quickies/a/(?P\w+)' + _VALID_URL = r'https?://(?P(?:[^/?#]+\.)?xvideos(?:\d+)?\.com)/(?:profiles/|amateur-channels/)?[^/?#]+#quickies/a/(?P\w+)' _TESTS = [{ 'url': 'https://www.xvideos.com/lili_love#quickies/a/ipdtikh1a4c', 'md5': 'f9e4f518ff1de14b99a400bbd0fc5ee0', @@ -223,3 +228,247 @@ class XVideosQuickiesIE(InfoExtractor): def _real_extract(self, url): domain, id_ = self._match_valid_url(url).group('domain', 'id') return self.url_result(f'https://{domain}/video{"" if id_.isdecimal() else "."}{id_}/_', XVideosIE, id_) + + +class XVideosPlaylistIE(InfoExtractor): + _VALID_URL = r'''(?x) + ^(?!.*\#quickies) # Reject if "#quickies" appears anywhere + https?:// + (?:[^/]+\.)?xvideos(?:\d+)?\.com/ + (?:c(?:/[sm]:[^/]+)*| + profiles| + favorite)/ + (?P[^#?/]+) + ''' + _TESTS = [] + + def _extract_videos_from_json_list(self, json_list, path='video'): + return ( + 'https://www.xvideos.com/%s%d/%s' % (path, x.get('id'), str(x.get('u')).split('/')[-1]) + for x in json_list if isinstance(x, dict)) + + def _get_playlist_url(self, url, playlist_id): + """URL of first playlist page""" + id_match = re.match(self._VALID_URL, url).groupdict() + video_sort = id_match.get('sort') + if video_sort: + url, _ = urllib.parse.urldefrag(url) + if url.endswith('/'): + url = url[:-1] + url = '%s/%s' % (url, video_sort.replace('-', '/')) + return url + + def _get_next_page(self, url, num, page): + """URL of num th continuation page of url""" + if page.startswith('{'): + url, sub = re.subn(r'(/)(\d{1,7})($|[#?/])', r'\g<1>%d\3' % (num, ), url) + if sub == 0: + url += '/%d' % (num, ) + return url + next_page = self._search_regex( + r'''(?s)(]*?\bclass\s*=\s*(?P'|").*?\bnext-page\b.*?(?P=q)[^>]*?>)''', + page, 'next page', default=None) + if next_page: + next_page = extract_attributes(next_page) + next_page = next_page.get('href') + if next_page: + return urljoin(url, next_page) + return False + + def _extract_videos(self, url, playlist_id, num, page): + """Get iterable videos plus stop flag""" + return (( + 'https://www.xvideos.com/video' + x.group('video_id') + for x in re.finditer(r'''class\s*=\s*"title"\s*>\s*<\s*a\s*href\s*=\s*(\'|")\/video(?P(.*?))\1''', page)), + None) + + def _real_extract(self, url): + id_match = re.match(self._VALID_URL, url).groupdict() + playlist_id = id_match['id'] + if 'video' in playlist_id and url.endswith(playlist_id): + url += '/0' + + next_page = self._get_playlist_url(url, playlist_id) + + matches = [] + for count in itertools.count(0): + webpage = self._download_webpage( + next_page, + '%s (+%d)' % (playlist_id, count) if count > 0 else playlist_id) + + vids, stop = self._extract_videos(next_page, playlist_id, count, webpage) + + if vids: + matches.append(vids) + + if stop: + break + next_page = self._get_next_page(next_page, count + 1, webpage) + if not next_page: + break + + return self.playlist_from_matches( + itertools.chain.from_iterable(matches), playlist_id) + + +class XVideosRelatedIE(XVideosPlaylistIE): + _VALID_URL = XVideosIE._VALID_URL + r'(?:/[^/]+)*?\#_related-(?Pvideos|playlists)' + + _TESTS = [] + + def _extract_videos(self, url, playlist_id, num, page): + id_match = re.match(self._VALID_URL, url).groupdict() + related = id_match.get('related') + if not related: + return super()._extract_videos(url, playlist_id, num, page) + + if related == 'videos': + related_json = self._search_regex( + r'(?s)videos_related\s*=\s*(\[.*?])\s*;', + page, 'related', default='[]') + related_json = self._parse_json(related_json, playlist_id, fatal=False) or [] + return (self._extract_videos_from_json_list(related_json), True) + # playlists + related_json = self._download_json( + 'https://www.xvideos.com/video-playlists/' + playlist_id, playlist_id, fatal=False) + + return ( + self._extract_videos_from_json_list( + try_get(related_json, lambda x: x['playlists'], list) or [], + path='favorite/'), + True) + + +class XVideosChannelIE(XVideosPlaylistIE): + _CHANNEL_REGEX = r'''(?:amateur-|pornstar-|model-)?(?:channel|profile|pornstar|model|amateur)s/''' + _VALID_URL = r'''(?x) + https?:// + (?:[^/]+\.)?xvideos(?:\d+)?\.com/ + (?: + %s + )? + (?P[^#?/]+) + (?:\#_tab(?PVideos|Favorites|Playlists|AboutMe)(?:,(?P[^,]+))?)? + $ + ''' % _CHANNEL_REGEX + _TESTS = [{ + 'url': 'https://www.xvideos.com/pornstar-channels/sienna-west', + 'playlist_mincount': 5, + }, { + 'url': 'https://www.xvideos.com/pornstars/silvia-jons#_tabVideos', + 'playlist_mincount': 5, + }, { + 'url': 'https://www.xvideos.com/channels/miss_floyd#_tabVideos', + 'playlist_mincount': 5, + }, { + 'url': 'https://www.xvideos.com/models/migurt-1', + 'playlist_mincount': 5, + }, { + 'url': 'https://www.xvideos3.com/amateurs/shaiden_rogue5#_tabVideos', + 'playlist_mincount': 5, + }, { + 'url': 'https://www.xvideos3.com/natalia--starr#_tabVideos', + 'playlist_mincount': 5, + }, { + 'url': 'https://www.xvideos3.com/porn_force#_tabVideos', + 'playlist_mincount': 5, + }] + + def _get_playlist_url(self, url, playlist_id): + id_match = re.match(self._VALID_URL, url).groupdict() + tab = (id_match.get('tab') or '').lower() + + if not tab: + url += '#_tabVideos' + + if tab: + if tab in ('videos', 'favorites'): + url, frag = urllib.parse.urldefrag(url) + if not url.endswith('/'): + url += '/' + if not re.search(self._CHANNEL_REGEX + r'$', url): + parsed = urllib.parse.urlparse(url) + path_parts = parsed.path.lstrip('/').split('/', 1) + new_path = '/channels/' + path_parts[0] + if len(path_parts) > 1: + new_path += '/' + path_parts[1] + url = parsed._replace(path=new_path).geturl() + frag = frag.split(',') + url += tab + if tab == 'videos': + url += '/' + (frag[1] if len(frag) > 1 else 'best') + url += '/0' + return url + + webpage = self._download_webpage(url, playlist_id) + + # activity + conf = self._search_regex( + r'(?s)\.\s*xv\s*\.\s*conf\s*=\s*(\{.*?})[\s;]* 0) + + if tab == 'favorites': + return (( + 'https://www.xvideos.com' + x.group('playlist') + for x in re.finditer(r''']*?href\s*=\s*('|")(?P/favorite/\d+/[^#?]+?)\1''', page)), + None) + + return super()._extract_videos(url, playlist_id, num, page) + + +class XVideosSearchIE(XVideosPlaylistIE): + _VALID_URL = r'''(?x) + https?:// + (?:[^/]+\.)?xvideos(?:\d+)?\.com/ + \?k=(?P[^#?/&]+) + ''' + _TESTS = [{ + # uninteresting search with probably at least two pages of results, + # but not too many more + 'url': 'http://www.xvideos.com/?k=libya&sort=length', + 'playlist_mincount': 30, + }] + + def _get_next_page(self, url, num, page): + parsed_url = urllib.parse.urlparse(url) + qs = urllib.parse.parse_qs(parsed_url.query) + qs['p'] = [num] + parsed_url = [*parsed_url[:4], urllib.parse.urlencode(qs, True), None] + return urllib.parse.urlunparse(parsed_url), False From bc0d6c6a9e80d28baee0b90d5df04b35d4907526 Mon Sep 17 00:00:00 2001 From: Tahasanul Abraham Date: Fri, 21 Mar 2025 02:05:34 +0100 Subject: [PATCH 2/7] [xvideos] Support profiles, searches, channels and favourites --- yt_dlp/extractor/xvideos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/xvideos.py b/yt_dlp/extractor/xvideos.py index fef9ec476..1877b793a 100644 --- a/yt_dlp/extractor/xvideos.py +++ b/yt_dlp/extractor/xvideos.py @@ -374,7 +374,7 @@ class XVideosChannelIE(XVideosPlaylistIE): 'playlist_mincount': 5, }] - def _get_playlist_url(self, url, playlist_id): + def _get_playlist_url(self, url, playlist_id): id_match = re.match(self._VALID_URL, url).groupdict() tab = (id_match.get('tab') or '').lower() From 05ee64abcf3e8f11f1d423e47336643dd039c763 Mon Sep 17 00:00:00 2001 From: Tahasanul Abraham Date: Fri, 21 Mar 2025 02:43:16 +0100 Subject: [PATCH 3/7] [xvideos] Support profiles, searches, channels and favourites --- yt_dlp/extractor/xvideos.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/xvideos.py b/yt_dlp/extractor/xvideos.py index 1877b793a..3f02b10c8 100644 --- a/yt_dlp/extractor/xvideos.py +++ b/yt_dlp/extractor/xvideos.py @@ -386,8 +386,8 @@ def _get_playlist_url(self, url, playlist_id): url, frag = urllib.parse.urldefrag(url) if not url.endswith('/'): url += '/' - if not re.search(self._CHANNEL_REGEX + r'$', url): - parsed = urllib.parse.urlparse(url) + parsed = urllib.parse.urlparse(url) + if not re.search(r'^/' + self._CHANNEL_REGEX, parsed.path): path_parts = parsed.path.lstrip('/').split('/', 1) new_path = '/channels/' + path_parts[0] if len(path_parts) > 1: From a2779512cb3aa1137eaf2da83f68211a18f72fbd Mon Sep 17 00:00:00 2001 From: Tahasanul Abraham Date: Fri, 21 Mar 2025 02:55:52 +0100 Subject: [PATCH 4/7] [xvideos] Support profiles, searches, channels and favourites --- yt_dlp/extractor/xvideos.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/xvideos.py b/yt_dlp/extractor/xvideos.py index 3f02b10c8..e1d9f81f5 100644 --- a/yt_dlp/extractor/xvideos.py +++ b/yt_dlp/extractor/xvideos.py @@ -244,8 +244,13 @@ class XVideosPlaylistIE(InfoExtractor): def _extract_videos_from_json_list(self, json_list, path='video'): return ( - 'https://www.xvideos.com/%s%d/%s' % (path, x.get('id'), str(x.get('u')).split('/')[-1]) - for x in json_list if isinstance(x, dict)) + ( + 'https://www.xvideos.com/%s.%s/%s' % (path, x.get('eid'), str(x.get('u')).split('/')[-1]) + if x.get('eid') is not None + else 'https://www.xvideos.com/%s%d/%s' % (path, x.get('id'), str(x.get('u')).split('/')[-1]) + ) + for x in json_list if isinstance(x, dict) + ) def _get_playlist_url(self, url, playlist_id): """URL of first playlist page""" From bd61f17114d2b8ce00fc0e172d4490ca5e88d7df Mon Sep 17 00:00:00 2001 From: Tahasanul Abraham Date: Mon, 24 Mar 2025 22:24:44 +0100 Subject: [PATCH 5/7] [xvideos] Support profiles, searches, channels and favourites --- yt_dlp/extractor/_extractors.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 74a043b9c..833557b10 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2503,9 +2503,13 @@ from .xminus import XMinusIE from .xnxx import XNXXIE from .xstream import XstreamIE -from .xvideos import ( +from .xvideos import ( + XVideosChannelIE, XVideosIE, + XVideosPlaylistIE, XVideosQuickiesIE, + XVideosRelatedIE, + XVideosSearchIE, ) from .xxxymovies import XXXYMoviesIE from .yahoo import ( From 7f5c6f2fb67a3df66bae24da5c95c2dd73894489 Mon Sep 17 00:00:00 2001 From: Tahasanul Abraham Date: Mon, 24 Mar 2025 22:30:32 +0100 Subject: [PATCH 6/7] [xvideos] Support profiles, searches, channels and favourites --- yt_dlp/extractor/_extractors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 833557b10..24b9aa9a9 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2503,7 +2503,7 @@ from .xminus import XMinusIE from .xnxx import XNXXIE from .xstream import XstreamIE -from .xvideos import ( +from .xvideos import ( XVideosChannelIE, XVideosIE, XVideosPlaylistIE, From 6817a2f0bacdf3446cea068aaf317fb4157b0cdc Mon Sep 17 00:00:00 2001 From: Tahasanul Abraham Date: Fri, 25 Apr 2025 16:31:20 +0200 Subject: [PATCH 7/7] [xvideos] Support profiles, searches, channels and favourites Made changes for the review --- yt_dlp/extractor/xvideos.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/yt_dlp/extractor/xvideos.py b/yt_dlp/extractor/xvideos.py index e1d9f81f5..0543f7950 100644 --- a/yt_dlp/extractor/xvideos.py +++ b/yt_dlp/extractor/xvideos.py @@ -345,7 +345,7 @@ def _extract_videos(self, url, playlist_id, num, page): class XVideosChannelIE(XVideosPlaylistIE): - _CHANNEL_REGEX = r'''(?:amateur-|pornstar-|model-)?(?:channel|profile|pornstar|model|amateur)s/''' + _CHANNEL_REGEX = r'''(?:amateur-|model-)?(?:channel|profile|pornstar|model|amateur)s/''' _VALID_URL = r'''(?x) https?:// (?:[^/]+\.)?xvideos(?:\d+)?\.com/ @@ -357,9 +357,6 @@ class XVideosChannelIE(XVideosPlaylistIE): $ ''' % _CHANNEL_REGEX _TESTS = [{ - 'url': 'https://www.xvideos.com/pornstar-channels/sienna-west', - 'playlist_mincount': 5, - }, { 'url': 'https://www.xvideos.com/pornstars/silvia-jons#_tabVideos', 'playlist_mincount': 5, }, {