diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index fbbd9571f..8c3de8426 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2521,8 +2521,12 @@ from .xnxx import XNXXIE from .xstream import XstreamIE from .xvideos import ( + XVideosChannelIE, XVideosIE, + XVideosPlaylistIE, XVideosQuickiesIE, + XVideosRelatedIE, + XVideosSearchIE, ) from .xxxymovies import XXXYMoviesIE from .yahoo import ( diff --git a/yt_dlp/extractor/xvideos.py b/yt_dlp/extractor/xvideos.py index e7d43ba9d..0543f7950 100644 --- a/yt_dlp/extractor/xvideos.py +++ b/yt_dlp/extractor/xvideos.py @@ -1,3 +1,4 @@ +import itertools import re import urllib.parse @@ -6,8 +7,12 @@ ExtractorError, clean_html, determine_ext, + extract_attributes, int_or_none, parse_duration, + try_get, + url_basename, + urljoin, ) @@ -15,10 +20,10 @@ class XVideosIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - (?:[^/]+\.)?xvideos2?\.com/video\.?| - (?:www\.)?xvideos\.es/video\.?| - (?:www|flashservice)\.xvideos\.com/embedframe/| - static-hw\.xvideos\.com/swf/xv-player\.swf\?.*?\bid_video= + (?:[^/]+\.)?xvideos(?:\d+)?\.com/video\.?| + (?:www\.)?xvideos(?:\d+)?\.es/video\.?| + (?:www|flashservice)\.xvideos(?:\d+)?\.com/embedframe/| + static-hw\.xvideos(?:\d+)?\.com/swf/xv-player\.swf\?.*?\bid_video= ) (?P[0-9a-z]+) ''' @@ -173,7 +178,7 @@ def _real_extract(self, url): class XVideosQuickiesIE(InfoExtractor): IE_NAME = 'xvideos:quickies' - _VALID_URL = r'https?://(?P(?:[^/?#]+\.)?xvideos2?\.com)/(?:profiles/|amateur-channels/)?[^/?#]+#quickies/a/(?P\w+)' + _VALID_URL = r'https?://(?P(?:[^/?#]+\.)?xvideos(?:\d+)?\.com)/(?:profiles/|amateur-channels/)?[^/?#]+#quickies/a/(?P\w+)' _TESTS = [{ 'url': 'https://www.xvideos.com/lili_love#quickies/a/ipdtikh1a4c', 'md5': 'f9e4f518ff1de14b99a400bbd0fc5ee0', @@ -223,3 +228,249 @@ class XVideosQuickiesIE(InfoExtractor): def _real_extract(self, url): domain, id_ = self._match_valid_url(url).group('domain', 'id') 
return self.url_result(f'https://{domain}/video{"" if id_.isdecimal() else "."}{id_}/_', XVideosIE, id_) + + +class XVideosPlaylistIE(InfoExtractor): + _VALID_URL = r'''(?x) + ^(?!.*\#quickies) # Reject if "#quickies" appears anywhere + https?:// + (?:[^/]+\.)?xvideos(?:\d+)?\.com/ + (?:c(?:/[sm]:[^/]+)*| + profiles| + favorite)/ + (?P[^#?/]+) + ''' + _TESTS = [] + + def _extract_videos_from_json_list(self, json_list, path='video'): + return ( + ( + 'https://www.xvideos.com/%s.%s/%s' % (path, x.get('eid'), str(x.get('u')).split('/')[-1]) + if x.get('eid') is not None + else 'https://www.xvideos.com/%s%d/%s' % (path, x.get('id'), str(x.get('u')).split('/')[-1]) + ) + for x in json_list if isinstance(x, dict) + ) + + def _get_playlist_url(self, url, playlist_id): + """URL of first playlist page""" + id_match = re.match(self._VALID_URL, url).groupdict() + video_sort = id_match.get('sort') + if video_sort: + url, _ = urllib.parse.urldefrag(url) + if url.endswith('/'): + url = url[:-1] + url = '%s/%s' % (url, video_sort.replace('-', '/')) + return url + + def _get_next_page(self, url, num, page): + """URL of num th continuation page of url""" + if page.startswith('{'): + url, sub = re.subn(r'(/)(\d{1,7})($|[#?/])', r'\g<1>%d\3' % (num, ), url) + if sub == 0: + url += '/%d' % (num, ) + return url + next_page = self._search_regex( + r'''(?s)(]*?\bclass\s*=\s*(?P'|").*?\bnext-page\b.*?(?P=q)[^>]*?>)''', + page, 'next page', default=None) + if next_page: + next_page = extract_attributes(next_page) + next_page = next_page.get('href') + if next_page: + return urljoin(url, next_page) + return False + + def _extract_videos(self, url, playlist_id, num, page): + """Get iterable videos plus stop flag""" + return (( + 'https://www.xvideos.com/video' + x.group('video_id') + for x in re.finditer(r'''class\s*=\s*"title"\s*>\s*<\s*a\s*href\s*=\s*(\'|")\/video(?P(.*?))\1''', page)), + None) + + def _real_extract(self, url): + id_match = re.match(self._VALID_URL, url).groupdict() + 
playlist_id = id_match['id'] + if 'video' in playlist_id and url.endswith(playlist_id): + url += '/0' + + next_page = self._get_playlist_url(url, playlist_id) + + matches = [] + for count in itertools.count(0): + webpage = self._download_webpage( + next_page, + '%s (+%d)' % (playlist_id, count) if count > 0 else playlist_id) + + vids, stop = self._extract_videos(next_page, playlist_id, count, webpage) + + if vids: + matches.append(vids) + + if stop: + break + next_page = self._get_next_page(next_page, count + 1, webpage) + if not next_page: + break + + return self.playlist_from_matches( + itertools.chain.from_iterable(matches), playlist_id) + + +class XVideosRelatedIE(XVideosPlaylistIE): + _VALID_URL = XVideosIE._VALID_URL + r'(?:/[^/]+)*?\#_related-(?Pvideos|playlists)' + + _TESTS = [] + + def _extract_videos(self, url, playlist_id, num, page): + id_match = re.match(self._VALID_URL, url).groupdict() + related = id_match.get('related') + if not related: + return super()._extract_videos(url, playlist_id, num, page) + + if related == 'videos': + related_json = self._search_regex( + r'(?s)videos_related\s*=\s*(\[.*?])\s*;', + page, 'related', default='[]') + related_json = self._parse_json(related_json, playlist_id, fatal=False) or [] + return (self._extract_videos_from_json_list(related_json), True) + # playlists + related_json = self._download_json( + 'https://www.xvideos.com/video-playlists/' + playlist_id, playlist_id, fatal=False) + + return ( + self._extract_videos_from_json_list( + try_get(related_json, lambda x: x['playlists'], list) or [], + path='favorite/'), + True) + + +class XVideosChannelIE(XVideosPlaylistIE): + _CHANNEL_REGEX = r'''(?:amateur-|model-)?(?:channel|profile|pornstar|model|amateur)s/''' + _VALID_URL = r'''(?x) + https?:// + (?:[^/]+\.)?xvideos(?:\d+)?\.com/ + (?: + %s + )? + (?P[^#?/]+) + (?:\#_tab(?PVideos|Favorites|Playlists|AboutMe)(?:,(?P[^,]+))?)? 
+ $ + ''' % _CHANNEL_REGEX + _TESTS = [{ + 'url': 'https://www.xvideos.com/pornstars/silvia-jons#_tabVideos', + 'playlist_mincount': 5, + }, { + 'url': 'https://www.xvideos.com/channels/miss_floyd#_tabVideos', + 'playlist_mincount': 5, + }, { + 'url': 'https://www.xvideos.com/models/migurt-1', + 'playlist_mincount': 5, + }, { + 'url': 'https://www.xvideos3.com/amateurs/shaiden_rogue5#_tabVideos', + 'playlist_mincount': 5, + }, { + 'url': 'https://www.xvideos3.com/natalia--starr#_tabVideos', + 'playlist_mincount': 5, + }, { + 'url': 'https://www.xvideos3.com/porn_force#_tabVideos', + 'playlist_mincount': 5, + }] + + def _get_playlist_url(self, url, playlist_id): + id_match = re.match(self._VALID_URL, url).groupdict() + tab = (id_match.get('tab') or '').lower() + + if not tab: + url += '#_tabVideos' + + if tab: + if tab in ('videos', 'favorites'): + url, frag = urllib.parse.urldefrag(url) + if not url.endswith('/'): + url += '/' + parsed = urllib.parse.urlparse(url) + if not re.search(r'^/' + self._CHANNEL_REGEX, parsed.path): + path_parts = parsed.path.lstrip('/').split('/', 1) + new_path = '/channels/' + path_parts[0] + if len(path_parts) > 1: + new_path += '/' + path_parts[1] + url = parsed._replace(path=new_path).geturl() + frag = frag.split(',') + url += tab + if tab == 'videos': + url += '/' + (frag[1] if len(frag) > 1 else 'best') + url += '/0' + return url + + webpage = self._download_webpage(url, playlist_id) + + # activity + conf = self._search_regex( + r'(?s)\.\s*xv\s*\.\s*conf\s*=\s*(\{.*?})[\s;]* 0) + + if tab == 'favorites': + return (( + 'https://www.xvideos.com' + x.group('playlist') + for x in re.finditer(r''']*?href\s*=\s*('|")(?P/favorite/\d+/[^#?]+?)\1''', page)), + None) + + return super()._extract_videos(url, playlist_id, num, page) + + +class XVideosSearchIE(XVideosPlaylistIE): + _VALID_URL = r'''(?x) + https?:// + (?:[^/]+\.)?xvideos(?:\d+)?\.com/ + \?k=(?P[^#?/&]+) + ''' + _TESTS = [{ + # uninteresting search with probably at least two pages 
of results,
+        # but not too many more
+        'url': 'http://www.xvideos.com/?k=libya&sort=length',
+        'playlist_mincount': 30,
+    }]
+
+    def _get_next_page(self, url, num, page):  # overrides XVideosPlaylistIE; contract: return next-page URL str, or False to stop
+        parsed_url = urllib.parse.urlparse(url)
+        qs = urllib.parse.parse_qs(parsed_url.query)
+        qs['p'] = [num]  # search pagination is driven by the ?p=<page> query parameter
+        parsed_url = [*parsed_url[:4], urllib.parse.urlencode(qs, True), None]  # rebuild with new query, drop fragment
+        return urllib.parse.urlunparse(parsed_url)  # fix: was `..., False` — a tuple is always truthy in `if not next_page` and is not a valid URL for _download_webpage