mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-31 14:45:14 +00:00 
			
		
		
		
	[xhamster] Extract all formats and fix duration extraction (#13593)
This commit is contained in:
		| @@ -3,6 +3,7 @@ from __future__ import unicode_literals | ||||
| import re | ||||
|  | ||||
| from .common import InfoExtractor | ||||
| from ..compat import compat_str | ||||
| from ..utils import ( | ||||
|     clean_html, | ||||
|     dict_get, | ||||
| @@ -28,6 +29,7 @@ class XHamsterIE(InfoExtractor): | ||||
|         'md5': '8281348b8d3c53d39fffb377d24eac4e', | ||||
|         'info_dict': { | ||||
|             'id': '1509445', | ||||
|             'display_id': 'femaleagent_shy_beauty_takes_the_bait', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'FemaleAgent Shy beauty takes the bait', | ||||
|             'upload_date': '20121014', | ||||
| @@ -40,6 +42,7 @@ class XHamsterIE(InfoExtractor): | ||||
|         'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd', | ||||
|         'info_dict': { | ||||
|             'id': '2221348', | ||||
|             'display_id': 'britney_spears_sexy_booty', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'Britney Spears  Sexy Booty', | ||||
|             'upload_date': '20130914', | ||||
| @@ -81,18 +84,7 @@ class XHamsterIE(InfoExtractor): | ||||
|     }] | ||||
|  | ||||
|     def _real_extract(self, url): | ||||
|         def extract_video_url(webpage, name): | ||||
|             return self._search_regex( | ||||
|                 [r'''file\s*:\s*(?P<q>["'])(?P<mp4>.+?)(?P=q)''', | ||||
|                  r'''<a\s+href=(?P<q>["'])(?P<mp4>.+?)(?P=q)\s+class=["']mp4Thumb''', | ||||
|                  r'''<video[^>]+file=(?P<q>["'])(?P<mp4>.+?)(?P=q)[^>]*>'''], | ||||
|                 webpage, name, group='mp4') | ||||
|  | ||||
|         def is_hd(webpage): | ||||
|             return '<div class=\'icon iconHD\'' in webpage | ||||
|  | ||||
|         mobj = re.match(self._VALID_URL, url) | ||||
|  | ||||
|         video_id = mobj.group('id') or mobj.group('id_2') | ||||
|         display_id = mobj.group('display_id') or mobj.group('display_id_2') | ||||
|  | ||||
| @@ -110,6 +102,39 @@ class XHamsterIE(InfoExtractor): | ||||
|              r'<title[^>]*>(.+?)(?:,\s*[^,]*?\s*Porn\s*[^,]*?:\s*xHamster[^<]*| - xHamster\.com)</title>'], | ||||
|             webpage, 'title') | ||||
|  | ||||
|         formats = [] | ||||
|         format_urls = set() | ||||
|  | ||||
|         sources = self._parse_json( | ||||
|             self._search_regex( | ||||
|                 r'sources\s*:\s*({.+?})\s*,?\s*\n', webpage, 'sources', | ||||
|                 default='{}'), | ||||
|             video_id, fatal=False) | ||||
|         for format_id, format_url in sources.items(): | ||||
|             if not isinstance(format_url, compat_str): | ||||
|                 continue | ||||
|             if format_url in format_urls: | ||||
|                 continue | ||||
|             format_urls.add(format_url) | ||||
|             formats.append({ | ||||
|                 'format_id': format_id, | ||||
|                 'url': format_url, | ||||
|                 'height': int_or_none(self._search_regex( | ||||
|                     r'^(\d+)[pP]', format_id, 'height', default=None)) | ||||
|             }) | ||||
|  | ||||
|         video_url = self._search_regex( | ||||
|             [r'''file\s*:\s*(?P<q>["'])(?P<mp4>.+?)(?P=q)''', | ||||
|              r'''<a\s+href=(?P<q>["'])(?P<mp4>.+?)(?P=q)\s+class=["']mp4Thumb''', | ||||
|              r'''<video[^>]+file=(?P<q>["'])(?P<mp4>.+?)(?P=q)[^>]*>'''], | ||||
|             webpage, 'video url', group='mp4', default=None) | ||||
|         if video_url and video_url not in format_urls: | ||||
|             formats.append({ | ||||
|                 'url': video_url, | ||||
|             }) | ||||
|  | ||||
|         self._sort_formats(formats) | ||||
|  | ||||
|         # Only a few videos have a description | ||||
|         mobj = re.search(r'<span>Description: </span>([^<]+)', webpage) | ||||
|         description = mobj.group(1) if mobj else None | ||||
| @@ -128,7 +153,8 @@ class XHamsterIE(InfoExtractor): | ||||
|             webpage, 'thumbnail', fatal=False, group='thumbnail') | ||||
|  | ||||
|         duration = parse_duration(self._search_regex( | ||||
|             r'Runtime:\s*</span>\s*([\d:]+)', webpage, | ||||
|             [r'<[^<]+\bitemprop=["\']duration["\'][^<]+\bcontent=["\'](.+?)["\']', | ||||
|              r'Runtime:\s*</span>\s*([\d:]+)'], webpage, | ||||
|             'duration', fatal=False)) | ||||
|  | ||||
|         view_count = int_or_none(self._search_regex( | ||||
| @@ -143,30 +169,6 @@ class XHamsterIE(InfoExtractor): | ||||
|  | ||||
|         age_limit = self._rta_search(webpage) | ||||
|  | ||||
|         hd = is_hd(webpage) | ||||
|  | ||||
|         format_id = 'hd' if hd else 'sd' | ||||
|  | ||||
|         video_url = extract_video_url(webpage, format_id) | ||||
|         formats = [{ | ||||
|             'url': video_url, | ||||
|             'format_id': 'hd' if hd else 'sd', | ||||
|             'preference': 1, | ||||
|         }] | ||||
|  | ||||
|         if not hd: | ||||
|             mrss_url = self._search_regex(r'<link rel="canonical" href="([^"]+)', webpage, 'mrss_url') | ||||
|             webpage = self._download_webpage(mrss_url + '?hd', video_id, note='Downloading HD webpage') | ||||
|             if is_hd(webpage): | ||||
|                 video_url = extract_video_url(webpage, 'hd') | ||||
|                 formats.append({ | ||||
|                     'url': video_url, | ||||
|                     'format_id': 'hd', | ||||
|                     'preference': 2, | ||||
|                 }) | ||||
|  | ||||
|         self._sort_formats(formats) | ||||
|  | ||||
|         categories_html = self._search_regex( | ||||
|             r'(?s)<table.+?(<span>Categories:.+?)</table>', webpage, | ||||
|             'categories', default=None) | ||||
| @@ -175,6 +177,7 @@ class XHamsterIE(InfoExtractor): | ||||
|  | ||||
|         return { | ||||
|             'id': video_id, | ||||
|             'display_id': display_id, | ||||
|             'title': title, | ||||
|             'description': description, | ||||
|             'upload_date': upload_date, | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Sergey M․
					Sergey M․