mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-31 06:35:12 +00:00 
			
		
		
		
	[extractor] Improve _generic_title
				
					
				
			This commit is contained in:
		| @@ -303,9 +303,7 @@ class ArteTVCategoryIE(ArteTVBaseIE): | |||||||
|             if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )): |             if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )): | ||||||
|                 items.append(video) |                 items.append(video) | ||||||
| 
 | 
 | ||||||
|         title = (self._og_search_title(webpage, default=None) |         title = strip_or_none(self._generic_title('', webpage, default='').rsplit('|', 1)[0]) or None | ||||||
|                  or self._html_search_regex(r'<title\b[^>]*>([^<]+)</title>', default=None)) |  | ||||||
|         title = strip_or_none(title.rsplit('|', 1)[0]) or self._generic_title(url) |  | ||||||
| 
 | 
 | ||||||
|         return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title, |         return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title, | ||||||
|                                           description=self._og_search_description(webpage, default=None)) |                                           description=self._og_search_description(webpage, default=None)) | ||||||
|   | |||||||
| @@ -898,12 +898,8 @@ class BBCIE(BBCCoUkIE): | |||||||
|         json_ld_info = self._search_json_ld(webpage, playlist_id, default={}) |         json_ld_info = self._search_json_ld(webpage, playlist_id, default={}) | ||||||
|         timestamp = json_ld_info.get('timestamp') |         timestamp = json_ld_info.get('timestamp') | ||||||
| 
 | 
 | ||||||
|         playlist_title = json_ld_info.get('title') |         playlist_title = json_ld_info.get('title') or re.sub( | ||||||
|         if not playlist_title: |             r'(.+)\s*-\s*BBC.*?$', r'\1', self._generic_title('', webpage, default='')).strip() or None | ||||||
|             playlist_title = (self._og_search_title(webpage, default=None) |  | ||||||
|                               or self._html_extract_title(webpage, 'playlist title', default=None)) |  | ||||||
|             if playlist_title: |  | ||||||
|                 playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip() |  | ||||||
| 
 | 
 | ||||||
|         playlist_description = json_ld_info.get( |         playlist_description = json_ld_info.get( | ||||||
|             'description') or self._og_search_description(webpage, default=None) |             'description') or self._og_search_description(webpage, default=None) | ||||||
|   | |||||||
| @@ -27,8 +27,7 @@ class BreitBartIE(InfoExtractor): | |||||||
|         self._sort_formats(formats) |         self._sort_formats(formats) | ||||||
|         return { |         return { | ||||||
|             'id': video_id, |             'id': video_id, | ||||||
|             'title': (self._og_search_title(webpage, default=None) |             'title': self._generic_title('', webpage), | ||||||
|                       or self._html_extract_title(webpage, 'video title')), |  | ||||||
|             'description': self._og_search_description(webpage), |             'description': self._og_search_description(webpage), | ||||||
|             'thumbnail': self._og_search_thumbnail(webpage), |             'thumbnail': self._og_search_thumbnail(webpage), | ||||||
|             'age_limit': self._rta_search(webpage), |             'age_limit': self._rta_search(webpage), | ||||||
|   | |||||||
| @@ -51,9 +51,7 @@ class CallinIE(InfoExtractor): | |||||||
|         episode = next_data['props']['pageProps']['episode'] |         episode = next_data['props']['pageProps']['episode'] | ||||||
| 
 | 
 | ||||||
|         id = episode['id'] |         id = episode['id'] | ||||||
|         title = (episode.get('title') |         title = episode.get('title') or self._generic_title('', webpage) | ||||||
|                  or self._og_search_title(webpage, fatal=False) |  | ||||||
|                  or self._html_extract_title(webpage)) |  | ||||||
|         url = episode['m3u8'] |         url = episode['m3u8'] | ||||||
|         formats = self._extract_m3u8_formats(url, display_id, ext='ts') |         formats = self._extract_m3u8_formats(url, display_id, ext='ts') | ||||||
|         self._sort_formats(formats) |         self._sort_formats(formats) | ||||||
|   | |||||||
| @@ -3820,9 +3820,11 @@ class InfoExtractor: | |||||||
|     def _generic_id(url): |     def _generic_id(url): | ||||||
|         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) |         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     def _generic_title(self, url='', webpage='', *, default=None): | ||||||
|     def _generic_title(url): |         return (self._og_search_title(webpage, default=None) | ||||||
|         return urllib.parse.unquote(os.path.splitext(url_basename(url))[0]) |                 or self._html_extract_title(webpage, default=None) | ||||||
|  |                 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0]) | ||||||
|  |                 or default) | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None): |     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None): | ||||||
|   | |||||||
| @@ -275,8 +275,7 @@ class CSpanCongressIE(InfoExtractor): | |||||||
|             self._search_regex(r'jwsetup\s*=\s*({(?:.|\n)[^;]+});', webpage, 'player config'), |             self._search_regex(r'jwsetup\s*=\s*({(?:.|\n)[^;]+});', webpage, 'player config'), | ||||||
|             video_id, transform_source=js_to_json) |             video_id, transform_source=js_to_json) | ||||||
| 
 | 
 | ||||||
|         title = (self._og_search_title(webpage, default=None) |         title = self._generic_title('', webpage) | ||||||
|                  or self._html_extract_title(webpage, 'video title')) |  | ||||||
|         description = (self._og_search_description(webpage, default=None) |         description = (self._og_search_description(webpage, default=None) | ||||||
|                        or self._html_search_meta('description', webpage, 'description', default=None)) |                        or self._html_search_meta('description', webpage, 'description', default=None)) | ||||||
| 
 | 
 | ||||||
|   | |||||||
| @@ -71,7 +71,7 @@ class FiveTVIE(InfoExtractor): | |||||||
|              r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"'], |              r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"'], | ||||||
|             webpage, 'video url') |             webpage, 'video url') | ||||||
| 
 | 
 | ||||||
|         title = self._og_search_title(webpage, default=None) or self._html_extract_title(webpage) |         title = self._generic_title('', webpage) | ||||||
|         duration = int_or_none(self._og_search_property( |         duration = int_or_none(self._og_search_property( | ||||||
|             'video:duration', webpage, 'duration', default=None)) |             'video:duration', webpage, 'duration', default=None)) | ||||||
| 
 | 
 | ||||||
|   | |||||||
| @@ -2740,8 +2740,7 @@ class GenericIE(InfoExtractor): | |||||||
|             #   Site Name | Video Title |             #   Site Name | Video Title | ||||||
|             #   Video Title - Tagline | Site Name |             #   Video Title - Tagline | Site Name | ||||||
|             # and so on and so forth; it's just not practical |             # and so on and so forth; it's just not practical | ||||||
|             'title': (self._og_search_title(webpage, default=None) |             'title': self._generic_title('', webpage, default='video'), | ||||||
|                       or self._html_extract_title(webpage, 'video title', default='video')), |  | ||||||
|             'description': self._og_search_description(webpage, default=None), |             'description': self._og_search_description(webpage, default=None), | ||||||
|             'thumbnail': self._og_search_thumbnail(webpage, default=None), |             'thumbnail': self._og_search_thumbnail(webpage, default=None), | ||||||
|             'age_limit': self._rta_search(webpage), |             'age_limit': self._rta_search(webpage), | ||||||
|   | |||||||
| @@ -20,7 +20,7 @@ class HTML5MediaEmbedIE(InfoExtractor): | |||||||
|     ] |     ] | ||||||
| 
 | 
 | ||||||
|     def _extract_from_webpage(self, url, webpage): |     def _extract_from_webpage(self, url, webpage): | ||||||
|         video_id, title = self._generic_id(url), self._generic_title(url) |         video_id, title = self._generic_id(url), self._generic_title(url, webpage) | ||||||
|         entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') or [] |         entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') or [] | ||||||
|         for num, entry in enumerate(entries, start=1): |         for num, entry in enumerate(entries, start=1): | ||||||
|             entry.update({ |             entry.update({ | ||||||
|   | |||||||
| @@ -20,7 +20,7 @@ class GlideIE(InfoExtractor): | |||||||
| 
 | 
 | ||||||
|         webpage = self._download_webpage(url, video_id) |         webpage = self._download_webpage(url, video_id) | ||||||
| 
 | 
 | ||||||
|         title = self._html_extract_title(webpage, default=None) or self._og_search_title(webpage) |         title = self._generic_title('', webpage) | ||||||
|         video_url = self._proto_relative_url(self._search_regex( |         video_url = self._proto_relative_url(self._search_regex( | ||||||
|             r'<source[^>]+src=(["\'])(?P<url>.+?)\1', |             r'<source[^>]+src=(["\'])(?P<url>.+?)\1', | ||||||
|             webpage, 'video URL', default=None, |             webpage, 'video URL', default=None, | ||||||
|   | |||||||
| @@ -48,9 +48,7 @@ class MeipaiIE(InfoExtractor): | |||||||
|         video_id = self._match_id(url) |         video_id = self._match_id(url) | ||||||
|         webpage = self._download_webpage(url, video_id) |         webpage = self._download_webpage(url, video_id) | ||||||
| 
 | 
 | ||||||
|         title = self._og_search_title( |         title = self._generic_title('', webpage) | ||||||
|             webpage, default=None) or self._html_search_regex( |  | ||||||
|             r'<title[^>]*>([^<]+)</title>', webpage, 'title') |  | ||||||
| 
 | 
 | ||||||
|         formats = [] |         formats = [] | ||||||
| 
 | 
 | ||||||
|   | |||||||
| @@ -321,8 +321,7 @@ class NhkForSchoolProgramListIE(InfoExtractor): | |||||||
| 
 | 
 | ||||||
|         webpage = self._download_webpage(f'https://www.nhk.or.jp/school/{program_id}/', program_id) |         webpage = self._download_webpage(f'https://www.nhk.or.jp/school/{program_id}/', program_id) | ||||||
| 
 | 
 | ||||||
|         title = (self._og_search_title(webpage) |         title = (self._generic_title('', webpage) | ||||||
|                  or self._html_extract_title(webpage) |  | ||||||
|                  or self._html_search_regex(r'<h3>([^<]+?)とは?\s*</h3>', webpage, 'title', fatal=False)) |                  or self._html_search_regex(r'<h3>([^<]+?)とは?\s*</h3>', webpage, 'title', fatal=False)) | ||||||
|         title = re.sub(r'\s*\|\s*NHK\s+for\s+School\s*$', '', title) if title else None |         title = re.sub(r'\s*\|\s*NHK\s+for\s+School\s*$', '', title) if title else None | ||||||
|         description = self._html_search_regex( |         description = self._html_search_regex( | ||||||
|   | |||||||
| @@ -106,7 +106,6 @@ class OneNewsNZIE(InfoExtractor): | |||||||
| 
 | 
 | ||||||
|         playlist_title = ( |         playlist_title = ( | ||||||
|             traverse_obj(fusion_metadata, ('headlines', 'basic')) |             traverse_obj(fusion_metadata, ('headlines', 'basic')) | ||||||
|             or self._og_search_title(webpage) |             or self._generic_title('', webpage) | ||||||
|             or self._html_extract_title(webpage) |  | ||||||
|         ) |         ) | ||||||
|         return self.playlist_result(entries, display_id, playlist_title) |         return self.playlist_result(entries, display_id, playlist_title) | ||||||
|   | |||||||
| @@ -166,7 +166,7 @@ class SteamCommunityBroadcastIE(InfoExtractor): | |||||||
|         self._sort_formats(formats) |         self._sort_formats(formats) | ||||||
|         return { |         return { | ||||||
|             'id': video_id, |             'id': video_id, | ||||||
|             'title': self._html_extract_title(webpage) or self._og_search_title(webpage), |             'title': self._generic_title('', webpage), | ||||||
|             'formats': formats, |             'formats': formats, | ||||||
|             'live_status': 'is_live', |             'live_status': 'is_live', | ||||||
|             'view_count': json_data.get('num_view'), |             'view_count': json_data.get('num_view'), | ||||||
|   | |||||||
| @@ -142,7 +142,7 @@ class TennisTVIE(InfoExtractor): | |||||||
| 
 | 
 | ||||||
|         return { |         return { | ||||||
|             'id': video_id, |             'id': video_id, | ||||||
|             'title': self._html_extract_title(webpage) or self._og_search_title(webpage), |             'title': self._generic_title('', webpage), | ||||||
|             'description': self._html_search_regex( |             'description': self._html_search_regex( | ||||||
|                 (r'<span itemprop="description" content=["\']([^"\']+)["\']>', *self._og_regexes('description')), |                 (r'<span itemprop="description" content=["\']([^"\']+)["\']>', *self._og_regexes('description')), | ||||||
|                 webpage, 'description', fatal=False), |                 webpage, 'description', fatal=False), | ||||||
|   | |||||||
| @@ -74,6 +74,6 @@ class TV24UAVideoIE(InfoExtractor): | |||||||
|             'formats': formats, |             'formats': formats, | ||||||
|             'subtitles': subtitles, |             'subtitles': subtitles, | ||||||
|             'thumbnail': thumbnail or self._og_search_thumbnail(webpage), |             'thumbnail': thumbnail or self._og_search_thumbnail(webpage), | ||||||
|             'title': self._html_extract_title(webpage) or self._og_search_title(webpage), |             'title': self._generic_title('', webpage), | ||||||
|             'description': self._og_search_description(webpage, default=None), |             'description': self._og_search_description(webpage, default=None), | ||||||
|         } |         } | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 pukkandan
					pukkandan