mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-31 06:35:12 +00:00 
			
		
		
		
	[extractor] Improve _generic_title
				
					
				
			This commit is contained in:
		| @@ -303,9 +303,7 @@ class ArteTVCategoryIE(ArteTVBaseIE): | ||||
|             if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )): | ||||
|                 items.append(video) | ||||
| 
 | ||||
|         title = (self._og_search_title(webpage, default=None) | ||||
|                  or self._html_search_regex(r'<title\b[^>]*>([^<]+)</title>', default=None)) | ||||
|         title = strip_or_none(title.rsplit('|', 1)[0]) or self._generic_title(url) | ||||
|         title = strip_or_none(self._generic_title('', webpage, default='').rsplit('|', 1)[0]) or None | ||||
| 
 | ||||
|         return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title, | ||||
|                                           description=self._og_search_description(webpage, default=None)) | ||||
|   | ||||
| @@ -898,12 +898,8 @@ class BBCIE(BBCCoUkIE): | ||||
|         json_ld_info = self._search_json_ld(webpage, playlist_id, default={}) | ||||
|         timestamp = json_ld_info.get('timestamp') | ||||
| 
 | ||||
|         playlist_title = json_ld_info.get('title') | ||||
|         if not playlist_title: | ||||
|             playlist_title = (self._og_search_title(webpage, default=None) | ||||
|                               or self._html_extract_title(webpage, 'playlist title', default=None)) | ||||
|             if playlist_title: | ||||
|                 playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip() | ||||
|         playlist_title = json_ld_info.get('title') or re.sub( | ||||
|             r'(.+)\s*-\s*BBC.*?$', r'\1', self._generic_title('', webpage, default='')).strip() or None | ||||
| 
 | ||||
|         playlist_description = json_ld_info.get( | ||||
|             'description') or self._og_search_description(webpage, default=None) | ||||
|   | ||||
| @@ -27,8 +27,7 @@ class BreitBartIE(InfoExtractor): | ||||
|         self._sort_formats(formats) | ||||
|         return { | ||||
|             'id': video_id, | ||||
|             'title': (self._og_search_title(webpage, default=None) | ||||
|                       or self._html_extract_title(webpage, 'video title')), | ||||
|             'title': self._generic_title('', webpage), | ||||
|             'description': self._og_search_description(webpage), | ||||
|             'thumbnail': self._og_search_thumbnail(webpage), | ||||
|             'age_limit': self._rta_search(webpage), | ||||
|   | ||||
| @@ -51,9 +51,7 @@ class CallinIE(InfoExtractor): | ||||
|         episode = next_data['props']['pageProps']['episode'] | ||||
| 
 | ||||
|         id = episode['id'] | ||||
|         title = (episode.get('title') | ||||
|                  or self._og_search_title(webpage, fatal=False) | ||||
|                  or self._html_extract_title(webpage)) | ||||
|         title = episode.get('title') or self._generic_title('', webpage) | ||||
|         url = episode['m3u8'] | ||||
|         formats = self._extract_m3u8_formats(url, display_id, ext='ts') | ||||
|         self._sort_formats(formats) | ||||
|   | ||||
| @@ -3820,9 +3820,11 @@ class InfoExtractor: | ||||
|     def _generic_id(url): | ||||
|         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def _generic_title(url): | ||||
|         return urllib.parse.unquote(os.path.splitext(url_basename(url))[0]) | ||||
|     def _generic_title(self, url='', webpage='', *, default=None): | ||||
|         return (self._og_search_title(webpage, default=None) | ||||
|                 or self._html_extract_title(webpage, default=None) | ||||
|                 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0]) | ||||
|                 or default) | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None): | ||||
|   | ||||
| @@ -275,8 +275,7 @@ class CSpanCongressIE(InfoExtractor): | ||||
|             self._search_regex(r'jwsetup\s*=\s*({(?:.|\n)[^;]+});', webpage, 'player config'), | ||||
|             video_id, transform_source=js_to_json) | ||||
| 
 | ||||
|         title = (self._og_search_title(webpage, default=None) | ||||
|                  or self._html_extract_title(webpage, 'video title')) | ||||
|         title = self._generic_title('', webpage) | ||||
|         description = (self._og_search_description(webpage, default=None) | ||||
|                        or self._html_search_meta('description', webpage, 'description', default=None)) | ||||
| 
 | ||||
|   | ||||
| @@ -71,7 +71,7 @@ class FiveTVIE(InfoExtractor): | ||||
|              r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"'], | ||||
|             webpage, 'video url') | ||||
| 
 | ||||
|         title = self._og_search_title(webpage, default=None) or self._html_extract_title(webpage) | ||||
|         title = self._generic_title('', webpage) | ||||
|         duration = int_or_none(self._og_search_property( | ||||
|             'video:duration', webpage, 'duration', default=None)) | ||||
| 
 | ||||
|   | ||||
| @@ -2740,8 +2740,7 @@ class GenericIE(InfoExtractor): | ||||
|             #   Site Name | Video Title | ||||
|             #   Video Title - Tagline | Site Name | ||||
|             # and so on and so forth; it's just not practical | ||||
|             'title': (self._og_search_title(webpage, default=None) | ||||
|                       or self._html_extract_title(webpage, 'video title', default='video')), | ||||
|             'title': self._generic_title('', webpage, default='video'), | ||||
|             'description': self._og_search_description(webpage, default=None), | ||||
|             'thumbnail': self._og_search_thumbnail(webpage, default=None), | ||||
|             'age_limit': self._rta_search(webpage), | ||||
|   | ||||
| @@ -20,7 +20,7 @@ class HTML5MediaEmbedIE(InfoExtractor): | ||||
|     ] | ||||
| 
 | ||||
|     def _extract_from_webpage(self, url, webpage): | ||||
|         video_id, title = self._generic_id(url), self._generic_title(url) | ||||
|         video_id, title = self._generic_id(url), self._generic_title(url, webpage) | ||||
|         entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') or [] | ||||
|         for num, entry in enumerate(entries, start=1): | ||||
|             entry.update({ | ||||
|   | ||||
| @@ -20,7 +20,7 @@ class GlideIE(InfoExtractor): | ||||
| 
 | ||||
|         webpage = self._download_webpage(url, video_id) | ||||
| 
 | ||||
|         title = self._html_extract_title(webpage, default=None) or self._og_search_title(webpage) | ||||
|         title = self._generic_title('', webpage) | ||||
|         video_url = self._proto_relative_url(self._search_regex( | ||||
|             r'<source[^>]+src=(["\'])(?P<url>.+?)\1', | ||||
|             webpage, 'video URL', default=None, | ||||
|   | ||||
| @@ -48,9 +48,7 @@ class MeipaiIE(InfoExtractor): | ||||
|         video_id = self._match_id(url) | ||||
|         webpage = self._download_webpage(url, video_id) | ||||
| 
 | ||||
|         title = self._og_search_title( | ||||
|             webpage, default=None) or self._html_search_regex( | ||||
|             r'<title[^>]*>([^<]+)</title>', webpage, 'title') | ||||
|         title = self._generic_title('', webpage) | ||||
| 
 | ||||
|         formats = [] | ||||
| 
 | ||||
|   | ||||
| @@ -321,8 +321,7 @@ class NhkForSchoolProgramListIE(InfoExtractor): | ||||
| 
 | ||||
|         webpage = self._download_webpage(f'https://www.nhk.or.jp/school/{program_id}/', program_id) | ||||
| 
 | ||||
|         title = (self._og_search_title(webpage) | ||||
|                  or self._html_extract_title(webpage) | ||||
|         title = (self._generic_title('', webpage) | ||||
|                  or self._html_search_regex(r'<h3>([^<]+?)とは?\s*</h3>', webpage, 'title', fatal=False)) | ||||
|         title = re.sub(r'\s*\|\s*NHK\s+for\s+School\s*$', '', title) if title else None | ||||
|         description = self._html_search_regex( | ||||
|   | ||||
| @@ -106,7 +106,6 @@ class OneNewsNZIE(InfoExtractor): | ||||
| 
 | ||||
|         playlist_title = ( | ||||
|             traverse_obj(fusion_metadata, ('headlines', 'basic')) | ||||
|             or self._og_search_title(webpage) | ||||
|             or self._html_extract_title(webpage) | ||||
|             or self._generic_title('', webpage) | ||||
|         ) | ||||
|         return self.playlist_result(entries, display_id, playlist_title) | ||||
|   | ||||
| @@ -166,7 +166,7 @@ class SteamCommunityBroadcastIE(InfoExtractor): | ||||
|         self._sort_formats(formats) | ||||
|         return { | ||||
|             'id': video_id, | ||||
|             'title': self._html_extract_title(webpage) or self._og_search_title(webpage), | ||||
|             'title': self._generic_title('', webpage), | ||||
|             'formats': formats, | ||||
|             'live_status': 'is_live', | ||||
|             'view_count': json_data.get('num_view'), | ||||
|   | ||||
| @@ -142,7 +142,7 @@ class TennisTVIE(InfoExtractor): | ||||
| 
 | ||||
|         return { | ||||
|             'id': video_id, | ||||
|             'title': self._html_extract_title(webpage) or self._og_search_title(webpage), | ||||
|             'title': self._generic_title('', webpage), | ||||
|             'description': self._html_search_regex( | ||||
|                 (r'<span itemprop="description" content=["\']([^"\']+)["\']>', *self._og_regexes('description')), | ||||
|                 webpage, 'description', fatal=False), | ||||
|   | ||||
| @@ -74,6 +74,6 @@ class TV24UAVideoIE(InfoExtractor): | ||||
|             'formats': formats, | ||||
|             'subtitles': subtitles, | ||||
|             'thumbnail': thumbnail or self._og_search_thumbnail(webpage), | ||||
|             'title': self._html_extract_title(webpage) or self._og_search_title(webpage), | ||||
|             'title': self._generic_title('', webpage), | ||||
|             'description': self._og_search_description(webpage, default=None), | ||||
|         } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 pukkandan
					pukkandan