mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-30 22:25:19 +00:00 
			
		
		
		
	[extractor/rutube] Extract chapters from description (#6345)
Authored by: mushbite
This commit is contained in:
		| @@ -3649,6 +3649,38 @@ class InfoExtractor: | |||||||
|                 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0]) |                 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0]) | ||||||
|                 or default) |                 or default) | ||||||
| 
 | 
 | ||||||
|  |     def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True): | ||||||
|  |         if not duration: | ||||||
|  |             return | ||||||
|  |         chapter_list = [{ | ||||||
|  |             'start_time': start_function(chapter), | ||||||
|  |             'title': title_function(chapter), | ||||||
|  |         } for chapter in chapter_list or []] | ||||||
|  |         if not strict: | ||||||
|  |             chapter_list.sort(key=lambda c: c['start_time'] or 0) | ||||||
|  | 
 | ||||||
|  |         chapters = [{'start_time': 0}] | ||||||
|  |         for idx, chapter in enumerate(chapter_list): | ||||||
|  |             if chapter['start_time'] is None: | ||||||
|  |                 self.report_warning(f'Incomplete chapter {idx}') | ||||||
|  |             elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration: | ||||||
|  |                 chapters.append(chapter) | ||||||
|  |             elif chapter not in chapters: | ||||||
|  |                 self.report_warning( | ||||||
|  |                     f'Invalid start time ({chapter["start_time"]} < {chapters[-1]["start_time"]}) for chapter "{chapter["title"]}"') | ||||||
|  |         return chapters[1:] | ||||||
|  | 
 | ||||||
def _extract_chapters_from_description(self, description, duration):
    """Build a chapter list from timestamped lines in a description.

    Lines of the form "<timestamp> <title>" are tried first. If that
    produces no chapters, lines of the form "<title> <timestamp>" are
    tried instead. Validation and non-strict ordering are delegated to
    self._extract_chapters_helper, whose result is returned as-is.
    """
    timestamp_re = r'(?:\d+:)?\d{1,2}:\d{2}'
    line_template = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
    text = description or ''

    # First attempt: the timestamp leads the line
    chapters = self._extract_chapters_helper(
        re.findall(line_template % (timestamp_re, r'.+?'), text),
        start_function=lambda groups: parse_duration(groups[0]),
        title_function=lambda groups: groups[1],
        duration=duration, strict=False)
    if chapters:
        return chapters

    # Fallback: the timestamp trails the title
    return self._extract_chapters_helper(
        re.findall(line_template % (r'.+?', timestamp_re), text),
        start_function=lambda groups: parse_duration(groups[1]),
        title_function=lambda groups: groups[0],
        duration=duration, strict=False)
|  | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None): |     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None): | ||||||
|         all_known = all(map( |         all_known = all(map( | ||||||
|   | |||||||
| @@ -25,8 +25,7 @@ class RutubeBaseIE(InfoExtractor): | |||||||
|             video_id, 'Downloading video JSON', |             video_id, 'Downloading video JSON', | ||||||
|             'Unable to download video JSON', query=query) |             'Unable to download video JSON', query=query) | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     def _extract_info(self, video, video_id=None, require_title=True): | ||||||
|     def _extract_info(video, video_id=None, require_title=True): |  | ||||||
|         title = video['title'] if require_title else video.get('title') |         title = video['title'] if require_title else video.get('title') | ||||||
| 
 | 
 | ||||||
|         age_limit = video.get('is_adult') |         age_limit = video.get('is_adult') | ||||||
| @@ -35,13 +34,15 @@ class RutubeBaseIE(InfoExtractor): | |||||||
| 
 | 
 | ||||||
|         uploader_id = try_get(video, lambda x: x['author']['id']) |         uploader_id = try_get(video, lambda x: x['author']['id']) | ||||||
|         category = try_get(video, lambda x: x['category']['name']) |         category = try_get(video, lambda x: x['category']['name']) | ||||||
|  |         description = video.get('description') | ||||||
|  |         duration = int_or_none(video.get('duration')) | ||||||
| 
 | 
 | ||||||
|         return { |         return { | ||||||
|             'id': video.get('id') or video_id if video_id else video['id'], |             'id': video.get('id') or video_id if video_id else video['id'], | ||||||
|             'title': title, |             'title': title, | ||||||
|             'description': video.get('description'), |             'description': description, | ||||||
|             'thumbnail': video.get('thumbnail_url'), |             'thumbnail': video.get('thumbnail_url'), | ||||||
|             'duration': int_or_none(video.get('duration')), |             'duration': duration, | ||||||
|             'uploader': try_get(video, lambda x: x['author']['name']), |             'uploader': try_get(video, lambda x: x['author']['name']), | ||||||
|             'uploader_id': compat_str(uploader_id) if uploader_id else None, |             'uploader_id': compat_str(uploader_id) if uploader_id else None, | ||||||
|             'timestamp': unified_timestamp(video.get('created_ts')), |             'timestamp': unified_timestamp(video.get('created_ts')), | ||||||
| @@ -50,6 +51,7 @@ class RutubeBaseIE(InfoExtractor): | |||||||
|             'view_count': int_or_none(video.get('hits')), |             'view_count': int_or_none(video.get('hits')), | ||||||
|             'comment_count': int_or_none(video.get('comments_count')), |             'comment_count': int_or_none(video.get('comments_count')), | ||||||
|             'is_live': bool_or_none(video.get('is_livestream')), |             'is_live': bool_or_none(video.get('is_livestream')), | ||||||
|  |             'chapters': self._extract_chapters_from_description(description, duration), | ||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
|     def _download_and_extract_info(self, video_id, query=None): |     def _download_and_extract_info(self, video_id, query=None): | ||||||
| @@ -111,8 +113,9 @@ class RutubeIE(RutubeBaseIE): | |||||||
|             'view_count': int, |             'view_count': int, | ||||||
|             'thumbnail': 'http://pic.rutubelist.ru/video/d2/a0/d2a0aec998494a396deafc7ba2c82add.jpg', |             'thumbnail': 'http://pic.rutubelist.ru/video/d2/a0/d2a0aec998494a396deafc7ba2c82add.jpg', | ||||||
|             'category': ['Новости и СМИ'], |             'category': ['Новости и СМИ'], | ||||||
| 
 |             'chapters': [], | ||||||
|         }, |         }, | ||||||
|  |         'expected_warnings': ['Unable to download f4m'], | ||||||
|     }, { |     }, { | ||||||
|         'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661', |         'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661', | ||||||
|         'only_matching': True, |         'only_matching': True, | ||||||
| @@ -142,7 +145,28 @@ class RutubeIE(RutubeBaseIE): | |||||||
|             'view_count': int, |             'view_count': int, | ||||||
|             'thumbnail': 'http://pic.rutubelist.ru/video/f2/d4/f2d42b54be0a6e69c1c22539e3152156.jpg', |             'thumbnail': 'http://pic.rutubelist.ru/video/f2/d4/f2d42b54be0a6e69c1c22539e3152156.jpg', | ||||||
|             'category': ['Видеоигры'], |             'category': ['Видеоигры'], | ||||||
|  |             'chapters': [], | ||||||
|         }, |         }, | ||||||
|  |         'expected_warnings': ['Unable to download f4m'], | ||||||
|  |     }, { | ||||||
|  |         'url': 'https://rutube.ru/video/c65b465ad0c98c89f3b25cb03dcc87c6/', | ||||||
|  |         'info_dict': { | ||||||
|  |             'id': 'c65b465ad0c98c89f3b25cb03dcc87c6', | ||||||
|  |             'ext': 'mp4', | ||||||
|  |             'chapters': 'count:4', | ||||||
|  |             'category': ['Бизнес и предпринимательство'], | ||||||
|  |             'description': 'md5:252feac1305257d8c1bab215cedde75d', | ||||||
|  |             'thumbnail': 'http://pic.rutubelist.ru/video/71/8f/718f27425ea9706073eb80883dd3787b.png', | ||||||
|  |             'duration': 782, | ||||||
|  |             'age_limit': 0, | ||||||
|  |             'uploader_id': '23491359', | ||||||
|  |             'timestamp': 1677153329, | ||||||
|  |             'view_count': int, | ||||||
|  |             'upload_date': '20230223', | ||||||
|  |             'title': 'Бизнес с нуля: найм сотрудников. Интервью с директором строительной компании', | ||||||
|  |             'uploader': 'Стас Быков', | ||||||
|  |         }, | ||||||
|  |         'expected_warnings': ['Unable to download f4m'], | ||||||
|     }] |     }] | ||||||
| 
 | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|   | |||||||
| @@ -3205,11 +3205,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | |||||||
|                 'decoratedPlayerBarRenderer', 'playerBar', 'chapteredPlayerBarRenderer', 'chapters' |                 'decoratedPlayerBarRenderer', 'playerBar', 'chapteredPlayerBarRenderer', 'chapters' | ||||||
|             ), expected_type=list) |             ), expected_type=list) | ||||||
| 
 | 
 | ||||||
|         return self._extract_chapters( |         return self._extract_chapters_helper( | ||||||
|             chapter_list, |             chapter_list, | ||||||
|             chapter_time=lambda chapter: float_or_none( |             start_function=lambda chapter: float_or_none( | ||||||
|                 traverse_obj(chapter, ('chapterRenderer', 'timeRangeStartMillis')), scale=1000), |                 traverse_obj(chapter, ('chapterRenderer', 'timeRangeStartMillis')), scale=1000), | ||||||
|             chapter_title=lambda chapter: traverse_obj( |             title_function=lambda chapter: traverse_obj( | ||||||
|                 chapter, ('chapterRenderer', 'title', 'simpleText'), expected_type=str), |                 chapter, ('chapterRenderer', 'title', 'simpleText'), expected_type=str), | ||||||
|             duration=duration) |             duration=duration) | ||||||
| 
 | 
 | ||||||
| @@ -3222,42 +3222,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | |||||||
|         chapter_title = lambda chapter: self._get_text(chapter, 'title') |         chapter_title = lambda chapter: self._get_text(chapter, 'title') | ||||||
| 
 | 
 | ||||||
|         return next(filter(None, ( |         return next(filter(None, ( | ||||||
|             self._extract_chapters(traverse_obj(contents, (..., 'macroMarkersListItemRenderer')), |             self._extract_chapters_helper(traverse_obj(contents, (..., 'macroMarkersListItemRenderer')), | ||||||
|                                           chapter_time, chapter_title, duration) |                                           chapter_time, chapter_title, duration) | ||||||
|             for contents in content_list)), []) |             for contents in content_list)), []) | ||||||
| 
 | 
 | ||||||
def _extract_chapters_from_description(self, description, duration):
    """Build a chapter list from timestamped lines in a description.

    Lines of the form "<timestamp> <title>" are tried first. If that
    produces no chapters, lines of the form "<title> <timestamp>" are
    tried instead. Validation and non-strict ordering are delegated to
    self._extract_chapters, whose result is returned as-is.
    """
    timestamp_re = r'(?:\d+:)?\d{1,2}:\d{2}'
    line_template = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
    text = description or ''

    # First attempt: the timestamp leads the line
    chapters = self._extract_chapters(
        re.findall(line_template % (timestamp_re, r'.+?'), text),
        chapter_time=lambda groups: parse_duration(groups[0]),
        chapter_title=lambda groups: groups[1],
        duration=duration, strict=False)
    if chapters:
        return chapters

    # Fallback: the timestamp trails the title
    return self._extract_chapters(
        re.findall(line_template % (r'.+?', timestamp_re), text),
        chapter_time=lambda groups: parse_duration(groups[1]),
        chapter_title=lambda groups: groups[0],
        duration=duration, strict=False)
| 
 |  | ||||||
|     def _extract_chapters(self, chapter_list, chapter_time, chapter_title, duration, strict=True): |  | ||||||
|         if not duration: |  | ||||||
|             return |  | ||||||
|         chapter_list = [{ |  | ||||||
|             'start_time': chapter_time(chapter), |  | ||||||
|             'title': chapter_title(chapter), |  | ||||||
|         } for chapter in chapter_list or []] |  | ||||||
|         if not strict: |  | ||||||
|             chapter_list.sort(key=lambda c: c['start_time'] or 0) |  | ||||||
| 
 |  | ||||||
|         chapters = [{'start_time': 0}] |  | ||||||
|         for idx, chapter in enumerate(chapter_list): |  | ||||||
|             if chapter['start_time'] is None: |  | ||||||
|                 self.report_warning(f'Incomplete chapter {idx}') |  | ||||||
|             elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration: |  | ||||||
|                 chapters.append(chapter) |  | ||||||
|             elif chapter not in chapters: |  | ||||||
|                 self.report_warning( |  | ||||||
|                     f'Invalid start time ({chapter["start_time"]} < {chapters[-1]["start_time"]}) for chapter "{chapter["title"]}"') |  | ||||||
|         return chapters[1:] |  | ||||||
| 
 |  | ||||||
|     def _extract_comment(self, comment_renderer, parent=None): |     def _extract_comment(self, comment_renderer, parent=None): | ||||||
|         comment_id = comment_renderer.get('commentId') |         comment_id = comment_renderer.get('commentId') | ||||||
|         if not comment_id: |         if not comment_id: | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 mushbite
					mushbite