mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-31 14:45:14 +00:00 
			
		
		
		
	[youtube:tab] Extract more metadata from feeds/channels/playlists (#1018)
Parse relative time text, and extract live/upcoming status, availability and channel id from feeds/channels/playlists (where applicable). Closes #1883. Authored-by: coletdjnz
This commit is contained in:
		| @@ -55,6 +55,7 @@ from ..utils import ( | ||||
|     smuggle_url, | ||||
|     str_or_none, | ||||
|     str_to_int, | ||||
|     strftime_or_none, | ||||
|     traverse_obj, | ||||
|     try_get, | ||||
|     unescapeHTML, | ||||
| @@ -358,7 +359,20 @@ class YoutubeBaseInfoExtractor(InfoExtractor): | ||||
|             consent_id = random.randint(100, 999) | ||||
|         self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id) | ||||
|  | ||||
|     def _initialize_pref(self): | ||||
|         cookies = self._get_cookies('https://www.youtube.com/') | ||||
|         pref_cookie = cookies.get('PREF') | ||||
|         pref = {} | ||||
|         if pref_cookie: | ||||
|             try: | ||||
|                 pref = dict(compat_urlparse.parse_qsl(pref_cookie.value)) | ||||
|             except ValueError: | ||||
|                 self.report_warning('Failed to parse user PREF cookie' + bug_reports_message()) | ||||
|         pref.update({'hl': 'en'}) | ||||
|         self._set_cookie('.youtube.com', name='PREF', value=compat_urllib_parse_urlencode(pref)) | ||||
|  | ||||
|     def _real_initialize(self): | ||||
|         self._initialize_pref() | ||||
|         self._initialize_consent() | ||||
|         self._login() | ||||
|  | ||||
| @@ -391,23 +405,10 @@ class YoutubeBaseInfoExtractor(InfoExtractor): | ||||
|         return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_API_KEY'], compat_str, default_client) | ||||
|  | ||||
|     def _extract_context(self, ytcfg=None, default_client='web'): | ||||
|         _get_context = lambda y: try_get(y, lambda x: x['INNERTUBE_CONTEXT'], dict) | ||||
|         context = _get_context(ytcfg) | ||||
|         if context: | ||||
|             return context | ||||
|  | ||||
|         context = _get_context(self._get_default_ytcfg(default_client)) | ||||
|         if not ytcfg: | ||||
|             return context | ||||
|  | ||||
|         # Recreate the client context (required) | ||||
|         context['client'].update({ | ||||
|             'clientVersion': self._extract_client_version(ytcfg, default_client), | ||||
|             'clientName': self._extract_client_name(ytcfg, default_client), | ||||
|         }) | ||||
|         visitor_data = try_get(ytcfg, lambda x: x['VISITOR_DATA'], compat_str) | ||||
|         if visitor_data: | ||||
|             context['client']['visitorData'] = visitor_data | ||||
|         context = get_first( | ||||
|             (ytcfg, self._get_default_ytcfg(default_client)), 'INNERTUBE_CONTEXT', expected_type=dict) | ||||
|         # Enforce language for extraction | ||||
|         traverse_obj(context, 'client', expected_type=dict, default={})['hl'] = 'en' | ||||
|         return context | ||||
|  | ||||
|     _SAPISID = None | ||||
| @@ -664,6 +665,29 @@ class YoutubeBaseInfoExtractor(InfoExtractor): | ||||
|                 if text: | ||||
|                     return text | ||||
|  | ||||
|     @staticmethod | ||||
|     def extract_relative_time(relative_time_text): | ||||
|         """ | ||||
|         Extracts a relative time from string and converts to dt object | ||||
|         e.g. 'streamed 6 days ago', '5 seconds ago (edited)' | ||||
|         """ | ||||
|         mobj = re.search(r'(?P<time>\d+)\s*(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?\s*ago', relative_time_text) | ||||
|         if mobj: | ||||
|             try: | ||||
|                 return datetime_from_str('now-%s%s' % (mobj.group('time'), mobj.group('unit')), precision='auto') | ||||
|             except ValueError: | ||||
|                 return None | ||||
|  | ||||
|     def _extract_time_text(self, renderer, *path_list): | ||||
|         text = self._get_text(renderer, *path_list) or '' | ||||
|         dt = self.extract_relative_time(text) | ||||
|         timestamp = None | ||||
|         if isinstance(dt, datetime.datetime): | ||||
|             timestamp = calendar.timegm(dt.timetuple()) | ||||
|         if text and timestamp is None: | ||||
|             self.report_warning('Cannot parse localized time text' + bug_reports_message(), only_once=True) | ||||
|         return timestamp, text | ||||
|  | ||||
|     def _extract_response(self, item_id, query, note='Downloading API JSON', headers=None, | ||||
|                           ytcfg=None, check_get_keys=None, ep='browse', fatal=True, api_hostname=None, | ||||
|                           default_client='web'): | ||||
| @@ -750,7 +774,13 @@ class YoutubeBaseInfoExtractor(InfoExtractor): | ||||
|             'view count', default=None)) | ||||
|  | ||||
|         uploader = self._get_text(renderer, 'ownerText', 'shortBylineText') | ||||
|  | ||||
|         channel_id = traverse_obj( | ||||
|             renderer, ('shortBylineText', 'runs', ..., 'navigationEndpoint', 'browseEndpoint', 'browseId'), expected_type=str, get_all=False) | ||||
|         timestamp, time_text = self._extract_time_text(renderer, 'publishedTimeText') | ||||
|         scheduled_timestamp = str_to_int(traverse_obj(renderer, ('upcomingEventData', 'startTime'), get_all=False)) | ||||
|         overlay_style = traverse_obj( | ||||
|             renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'), get_all=False, expected_type=str) | ||||
|         badges = self._extract_badges(renderer) | ||||
|         return { | ||||
|             '_type': 'url', | ||||
|             'ie_key': YoutubeIE.ie_key(), | ||||
| @@ -761,6 +791,14 @@ class YoutubeBaseInfoExtractor(InfoExtractor): | ||||
|             'duration': duration, | ||||
|             'view_count': view_count, | ||||
|             'uploader': uploader, | ||||
|             'channel_id': channel_id, | ||||
|             'upload_date': strftime_or_none(timestamp, '%Y%m%d'), | ||||
|             'live_status': ('is_upcoming' if scheduled_timestamp is not None | ||||
|                             else 'was_live' if 'streamed' in time_text.lower() | ||||
|                             else 'is_live' if overlay_style is not None and overlay_style == 'LIVE' or 'live now' in badges | ||||
|                             else None), | ||||
|             'release_timestamp': scheduled_timestamp, | ||||
|             'availability': self._availability(needs_premium='premium' in badges, needs_subscription='members only' in badges) | ||||
|         } | ||||
|  | ||||
|  | ||||
| @@ -2064,19 +2102,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | ||||
|             (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE), | ||||
|              regex), webpage, name, default='{}'), video_id, fatal=False) | ||||
|  | ||||
|     @staticmethod | ||||
|     def parse_time_text(time_text): | ||||
|         """ | ||||
|         Parse the comment time text | ||||
|         time_text is in the format 'X units ago (edited)' | ||||
|         """ | ||||
|         time_text_split = time_text.split(' ') | ||||
|         if len(time_text_split) >= 3: | ||||
|             try: | ||||
|                 return datetime_from_str('now-%s%s' % (time_text_split[0], time_text_split[1]), precision='auto') | ||||
|             except ValueError: | ||||
|                 return None | ||||
|  | ||||
|     def _extract_comment(self, comment_renderer, parent=None): | ||||
|         comment_id = comment_renderer.get('commentId') | ||||
|         if not comment_id: | ||||
| @@ -2085,10 +2110,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | ||||
|         text = self._get_text(comment_renderer, 'contentText') | ||||
|  | ||||
|         # note: timestamp is an estimate calculated from the current time and time_text | ||||
|         time_text = self._get_text(comment_renderer, 'publishedTimeText') or '' | ||||
|         time_text_dt = self.parse_time_text(time_text) | ||||
|         if isinstance(time_text_dt, datetime.datetime): | ||||
|             timestamp = calendar.timegm(time_text_dt.timetuple()) | ||||
|         timestamp, time_text = self._extract_time_text(comment_renderer, 'publishedTimeText') | ||||
|         author = self._get_text(comment_renderer, 'authorText') | ||||
|         author_id = try_get(comment_renderer, | ||||
|                             lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str) | ||||
| @@ -2261,11 +2283,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | ||||
|             yield from self._comment_entries(renderer, ytcfg, video_id) | ||||
|  | ||||
|         max_comments = int_or_none(self._configuration_arg('max_comments', [''])[0]) | ||||
|         # Force English regardless of account setting to prevent parsing issues | ||||
|         # See: https://github.com/yt-dlp/yt-dlp/issues/532 | ||||
|         ytcfg = copy.deepcopy(ytcfg) | ||||
|         traverse_obj( | ||||
|             ytcfg, ('INNERTUBE_CONTEXT', 'client'), expected_type=dict, default={})['hl'] = 'en' | ||||
|         return itertools.islice(_real_comment_extract(contents), 0, max_comments) | ||||
|  | ||||
|     @staticmethod | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 coletdjnz
					coletdjnz