mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-31 14:45:14 +00:00 
			
		
		
		
	[youtube] Extract comments' approximate timestamp (#221)
Authored by: colethedj
This commit is contained in:
		| @@ -2,6 +2,7 @@ | |||||||
|  |  | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
|  |  | ||||||
|  | import calendar | ||||||
| import hashlib | import hashlib | ||||||
| import itertools | import itertools | ||||||
| import json | import json | ||||||
| @@ -27,6 +28,7 @@ from ..utils import ( | |||||||
|     bool_or_none, |     bool_or_none, | ||||||
|     clean_html, |     clean_html, | ||||||
|     dict_get, |     dict_get, | ||||||
|  |     datetime_from_str, | ||||||
|     ExtractorError, |     ExtractorError, | ||||||
|     format_field, |     format_field, | ||||||
|     float_or_none, |     float_or_none, | ||||||
| @@ -46,7 +48,7 @@ from ..utils import ( | |||||||
|     update_url_query, |     update_url_query, | ||||||
|     url_or_none, |     url_or_none, | ||||||
|     urlencode_postdata, |     urlencode_postdata, | ||||||
|     urljoin, |     urljoin | ||||||
| ) | ) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -1499,6 +1501,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | |||||||
|             (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE), |             (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE), | ||||||
|              regex), webpage, name, default='{}'), video_id, fatal=False) |              regex), webpage, name, default='{}'), video_id, fatal=False) | ||||||
|  |  | ||||||
|  |     @staticmethod | ||||||
|  |     def parse_time_text(time_text): | ||||||
|  |         """ | ||||||
|  |         Parse the comment time text into an approximate datetime. | ||||||
|  |         time_text is in the format 'X units ago (edited)'; returns None if it has fewer than three space-separated parts. | ||||||
|  |         """ | ||||||
|  |         time_text_split = time_text.split(' ') | ||||||
|  |         if len(time_text_split) >= 3: | ||||||
|  |             return datetime_from_str('now-%s%s' % (time_text_split[0], time_text_split[1]), precision='auto') | ||||||
|  |  | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     def _join_text_entries(runs): |     def _join_text_entries(runs): | ||||||
|         text = None |         text = None | ||||||
| @@ -1521,7 +1533,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | |||||||
|         text = self._join_text_entries(comment_text_runs) or '' |         text = self._join_text_entries(comment_text_runs) or '' | ||||||
|         comment_time_text = try_get(comment_renderer, lambda x: x['publishedTimeText']['runs']) or [] |         comment_time_text = try_get(comment_renderer, lambda x: x['publishedTimeText']['runs']) or [] | ||||||
|         time_text = self._join_text_entries(comment_time_text) |         time_text = self._join_text_entries(comment_time_text) | ||||||
|  |         timestamp = calendar.timegm(self.parse_time_text(time_text).timetuple()) | ||||||
|         author = try_get(comment_renderer, lambda x: x['authorText']['simpleText'], compat_str) |         author = try_get(comment_renderer, lambda x: x['authorText']['simpleText'], compat_str) | ||||||
|         author_id = try_get(comment_renderer, |         author_id = try_get(comment_renderer, | ||||||
|                             lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str) |                             lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str) | ||||||
| @@ -1532,11 +1544,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | |||||||
|  |  | ||||||
|         author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool) |         author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool) | ||||||
|         is_liked = try_get(comment_renderer, lambda x: x['isLiked'], bool) |         is_liked = try_get(comment_renderer, lambda x: x['isLiked'], bool) | ||||||
|  |  | ||||||
|         return { |         return { | ||||||
|             'id': comment_id, |             'id': comment_id, | ||||||
|             'text': text, |             'text': text, | ||||||
|             # TODO: This should be parsed to timestamp |             'timestamp': timestamp, | ||||||
|             'time_text': time_text, |             'time_text': time_text, | ||||||
|             'like_count': votes, |             'like_count': votes, | ||||||
|             'is_favorited': is_liked, |             'is_favorited': is_liked, | ||||||
| @@ -1624,12 +1635,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | |||||||
|                     comment_prog_str = '(%d/%d)' % (comment_counts[0], comment_counts[1]) |                     comment_prog_str = '(%d/%d)' % (comment_counts[0], comment_counts[1]) | ||||||
|                     if page_num == 0: |                     if page_num == 0: | ||||||
|                         if first_continuation: |                         if first_continuation: | ||||||
|                             note_prefix = "Downloading initial comment continuation page" |                             note_prefix = 'Downloading initial comment continuation page' | ||||||
|                         else: |                         else: | ||||||
|                             note_prefix = "    Downloading comment reply thread %d %s" % (comment_counts[2], comment_prog_str) |                             note_prefix = '    Downloading comment reply thread %d %s' % (comment_counts[2], comment_prog_str) | ||||||
|                     else: |                     else: | ||||||
|                         note_prefix = "%sDownloading comment%s page %d %s" % ( |                         note_prefix = '%sDownloading comment%s page %d %s' % ( | ||||||
|                             "       " if parent else "", |                             '       ' if parent else '', | ||||||
|                             ' replies' if parent else '', |                             ' replies' if parent else '', | ||||||
|                             page_num, |                             page_num, | ||||||
|                             comment_prog_str) |                             comment_prog_str) | ||||||
| @@ -1644,13 +1655,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | |||||||
|                 except ExtractorError as e: |                 except ExtractorError as e: | ||||||
|                     if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404, 413): |                     if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404, 413): | ||||||
|                         if e.cause.code == 413: |                         if e.cause.code == 413: | ||||||
|                             self.report_warning("Assumed end of comments (received HTTP Error 413)") |                             self.report_warning('Assumed end of comments (received HTTP Error 413)') | ||||||
|                             return |                             return | ||||||
|                         # Downloading page may result in intermittent 5xx HTTP error |                         # Downloading page may result in intermittent 5xx HTTP error | ||||||
|                         # Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289 |                         # Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289 | ||||||
|                         last_error = 'HTTP Error %s' % e.cause.code |                         last_error = 'HTTP Error %s' % e.cause.code | ||||||
|                         if e.cause.code == 404: |                         if e.cause.code == 404: | ||||||
|                             last_error = last_error + " (this API is probably deprecated)" |                             last_error = last_error + ' (this API is probably deprecated)' | ||||||
|                         if count < retries: |                         if count < retries: | ||||||
|                             continue |                             continue | ||||||
|                     raise |                     raise | ||||||
| @@ -1668,7 +1679,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | |||||||
|  |  | ||||||
|                     # YouTube sometimes gives reload: now json if something went wrong (e.g. bad auth) |                     # YouTube sometimes gives reload: now json if something went wrong (e.g. bad auth) | ||||||
|                     if browse.get('reload'): |                     if browse.get('reload'): | ||||||
|                         raise ExtractorError("Invalid or missing params in continuation request", expected=False) |                         raise ExtractorError('Invalid or missing params in continuation request', expected=False) | ||||||
|  |  | ||||||
|                     # TODO: not tested, merged from old extractor |                     # TODO: not tested, merged from old extractor | ||||||
|                     err_msg = browse.get('externalErrorMessage') |                     err_msg = browse.get('externalErrorMessage') | ||||||
| @@ -1708,7 +1719,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | |||||||
|  |  | ||||||
|                     if expected_comment_count: |                     if expected_comment_count: | ||||||
|                         comment_counts[1] = str_to_int(expected_comment_count) |                         comment_counts[1] = str_to_int(expected_comment_count) | ||||||
|                         self.to_screen("Downloading ~%d comments" % str_to_int(expected_comment_count)) |                         self.to_screen('Downloading ~%d comments' % str_to_int(expected_comment_count)) | ||||||
|                         yield comment_counts[1] |                         yield comment_counts[1] | ||||||
|  |  | ||||||
|                     # TODO: cli arg. |                     # TODO: cli arg. | ||||||
| @@ -1724,7 +1735,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | |||||||
|                         continuation = YoutubeTabIE._build_continuation_query( |                         continuation = YoutubeTabIE._build_continuation_query( | ||||||
|                             continuation=sort_continuation_renderer.get('continuation'), |                             continuation=sort_continuation_renderer.get('continuation'), | ||||||
|                             ctp=sort_continuation_renderer.get('clickTrackingParams')) |                             ctp=sort_continuation_renderer.get('clickTrackingParams')) | ||||||
|                         self.to_screen("Sorting comments by %s" % ('popular' if comment_sort_index == 0 else 'newest')) |                         self.to_screen('Sorting comments by %s' % ('popular' if comment_sort_index == 0 else 'newest')) | ||||||
|                         break |                         break | ||||||
|  |  | ||||||
|                 for entry in known_continuation_renderers[key](continuation_renderer): |                 for entry in known_continuation_renderers[key](continuation_renderer): | ||||||
| @@ -1757,7 +1768,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | |||||||
|                         continue |                         continue | ||||||
|                     comments.append(comment) |                     comments.append(comment) | ||||||
|                 break |                 break | ||||||
|         self.to_screen("Downloaded %d/%d comments" % (len(comments), estimated_total)) |         self.to_screen('Downloaded %d/%d comments' % (len(comments), estimated_total)) | ||||||
|         return { |         return { | ||||||
|             'comments': comments, |             'comments': comments, | ||||||
|             'comment_count': len(comments), |             'comment_count': len(comments), | ||||||
| @@ -2979,7 +2990,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): | |||||||
|                     self.report_warning('%s. Retrying ...' % last_error) |                     self.report_warning('%s. Retrying ...' % last_error) | ||||||
|                 try: |                 try: | ||||||
|                     response = self._call_api( |                     response = self._call_api( | ||||||
|                         ep="browse", fatal=True, headers=headers, |                         ep='browse', fatal=True, headers=headers, | ||||||
|                         video_id='%s page %s' % (item_id, page_num), |                         video_id='%s page %s' % (item_id, page_num), | ||||||
|                         query={ |                         query={ | ||||||
|                             'continuation': continuation['continuation'], |                             'continuation': continuation['continuation'], | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 coletdjnz
					coletdjnz