mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-30 22:25:19 +00:00 
			
		
		
		
	[youtube] Adds #1312 Download annotations
Adds #1321: download annotations from YouTube. Annotations are downloaded and written to a .annotations.xml file using the https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=$VIDEOID API. Added a unit test for annotations.
This commit is contained in:
		
							
								
								
									
										82
									
								
								test/test_write_annotations.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										82
									
								
								test/test_write_annotations.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,82 @@ | |||||||
|  | #!/usr/bin/env python | ||||||
|  | # coding: utf-8 | ||||||
|  |  | ||||||
|  | import xml.etree.ElementTree | ||||||
|  | import os | ||||||
|  | import sys | ||||||
|  | import unittest | ||||||
|  |  | ||||||
|  | # Allow direct execution | ||||||
|  | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | ||||||
|  |  | ||||||
|  | import youtube_dl.YoutubeDL | ||||||
|  | import youtube_dl.extractor | ||||||
|  | from youtube_dl.utils import * | ||||||
|  | from .helper import try_rm | ||||||
|  |  | ||||||
|  | PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json") | ||||||
|  |  | ||||||
|  | # General configuration (from __init__, not very elegant...) | ||||||
|  | jar = compat_cookiejar.CookieJar() | ||||||
|  | cookie_processor = compat_urllib_request.HTTPCookieProcessor(jar) | ||||||
|  | proxy_handler = compat_urllib_request.ProxyHandler() | ||||||
|  | opener = compat_urllib_request.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler()) | ||||||
|  | compat_urllib_request.install_opener(opener) | ||||||
|  |  | ||||||
|  | class YoutubeDL(youtube_dl.YoutubeDL): | ||||||
|  |     def __init__(self, *args, **kwargs): | ||||||
|  |         super(YoutubeDL, self).__init__(*args, **kwargs) | ||||||
|  |         self.to_stderr = self.to_screen | ||||||
|  |  | ||||||
|  | with io.open(PARAMETERS_FILE, encoding='utf-8') as pf: | ||||||
|  |     params = json.load(pf) | ||||||
|  | params['writeannotations'] = True | ||||||
|  | params['skip_download'] = True | ||||||
|  | params['writeinfojson'] = False | ||||||
|  | params['format'] = 'flv' | ||||||
|  |  | ||||||
|  | TEST_ID = 'gr51aVj-mLg' | ||||||
|  | ANNOTATIONS_FILE = TEST_ID + '.flv.annotations.xml' | ||||||
|  | EXPECTED_ANNOTATIONS = ['Speech bubble', 'Note', 'Title', 'Spotlight', 'Label'] | ||||||
|  |  | ||||||
|  | class TestAnnotations(unittest.TestCase): | ||||||
|  |     def setUp(self): | ||||||
|  |         # Clear old files | ||||||
|  |         self.tearDown() | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     def test_info_json(self): | ||||||
|  |         expected = list(EXPECTED_ANNOTATIONS) #Two annotations could have the same text. | ||||||
|  |         ie = youtube_dl.extractor.YoutubeIE() | ||||||
|  |         ydl = YoutubeDL(params) | ||||||
|  |         ydl.add_info_extractor(ie) | ||||||
|  |         ydl.download([TEST_ID]) | ||||||
|  |         self.assertTrue(os.path.exists(ANNOTATIONS_FILE)) | ||||||
|  |         annoxml = None | ||||||
|  |         with io.open(ANNOTATIONS_FILE, 'r', encoding='utf-8') as annof: | ||||||
|  |                 annoxml = xml.etree.ElementTree.parse(annof) | ||||||
|  |         self.assertTrue(annoxml is not None, 'Failed to parse annotations XML') | ||||||
|  |         root = annoxml.getroot() | ||||||
|  |         self.assertEqual(root.tag, 'document') | ||||||
|  |         annotationsTag = root.find('annotations') | ||||||
|  |         self.assertEqual(annotationsTag.tag, 'annotations') | ||||||
|  |         annotations = annotationsTag.findall('annotation') | ||||||
|  |  | ||||||
|  |         #Not all the annotations have TEXT children and the annotations are returned unsorted. | ||||||
|  |         for a in annotations: | ||||||
|  |                 self.assertEqual(a.tag, 'annotation') | ||||||
|  |                 if a.get('type') == 'text': | ||||||
|  |                         textTag = a.find('TEXT') | ||||||
|  |                         text = textTag.text | ||||||
|  |                         self.assertTrue(text in expected) #assertIn only added in python 2.7 | ||||||
|  |                         #remove the first occurrence, there could be more than one annotation with the same text | ||||||
|  |                         expected.remove(text) | ||||||
|  |         #We should have seen (and removed) all the expected annotation texts. | ||||||
|  |         self.assertEqual(len(expected), 0, 'Not all expected annotations were found.') | ||||||
|  |          | ||||||
|  |  | ||||||
|  |     def tearDown(self): | ||||||
|  |         try_rm(ANNOTATIONS_FILE) | ||||||
|  |  | ||||||
|  | if __name__ == '__main__': | ||||||
|  |     unittest.main() | ||||||
| @@ -71,6 +71,7 @@ class YoutubeDL(object): | |||||||
|     logtostderr:       Log messages to stderr instead of stdout. |     logtostderr:       Log messages to stderr instead of stdout. | ||||||
|     writedescription:  Write the video description to a .description file |     writedescription:  Write the video description to a .description file | ||||||
|     writeinfojson:     Write the video description to a .info.json file |     writeinfojson:     Write the video description to a .info.json file | ||||||
|  |     writeannotations:  Write the video annotations to a .annotations.xml file | ||||||
|     writethumbnail:    Write the thumbnail image to a file |     writethumbnail:    Write the thumbnail image to a file | ||||||
|     writesubtitles:    Write the video subtitles to a file |     writesubtitles:    Write the video subtitles to a file | ||||||
|     writeautomaticsub: Write the automatic subtitles to a file |     writeautomaticsub: Write the automatic subtitles to a file | ||||||
| @@ -258,6 +259,10 @@ class YoutubeDL(object): | |||||||
|         """ Report that the metadata file has been written """ |         """ Report that the metadata file has been written """ | ||||||
|         self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn) |         self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn) | ||||||
|  |  | ||||||
|  |     def report_writeannotations(self, annofn): | ||||||
|  |         """ Report that the annotations file has been written. """ | ||||||
|  |         self.to_screen(u'[info] Writing video annotations to: ' + annofn) | ||||||
|  |  | ||||||
|     def report_file_already_downloaded(self, file_name): |     def report_file_already_downloaded(self, file_name): | ||||||
|         """Report file has already been fully downloaded.""" |         """Report file has already been fully downloaded.""" | ||||||
|         try: |         try: | ||||||
| @@ -522,6 +527,18 @@ class YoutubeDL(object): | |||||||
|                 self.report_error(u'Cannot write description file ' + descfn) |                 self.report_error(u'Cannot write description file ' + descfn) | ||||||
|                 return |                 return | ||||||
|  |  | ||||||
|  |         if self.params.get('writeannotations', False): | ||||||
|  |             try: | ||||||
|  |                annofn = filename + u'.annotations.xml' | ||||||
|  |                self.report_writeannotations(annofn) | ||||||
|  |                with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile: | ||||||
|  |                    annofile.write(info_dict['annotations']) | ||||||
|  |             except (KeyError, TypeError): | ||||||
|  |                 self.report_warning(u'There are no annotations to write.') | ||||||
|  |             except (OSError, IOError): | ||||||
|  |                  self.report_error(u'Cannot write annotations file: ' + annofn) | ||||||
|  |                  return | ||||||
|  |  | ||||||
|         subtitles_are_requested = any([self.params.get('writesubtitles', False), |         subtitles_are_requested = any([self.params.get('writesubtitles', False), | ||||||
|                                        self.params.get('writeautomaticsub')]) |                                        self.params.get('writeautomaticsub')]) | ||||||
|  |  | ||||||
|   | |||||||
| @@ -339,6 +339,9 @@ def parseOpts(overrideArguments=None): | |||||||
|     filesystem.add_option('--write-info-json', |     filesystem.add_option('--write-info-json', | ||||||
|             action='store_true', dest='writeinfojson', |             action='store_true', dest='writeinfojson', | ||||||
|             help='write video metadata to a .info.json file', default=False) |             help='write video metadata to a .info.json file', default=False) | ||||||
|  |     filesystem.add_option('--write-annotations', | ||||||
|  |             action='store_true', dest='writeannotations', | ||||||
|  |             help='write video annotations to a .annotation file', default=False) | ||||||
|     filesystem.add_option('--write-thumbnail', |     filesystem.add_option('--write-thumbnail', | ||||||
|             action='store_true', dest='writethumbnail', |             action='store_true', dest='writethumbnail', | ||||||
|             help='write thumbnail image to disk', default=False) |             help='write thumbnail image to disk', default=False) | ||||||
| @@ -601,6 +604,7 @@ def _real_main(argv=None): | |||||||
|         'nopart': opts.nopart, |         'nopart': opts.nopart, | ||||||
|         'updatetime': opts.updatetime, |         'updatetime': opts.updatetime, | ||||||
|         'writedescription': opts.writedescription, |         'writedescription': opts.writedescription, | ||||||
|  |         'writeannotations': opts.writeannotations, | ||||||
|         'writeinfojson': opts.writeinfojson, |         'writeinfojson': opts.writeinfojson, | ||||||
|         'writethumbnail': opts.writethumbnail, |         'writethumbnail': opts.writethumbnail, | ||||||
|         'writesubtitles': opts.writesubtitles, |         'writesubtitles': opts.writesubtitles, | ||||||
|   | |||||||
| @@ -1250,6 +1250,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): | |||||||
|             url_map[itag] = format_url |             url_map[itag] = format_url | ||||||
|         return url_map |         return url_map | ||||||
|  |  | ||||||
|  |     def _extract_annotations(self, video_id): | ||||||
|  |         url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id | ||||||
|  |         return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.') | ||||||
|  |  | ||||||
|     def _real_extract(self, url): |     def _real_extract(self, url): | ||||||
|         # Extract original video URL from URL with redirection, like age verification, using next_url parameter |         # Extract original video URL from URL with redirection, like age verification, using next_url parameter | ||||||
|         mobj = re.search(self._NEXT_URL_RE, url) |         mobj = re.search(self._NEXT_URL_RE, url) | ||||||
| @@ -1382,6 +1386,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): | |||||||
|         else: |         else: | ||||||
|             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]) |             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]) | ||||||
|  |  | ||||||
|  |         # annotations | ||||||
|  |         video_annotations = None | ||||||
|  |         if self._downloader.params.get('writeannotations', False): | ||||||
|  |                 video_annotations = self._extract_annotations(video_id) | ||||||
|  |  | ||||||
|         # Decide which formats to download |         # Decide which formats to download | ||||||
|  |  | ||||||
|         try: |         try: | ||||||
| @@ -1495,6 +1504,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): | |||||||
|                 'subtitles':    video_subtitles, |                 'subtitles':    video_subtitles, | ||||||
|                 'duration':     video_duration, |                 'duration':     video_duration, | ||||||
|                 'age_limit':    18 if age_gate else 0, |                 'age_limit':    18 if age_gate else 0, | ||||||
|  |                 'annotations':  video_annotations | ||||||
|             }) |             }) | ||||||
|         return results |         return results | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Jai Grimshaw
					Jai Grimshaw