mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-30 22:25:19 +00:00 
			
		
		
		
	[youtube] Adds #1312 Download annotations
Adds #1321: download annotations from YouTube. Annotations are downloaded and written to a .annotations.xml file using the https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=$VIDEOID API. Added a unit test for annotations.
This commit is contained in:
		
							
								
								
									
										82
									
								
								test/test_write_annotations.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										82
									
								
								test/test_write_annotations.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,82 @@ | ||||
| #!/usr/bin/env python | ||||
| # coding: utf-8 | ||||
|  | ||||
| import xml.etree.ElementTree | ||||
| import os | ||||
| import sys | ||||
| import unittest | ||||
|  | ||||
| # Allow direct execution | ||||
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | ||||
|  | ||||
| import youtube_dl.YoutubeDL | ||||
| import youtube_dl.extractor | ||||
| from youtube_dl.utils import * | ||||
| from .helper import try_rm | ||||
|  | ||||
# Path of the parameters.json file that sits in the same directory as this test.
PARAMETERS_FILE = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "parameters.json",
)

# General configuration (from __init__, not very elegant...)
# Build an opener from a proxy handler, a cookie processor backed by `jar`
# and YoutubeDLHandler, then install it through compat_urllib_request.
proxy_handler = compat_urllib_request.ProxyHandler()
jar = compat_cookiejar.CookieJar()
cookie_processor = compat_urllib_request.HTTPCookieProcessor(jar)
opener = compat_urllib_request.build_opener(
    proxy_handler,
    cookie_processor,
    YoutubeDLHandler()
)
compat_urllib_request.install_opener(opener)
|  | ||||
class YoutubeDL(youtube_dl.YoutubeDL):
    """Test variant of YoutubeDL whose ``to_stderr`` is its ``to_screen``."""

    def __init__(self, *args, **kwargs):
        """Initialise the parent class, then alias ``to_stderr`` to ``to_screen``."""
        parent = super(YoutubeDL, self)
        parent.__init__(*args, **kwargs)
        # Anything sent to to_stderr now goes through to_screen instead.
        self.to_stderr = self.to_screen
|  | ||||
# Load the shared test parameters, then override the options this test needs:
# write annotations, skip the media download, no info JSON, 'flv' format.
with io.open(PARAMETERS_FILE, encoding='utf-8') as pf:
    params = json.load(pf)
params.update({
    'writeannotations': True,
    'skip_download': True,
    'writeinfojson': False,
    'format': 'flv',
})

# Video used by the test, the annotations file expected for it, and the
# annotation texts that file must contain.
TEST_ID = 'gr51aVj-mLg'
ANNOTATIONS_FILE = '%s.flv.annotations.xml' % TEST_ID
EXPECTED_ANNOTATIONS = ['Speech bubble', 'Note', 'Title', 'Spotlight', 'Label']
|  | ||||
class TestAnnotations(unittest.TestCase):
    """Check that the ``writeannotations`` option writes the annotations XML file."""

    def setUp(self):
        # Clear old files
        self.tearDown()

    def test_info_json(self):
        """Download TEST_ID with annotations enabled and verify the written XML.

        The file must parse, have a ``document`` root with an ``annotations``
        child, and its text annotations must cover EXPECTED_ANNOTATIONS.

        NOTE(review): the method name looks carried over from the info-json
        test; it is kept so that selecting the test by name keeps working.
        """
        # Work on a copy: two annotations could have the same text, so matches
        # are removed one by one.
        expected = list(EXPECTED_ANNOTATIONS)
        ie = youtube_dl.extractor.YoutubeIE()
        ydl = YoutubeDL(params)
        ydl.add_info_extractor(ie)
        ydl.download([TEST_ID])
        self.assertTrue(os.path.exists(ANNOTATIONS_FILE))
        annoxml = None
        with io.open(ANNOTATIONS_FILE, 'r', encoding='utf-8') as annof:
            annoxml = xml.etree.ElementTree.parse(annof)
        self.assertTrue(annoxml is not None, 'Failed to parse annotations XML')
        root = annoxml.getroot()
        self.assertEqual(root.tag, 'document')
        annotationsTag = root.find('annotations')
        # find() returns None when the child is missing; fail with a message
        # instead of crashing on the attribute access below.
        self.assertTrue(annotationsTag is not None,
                        'No <annotations> element in annotations XML')
        annotations = annotationsTag.findall('annotation')

        # Not all the annotations have TEXT children and the annotations are
        # returned unsorted.
        for a in annotations:
            self.assertEqual(a.tag, 'annotation')
            if a.get('type') != 'text':
                continue
            textTag = a.find('TEXT')
            self.assertTrue(textTag is not None,
                            'Text annotation without a TEXT child')
            text = textTag.text
            self.assertTrue(text in expected)  # assertIn only added in python 2.7
            # Remove the first occurrence; there could be more than one
            # annotation with the same text.
            expected.remove(text)
        # We should have seen (and removed) all the expected annotation texts.
        self.assertEqual(len(expected), 0, 'Not all expected annotations were found.')

    def tearDown(self):
        try_rm(ANNOTATIONS_FILE)


if __name__ == '__main__':
    unittest.main()
| @@ -71,6 +71,7 @@ class YoutubeDL(object): | ||||
|     logtostderr:       Log messages to stderr instead of stdout. | ||||
|     writedescription:  Write the video description to a .description file | ||||
|     writeinfojson:     Write the video description to a .info.json file | ||||
|     writeannotations:  Write the video annotations to a .annotations.xml file | ||||
|     writethumbnail:    Write the thumbnail image to a file | ||||
|     writesubtitles:    Write the video subtitles to a file | ||||
|     writeautomaticsub: Write the automatic subtitles to a file | ||||
| @@ -258,6 +259,10 @@ class YoutubeDL(object): | ||||
|         """ Report that the metadata file has been written """ | ||||
|         self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn) | ||||
|  | ||||
|     def report_writeannotations(self, annofn): | ||||
|         """ Report that the annotations file has been written. """ | ||||
|         self.to_screen(u'[info] Writing video annotations to: ' + annofn) | ||||
|  | ||||
|     def report_file_already_downloaded(self, file_name): | ||||
|         """Report file has already been fully downloaded.""" | ||||
|         try: | ||||
| @@ -522,6 +527,18 @@ class YoutubeDL(object): | ||||
|                 self.report_error(u'Cannot write description file ' + descfn) | ||||
|                 return | ||||
|  | ||||
|         if self.params.get('writeannotations', False): | ||||
|             try: | ||||
|                annofn = filename + u'.annotations.xml' | ||||
|                self.report_writeannotations(annofn) | ||||
|                with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile: | ||||
|                    annofile.write(info_dict['annotations']) | ||||
|             except (KeyError, TypeError): | ||||
|                 self.report_warning(u'There are no annotations to write.') | ||||
|             except (OSError, IOError): | ||||
|                  self.report_error(u'Cannot write annotations file: ' + annofn) | ||||
|                  return | ||||
|  | ||||
|         subtitles_are_requested = any([self.params.get('writesubtitles', False), | ||||
|                                        self.params.get('writeautomaticsub')]) | ||||
|  | ||||
|   | ||||
| @@ -339,6 +339,9 @@ def parseOpts(overrideArguments=None): | ||||
|     filesystem.add_option('--write-info-json', | ||||
|             action='store_true', dest='writeinfojson', | ||||
|             help='write video metadata to a .info.json file', default=False) | ||||
|     filesystem.add_option('--write-annotations', | ||||
|             action='store_true', dest='writeannotations', | ||||
|             help='write video annotations to a .annotation file', default=False) | ||||
|     filesystem.add_option('--write-thumbnail', | ||||
|             action='store_true', dest='writethumbnail', | ||||
|             help='write thumbnail image to disk', default=False) | ||||
| @@ -601,6 +604,7 @@ def _real_main(argv=None): | ||||
|         'nopart': opts.nopart, | ||||
|         'updatetime': opts.updatetime, | ||||
|         'writedescription': opts.writedescription, | ||||
|         'writeannotations': opts.writeannotations, | ||||
|         'writeinfojson': opts.writeinfojson, | ||||
|         'writethumbnail': opts.writethumbnail, | ||||
|         'writesubtitles': opts.writesubtitles, | ||||
|   | ||||
| @@ -1250,6 +1250,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): | ||||
|             url_map[itag] = format_url | ||||
|         return url_map | ||||
|  | ||||
|     def _extract_annotations(self, video_id): | ||||
|         url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id | ||||
|         return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.') | ||||
|  | ||||
|     def _real_extract(self, url): | ||||
|         # Extract original video URL from URL with redirection, like age verification, using next_url parameter | ||||
|         mobj = re.search(self._NEXT_URL_RE, url) | ||||
| @@ -1382,6 +1386,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): | ||||
|         else: | ||||
|             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]) | ||||
|  | ||||
|         # annotations | ||||
|         video_annotations = None | ||||
|         if self._downloader.params.get('writeannotations', False): | ||||
|                 video_annotations = self._extract_annotations(video_id) | ||||
|  | ||||
|         # Decide which formats to download | ||||
|  | ||||
|         try: | ||||
| @@ -1495,6 +1504,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): | ||||
|                 'subtitles':    video_subtitles, | ||||
|                 'duration':     video_duration, | ||||
|                 'age_limit':    18 if age_gate else 0, | ||||
|                 'annotations':  video_annotations | ||||
|             }) | ||||
|         return results | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Jai Grimshaw
					Jai Grimshaw