1
0
mirror of https://github.com/yt-dlp/yt-dlp.git synced 2025-11-13 13:05:13 +00:00

[ie/web.archive:youtube] Fix extractor (#14753)

Closes #14681, Closes #14741
Authored by: seproDev
This commit is contained in:
sepro
2025-10-25 12:11:00 +02:00
committed by GitHub
parent 70f1098312
commit d9e3011fd1
3 changed files with 160 additions and 250 deletions

View File

@@ -13,12 +13,10 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import contextlib import contextlib
import copy import copy
import itertools
import json import json
from test.helper import FakeYDL, assertRegexpMatches, try_rm from test.helper import FakeYDL, assertRegexpMatches, try_rm
from yt_dlp import YoutubeDL from yt_dlp import YoutubeDL
from yt_dlp.extractor import YoutubeIE
from yt_dlp.extractor.common import InfoExtractor from yt_dlp.extractor.common import InfoExtractor
from yt_dlp.postprocessor.common import PostProcessor from yt_dlp.postprocessor.common import PostProcessor
from yt_dlp.utils import ( from yt_dlp.utils import (
@@ -337,99 +335,6 @@ class TestFormatSelection(unittest.TestCase):
ydl = YDL({'format': '[format_id!*=-]'}) ydl = YDL({'format': '[format_id!*=-]'})
self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy())
def test_youtube_format_selection(self):
# FIXME: Rewrite in accordance with the new format sorting options
return
order = [
'38', '37', '46', '22', '45', '35', '44', '18', '34', '43', '6', '5', '17', '36', '13',
# Apple HTTP Live Streaming
'96', '95', '94', '93', '92', '132', '151',
# 3D
'85', '84', '102', '83', '101', '82', '100',
# Dash video
'137', '248', '136', '247', '135', '246',
'245', '244', '134', '243', '133', '242', '160',
# Dash audio
'141', '172', '140', '171', '139',
]
def format_info(f_id):
info = YoutubeIE._formats[f_id].copy()
# XXX: In real cases InfoExtractor._parse_mpd_formats() fills up 'acodec'
# and 'vcodec', while in tests such information is incomplete since
# commit a6c2c24479e5f4827ceb06f64d855329c0a6f593
# test_YoutubeDL.test_youtube_format_selection is broken without
# this fix
if 'acodec' in info and 'vcodec' not in info:
info['vcodec'] = 'none'
elif 'vcodec' in info and 'acodec' not in info:
info['acodec'] = 'none'
info['format_id'] = f_id
info['url'] = 'url:' + f_id
return info
formats_order = [format_info(f_id) for f_id in order]
info_dict = _make_result(list(formats_order), extractor='youtube')
ydl = YDL({'format': 'bestvideo+bestaudio'})
ydl.sort_formats(info_dict)
ydl.process_ie_result(info_dict)
downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['format_id'], '248+172')
self.assertEqual(downloaded['ext'], 'mp4')
info_dict = _make_result(list(formats_order), extractor='youtube')
ydl = YDL({'format': 'bestvideo[height>=999999]+bestaudio/best'})
ydl.sort_formats(info_dict)
ydl.process_ie_result(info_dict)
downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['format_id'], '38')
info_dict = _make_result(list(formats_order), extractor='youtube')
ydl = YDL({'format': 'bestvideo/best,bestaudio'})
ydl.sort_formats(info_dict)
ydl.process_ie_result(info_dict)
downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts]
self.assertEqual(downloaded_ids, ['137', '141'])
info_dict = _make_result(list(formats_order), extractor='youtube')
ydl = YDL({'format': '(bestvideo[ext=mp4],bestvideo[ext=webm])+bestaudio'})
ydl.sort_formats(info_dict)
ydl.process_ie_result(info_dict)
downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts]
self.assertEqual(downloaded_ids, ['137+141', '248+141'])
info_dict = _make_result(list(formats_order), extractor='youtube')
ydl = YDL({'format': '(bestvideo[ext=mp4],bestvideo[ext=webm])[height<=720]+bestaudio'})
ydl.sort_formats(info_dict)
ydl.process_ie_result(info_dict)
downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts]
self.assertEqual(downloaded_ids, ['136+141', '247+141'])
info_dict = _make_result(list(formats_order), extractor='youtube')
ydl = YDL({'format': '(bestvideo[ext=none]/bestvideo[ext=webm])+bestaudio'})
ydl.sort_formats(info_dict)
ydl.process_ie_result(info_dict)
downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts]
self.assertEqual(downloaded_ids, ['248+141'])
for f1, f2 in itertools.pairwise(formats_order):
info_dict = _make_result([f1, f2], extractor='youtube')
ydl = YDL({'format': 'best/bestvideo'})
ydl.sort_formats(info_dict)
ydl.process_ie_result(info_dict)
downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['format_id'], f1['format_id'])
info_dict = _make_result([f2, f1], extractor='youtube')
ydl = YDL({'format': 'best/bestvideo'})
ydl.sort_formats(info_dict)
ydl.process_ie_result(info_dict)
downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['format_id'], f1['format_id'])
def test_audio_only_extractor_format_selection(self): def test_audio_only_extractor_format_selection(self):
# For extractors with incomplete formats (all formats are audio-only or # For extractors with incomplete formats (all formats are audio-only or
# video-only) best and worst should fallback to corresponding best/worst # video-only) best and worst should fallback to corresponding best/worst

View File

@@ -5,12 +5,9 @@ import re
import urllib.parse import urllib.parse
from .common import InfoExtractor from .common import InfoExtractor
from .youtube import YoutubeBaseInfoExtractor, YoutubeIE from .youtube import YoutubeBaseInfoExtractor
from ..networking import HEADRequest
from ..networking.exceptions import HTTPError
from ..utils import ( from ..utils import (
KNOWN_EXTENSIONS, KNOWN_EXTENSIONS,
ExtractorError,
bug_reports_message, bug_reports_message,
clean_html, clean_html,
dict_get, dict_get,
@@ -21,18 +18,14 @@ from ..utils import (
join_nonempty, join_nonempty,
js_to_json, js_to_json,
merge_dicts, merge_dicts,
mimetype2ext,
orderedSet, orderedSet,
parse_duration, parse_duration,
parse_qs, parse_qs,
str_or_none, str_or_none,
str_to_int,
traverse_obj, traverse_obj,
try_get,
unified_strdate, unified_strdate,
unified_timestamp, unified_timestamp,
url_or_none, url_or_none,
urlhandle_detect_ext,
) )
@@ -471,7 +464,7 @@ class YoutubeWebArchiveIE(InfoExtractor):
'url': 'https://web.archive.org/web/20110712231407/http://www.youtube.com/watch?v=lTx3G6h2xyA', 'url': 'https://web.archive.org/web/20110712231407/http://www.youtube.com/watch?v=lTx3G6h2xyA',
'info_dict': { 'info_dict': {
'id': 'lTx3G6h2xyA', 'id': 'lTx3G6h2xyA',
'ext': 'flv', 'ext': 'mp4',
'title': 'Madeon - Pop Culture (live mashup)', 'title': 'Madeon - Pop Culture (live mashup)',
'upload_date': '20110711', 'upload_date': '20110711',
'uploader': 'Madeon', 'uploader': 'Madeon',
@@ -578,7 +571,7 @@ class YoutubeWebArchiveIE(InfoExtractor):
'url': 'https://web.archive.org/web/20110126141719/http://www.youtube.com/watch?v=Q_yjX80U7Yc', 'url': 'https://web.archive.org/web/20110126141719/http://www.youtube.com/watch?v=Q_yjX80U7Yc',
'info_dict': { 'info_dict': {
'id': 'Q_yjX80U7Yc', 'id': 'Q_yjX80U7Yc',
'ext': 'flv', 'ext': 'webm',
'title': 'Spray Paint Art by Clay Butler: Purple Fantasy Forest', 'title': 'Spray Paint Art by Clay Butler: Purple Fantasy Forest',
'uploader_id': 'claybutlermusic', 'uploader_id': 'claybutlermusic',
'description': 'md5:4595264559e3d0a0ceb3f011f6334543', 'description': 'md5:4595264559e3d0a0ceb3f011f6334543',
@@ -680,6 +673,37 @@ class YoutubeWebArchiveIE(InfoExtractor):
'upload_date': '20120407', 'upload_date': '20120407',
'uploader_id': 'thecomputernerd01', 'uploader_id': 'thecomputernerd01',
}, },
}, {
# Contains split audio/video formats
'url': 'ytarchive:o_T_S_TU12M',
'info_dict': {
'id': 'o_T_S_TU12M',
'ext': 'mp4',
'title': 'Prairie Pulse 1218; Lin Enger, Paul Olson',
'description': 'md5:36e7a34cdc8508e35a920ec042e799c7',
'uploader': 'Prairie Public',
'channel_id': 'UC4BOzQel6tvJm7OEDd3vZlw',
'channel_url': 'https://www.youtube.com/channel/UC4BOzQel6tvJm7OEDd3vZlw',
'duration': 1606,
'upload_date': '20150213',
},
}, {
# Video unavailable through wayback-fakeurl
'url': 'ytarchive:SQCom7wjGDs',
'info_dict': {
'id': 'SQCom7wjGDs',
'ext': 'mp4',
'title': 'Jamin Warren from PBS Game/Show decides that Portal is a feminist Game [Top Hats and No Brain]',
'description': 'md5:c0cb876dd075483ead9afcc86798efb0',
'uploader': 'Top Hats and Champagne',
'uploader_id': 'sparrowtm',
'uploader_url': 'https://www.youtube.com/user/sparrowtm',
'channel_id': 'UCW3T5nG4iEkI7HjG-Du3HQA',
'channel_url': 'https://www.youtube.com/channel/UCW3T5nG4iEkI7HjG-Du3HQA',
'duration': 1500,
'thumbnail': 'https://web.archive.org/web/20160108040020if_/https://i.ytimg.com/vi/SQCom7wjGDs/maxresdefault.jpg',
'upload_date': '20160107',
},
}, { }, {
'url': 'https://web.archive.org/web/http://www.youtube.com/watch?v=kH-G_aIBlFw', 'url': 'https://web.archive.org/web/http://www.youtube.com/watch?v=kH-G_aIBlFw',
'only_matching': True, 'only_matching': True,
@@ -724,6 +748,113 @@ class YoutubeWebArchiveIE(InfoExtractor):
_OLDEST_CAPTURE_DATE = 20050214000000 _OLDEST_CAPTURE_DATE = 20050214000000
_NEWEST_CAPTURE_DATE = 20500101000000 _NEWEST_CAPTURE_DATE = 20500101000000
_FORMATS = {
'5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'vcodec': 'h263'},
'6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'vcodec': 'h263'},
'13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
'17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'vcodec': 'mp4v'},
'18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'vcodec': 'h264'},
'22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'vcodec': 'h264'},
'34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'vcodec': 'h264'},
'35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'vcodec': 'h264'},
# itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
'36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
'37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'vcodec': 'h264'},
'38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'vcodec': 'h264'},
'43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'vcodec': 'vp8'},
'44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'vcodec': 'vp8'},
'45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'vcodec': 'vp8'},
'46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'vcodec': 'vp8'},
'59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'vcodec': 'h264'},
'78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'vcodec': 'h264'},
# 3D videos
'82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'vcodec': 'h264', 'preference': -20},
'83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'vcodec': 'h264', 'preference': -20},
'84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'vcodec': 'h264', 'preference': -20},
'85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'vcodec': 'h264', 'preference': -20},
'100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'vcodec': 'vp8', 'preference': -20},
'101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'vcodec': 'vp8', 'preference': -20},
'102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'vcodec': 'vp8', 'preference': -20},
# Apple HTTP Live Streaming
'91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'vcodec': 'h264'},
'92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'vcodec': 'h264'},
'93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'vcodec': 'h264'},
'94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'vcodec': 'h264'},
'95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'vcodec': 'h264'},
'96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'vcodec': 'h264'},
'132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'vcodec': 'h264'},
'151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'vcodec': 'h264'},
# DASH mp4 video
'133': {'ext': 'mp4', 'height': 240, 'vcodec': 'h264', 'acodec': 'none'},
'134': {'ext': 'mp4', 'height': 360, 'vcodec': 'h264', 'acodec': 'none'},
'135': {'ext': 'mp4', 'height': 480, 'vcodec': 'h264', 'acodec': 'none'},
'136': {'ext': 'mp4', 'height': 720, 'vcodec': 'h264', 'acodec': 'none'},
'137': {'ext': 'mp4', 'height': 1080, 'vcodec': 'h264', 'acodec': 'none'},
'138': {'ext': 'mp4', 'vcodec': 'h264', 'acodec': 'none'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
'160': {'ext': 'mp4', 'height': 144, 'vcodec': 'h264', 'acodec': 'none'},
'212': {'ext': 'mp4', 'height': 480, 'vcodec': 'h264', 'acodec': 'none'},
'264': {'ext': 'mp4', 'height': 1440, 'vcodec': 'h264', 'acodec': 'none'},
'298': {'ext': 'mp4', 'height': 720, 'vcodec': 'h264', 'fps': 60, 'acodec': 'none'},
'299': {'ext': 'mp4', 'height': 1080, 'vcodec': 'h264', 'fps': 60, 'acodec': 'none'},
'266': {'ext': 'mp4', 'height': 2160, 'vcodec': 'h264', 'acodec': 'none'},
# Dash mp4 audio
'139': {'ext': 'm4a', 'acodec': 'aac', 'vcodec': 'none'},
'140': {'ext': 'm4a', 'acodec': 'aac', 'vcodec': 'none'},
'141': {'ext': 'm4a', 'acodec': 'aac', 'vcodec': 'none'},
'256': {'ext': 'm4a', 'acodec': 'aac', 'vcodec': 'none'},
'258': {'ext': 'm4a', 'acodec': 'aac', 'vcodec': 'none'},
'325': {'ext': 'm4a', 'acodec': 'dtse', 'vcodec': 'none'},
'328': {'ext': 'm4a', 'acodec': 'ec-3', 'vcodec': 'none'},
# Dash webm
'167': {'ext': 'webm', 'height': 360, 'width': 640, 'vcodec': 'vp8'},
'168': {'ext': 'webm', 'height': 480, 'width': 854, 'vcodec': 'vp8'},
'169': {'ext': 'webm', 'height': 720, 'width': 1280, 'vcodec': 'vp8'},
'170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'vcodec': 'vp8'},
'218': {'ext': 'webm', 'height': 480, 'width': 854, 'vcodec': 'vp8'},
'219': {'ext': 'webm', 'height': 480, 'width': 854, 'vcodec': 'vp8'},
'278': {'ext': 'webm', 'height': 144, 'vcodec': 'vp9', 'acodec': 'none'},
'242': {'ext': 'webm', 'height': 240, 'vcodec': 'vp9', 'acodec': 'none'},
'243': {'ext': 'webm', 'height': 360, 'vcodec': 'vp9', 'acodec': 'none'},
'244': {'ext': 'webm', 'height': 480, 'vcodec': 'vp9', 'acodec': 'none'},
'245': {'ext': 'webm', 'height': 480, 'vcodec': 'vp9', 'acodec': 'none'},
'246': {'ext': 'webm', 'height': 480, 'vcodec': 'vp9', 'acodec': 'none'},
'247': {'ext': 'webm', 'height': 720, 'vcodec': 'vp9', 'acodec': 'none'},
'248': {'ext': 'webm', 'height': 1080, 'vcodec': 'vp9', 'acodec': 'none'},
'271': {'ext': 'webm', 'height': 1440, 'vcodec': 'vp9', 'acodec': 'none'},
# itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
'272': {'ext': 'webm', 'height': 2160, 'vcodec': 'vp9', 'acodec': 'none'},
'302': {'ext': 'webm', 'height': 720, 'vcodec': 'vp9', 'fps': 60, 'acodec': 'none'},
'303': {'ext': 'webm', 'height': 1080, 'vcodec': 'vp9', 'fps': 60, 'acodec': 'none'},
'308': {'ext': 'webm', 'height': 1440, 'vcodec': 'vp9', 'fps': 60, 'acodec': 'none'},
'313': {'ext': 'webm', 'height': 2160, 'vcodec': 'vp9', 'acodec': 'none'},
'315': {'ext': 'webm', 'height': 2160, 'vcodec': 'vp9', 'fps': 60, 'acodec': 'none'},
# Dash webm audio
'171': {'ext': 'webm', 'acodec': 'vorbis', 'vcodec': 'none'},
'172': {'ext': 'webm', 'acodec': 'vorbis', 'vcodec': 'none'},
# Dash webm audio with opus inside
'249': {'ext': 'webm', 'acodec': 'opus', 'vcodec': 'none'},
'250': {'ext': 'webm', 'acodec': 'opus', 'vcodec': 'none'},
'251': {'ext': 'webm', 'acodec': 'opus', 'vcodec': 'none'},
# av01 video only formats sometimes served with "unknown" codecs
'394': {'ext': 'mp4', 'height': 144, 'vcodec': 'av01.0.00M.08', 'acodec': 'none'},
'395': {'ext': 'mp4', 'height': 240, 'vcodec': 'av01.0.00M.08', 'acodec': 'none'},
'396': {'ext': 'mp4', 'height': 360, 'vcodec': 'av01.0.01M.08', 'acodec': 'none'},
'397': {'ext': 'mp4', 'height': 480, 'vcodec': 'av01.0.04M.08', 'acodec': 'none'},
'398': {'ext': 'mp4', 'height': 720, 'vcodec': 'av01.0.05M.08', 'acodec': 'none'},
'399': {'ext': 'mp4', 'height': 1080, 'vcodec': 'av01.0.08M.08', 'acodec': 'none'},
'400': {'ext': 'mp4', 'height': 1440, 'vcodec': 'av01.0.12M.08', 'acodec': 'none'},
'401': {'ext': 'mp4', 'height': 2160, 'vcodec': 'av01.0.12M.08', 'acodec': 'none'},
}
def _call_cdx_api(self, item_id, url, filters: list | None = None, collapse: list | None = None, query: dict | None = None, note=None, fatal=False): def _call_cdx_api(self, item_id, url, filters: list | None = None, collapse: list | None = None, query: dict | None = None, note=None, fatal=False):
# CDX docs: https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md # CDX docs: https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md
query = { query = {
@@ -933,23 +1064,13 @@ class YoutubeWebArchiveIE(InfoExtractor):
video_id, url_date, url_date_2 = self._match_valid_url(url).group('id', 'date', 'date2') video_id, url_date, url_date_2 = self._match_valid_url(url).group('id', 'date', 'date2')
url_date = url_date or url_date_2 url_date = url_date or url_date_2
urlh = None video_info = self._download_json(
retry_manager = self.RetryManager(fatal=False) 'https://web.archive.org/__wb/videoinfo', video_id,
for retry in retry_manager: query={'vtype': 'youtube', 'vid': video_id})
try:
urlh = self._request_webpage(
HEADRequest(f'https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/{video_id}'),
video_id, note='Fetching archived video file url', expected_status=True)
except ExtractorError as e:
# HTTP Error 404 is expected if the video is not saved.
if isinstance(e.cause, HTTPError) and e.cause.status == 404:
self.raise_no_formats(
'The requested video is not archived, indexed, or there is an issue with web.archive.org (try again later)', expected=True)
else:
retry.error = e
if retry_manager.error: if not traverse_obj(video_info, 'formats'):
self.raise_no_formats(retry_manager.error, expected=True, video_id=video_id) self.raise_no_formats(
'The requested video is not archived or indexed', expected=True)
capture_dates = self._get_capture_dates(video_id, int_or_none(url_date)) capture_dates = self._get_capture_dates(video_id, int_or_none(url_date))
self.write_debug('Captures to try: ' + join_nonempty(*capture_dates, delim=', ')) self.write_debug('Captures to try: ' + join_nonempty(*capture_dates, delim=', '))
@@ -968,25 +1089,18 @@ class YoutubeWebArchiveIE(InfoExtractor):
info['thumbnails'] = self._extract_thumbnails(video_id) info['thumbnails'] = self._extract_thumbnails(video_id)
if urlh: formats = []
url = urllib.parse.unquote(urlh.url) for fmt in traverse_obj(video_info, ('formats', lambda _, v: url_or_none(v['url']))):
video_file_url_qs = parse_qs(url) format_id = traverse_obj(fmt, ('url', {parse_qs}, 'itag', 0))
# Attempt to recover any ext & format info from playback url & response headers formats.append({
fmt = {'url': url, 'filesize': int_or_none(urlh.headers.get('x-archive-orig-content-length'))} 'format_id': format_id,
itag = try_get(video_file_url_qs, lambda x: x['itag'][0]) **self._FORMATS.get(format_id, {}),
if itag and itag in YoutubeIE._formats: **traverse_obj(fmt, {
fmt.update(YoutubeIE._formats[itag]) 'url': ('url', {lambda x: f'https://web.archive.org/web/2id_/{x}'}),
fmt.update({'format_id': itag}) 'ext': ('ext', {str}),
else: 'filesize': ('url', {parse_qs}, 'clen', 0, {int_or_none}),
mime = try_get(video_file_url_qs, lambda x: x['mime'][0]) }),
ext = (mimetype2ext(mime) })
or urlhandle_detect_ext(urlh) info['formats'] = formats
or mimetype2ext(urlh.headers.get('x-archive-guessed-content-type')))
fmt.update({'ext': ext})
info['formats'] = [fmt]
if not info.get('duration'):
info['duration'] = str_to_int(try_get(video_file_url_qs, lambda x: x['dur'][0]))
if not info.get('title'):
info['title'] = video_id
return info return info

View File

@@ -147,115 +147,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$', r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$',
r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$', r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$',
) )
_formats = { # NB: Used in YoutubeWebArchiveIE and GoogleDriveIE
'5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
'6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
'13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
'17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
'18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
'22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
'34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
'35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
# itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
'36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
'37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
'38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
'43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
'44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
'45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
'46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
'59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
'78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
# 3D videos
'82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
'83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
'84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
'85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
'100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
'101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
'102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
# Apple HTTP Live Streaming
'91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
'92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
'93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
'94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
'95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
'96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
'132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
'151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
# DASH mp4 video
'133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
'134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
'135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
'136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
'137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
'138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
'160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
'212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
'264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
'298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
'299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
'266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
# Dash mp4 audio
'139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
'140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
'141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
'256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
'258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
'325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
'328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
# Dash webm
'167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
'168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
'169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
'170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
'218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
'219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
'278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
'242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
'243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
'244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
'245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
'246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
'247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
'248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
'271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
# itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
'272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
'302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
'303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
'308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
'313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
'315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
# Dash webm audio
'171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
'172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
# Dash webm audio with opus inside
'249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
'250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
'251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
# RTMP (unnamed)
'_rtmp': {'protocol': 'rtmp'},
# av01 video only formats sometimes served with "unknown" codecs
'394': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'av01.0.00M.08'},
'395': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'av01.0.00M.08'},
'396': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'av01.0.01M.08'},
'397': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'av01.0.04M.08'},
'398': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'av01.0.05M.08'},
'399': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'av01.0.08M.08'},
'400': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'av01.0.12M.08'},
'401': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'av01.0.12M.08'},
}
_SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'srt', 'vtt') _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'srt', 'vtt')
_DEFAULT_CLIENTS = ('android_sdkless', 'tv', 'web_safari', 'web') _DEFAULT_CLIENTS = ('android_sdkless', 'tv', 'web_safari', 'web')
_DEFAULT_AUTHED_CLIENTS = ('tv', 'web_safari', 'web') _DEFAULT_AUTHED_CLIENTS = ('tv', 'web_safari', 'web')