1
0
mirror of https://github.com/yt-dlp/yt-dlp.git synced 2026-03-01 03:40:10 +00:00

Better support HLS media discontinuity and fully support media initialization (#105)

* Added options: `--hls-split-discontinuity` and `--no-hls-split-discontinuity`

Authored-by: shirtjs <2660574+shirtjs@users.noreply.github.com>
This commit is contained in:
shirt-dev
2021-02-24 09:47:53 -05:00
committed by GitHub
parent c8d83a22ef
commit 310c2ed2c6
13 changed files with 471 additions and 517 deletions

View File

@@ -364,7 +364,7 @@ class YoutubeDL(object):
nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
noresizebuffer, retries, continuedl, noprogress, consoletitle,
xattr_set_filesize, external_downloader_args, hls_use_mpegts,
http_chunk_size.
hls_split_discontinuity, http_chunk_size.
The following options are used by the post processors:
prefer_ffmpeg: If False, use avconv instead of ffmpeg if both are available,

View File

@@ -536,6 +536,7 @@ def _real_main(argv=None):
'ffmpeg_location': opts.ffmpeg_location,
'hls_prefer_native': opts.hls_prefer_native,
'hls_use_mpegts': opts.hls_use_mpegts,
'hls_split_discontinuity': opts.hls_split_discontinuity,
'external_downloader_args': opts.external_downloader_args,
'postprocessor_args': opts.postprocessor_args,
'cn_verification_proxy': opts.cn_verification_proxy,

View File

@@ -43,7 +43,6 @@ class HlsFD(FragmentFD):
# r'#EXT-X-PLAYLIST-TYPE:EVENT', # media segments may be appended to the end of
# # event media playlists [4]
# r'#EXT-X-MAP:', # media initialization [5]
r'^\s*(?:[^#\s]|#EXT-X-MAP:).+?\n\s*#EXT-X-MAP:', # media initialization [5]
# 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4
# 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2
# 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2
@@ -129,6 +128,7 @@ class HlsFD(FragmentFD):
skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)
test = self.params.get('test', False)
format_index = info_dict.get('format_index')
extra_query = None
extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url')
if extra_param_to_segment_url:
@@ -138,6 +138,7 @@ class HlsFD(FragmentFD):
decrypt_info = {'METHOD': 'NONE'}
key_list = []
byte_range = {}
discontinuity_count = 0
frag_index = 0
ad_frag_next = False
for line in s.splitlines():
@@ -145,6 +146,8 @@ class HlsFD(FragmentFD):
download_frag = False
if line:
if not line.startswith('#'):
if format_index and discontinuity_count != format_index:
continue
if ad_frag_next:
continue
frag_index += 1
@@ -163,6 +166,8 @@ class HlsFD(FragmentFD):
download_frag = True
elif line.startswith('#EXT-X-MAP'):
if format_index and discontinuity_count != format_index:
continue
if frag_index > 0:
self.report_error(
'initialization fragment found after media fragments, unable to download')
@@ -218,6 +223,8 @@ class HlsFD(FragmentFD):
ad_frag_next = True
elif is_ad_fragment_end(line):
ad_frag_next = False
elif line.startswith('#EXT-X-DISCONTINUITY'):
discontinuity_count += 1
if download_frag:
count = 0

View File

@@ -1833,9 +1833,8 @@ class InfoExtractor(object):
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
entry_protocol='m3u8', preference=None, quality=None,
m3u8_id=None, note=None, errnote=None,
fatal=True, live=False, data=None, headers={},
query={}):
m3u8_id=None, live=False, note=None, errnote=None,
fatal=True, data=None, headers={}, query={}):
res = self._download_webpage_handle(
m3u8_url, video_id,
note=note or 'Downloading m3u8 information',
@@ -1850,11 +1849,14 @@ class InfoExtractor(object):
return self._parse_m3u8_formats(
m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
preference=preference, quality=quality, m3u8_id=m3u8_id, live=live)
preference=preference, quality=quality, m3u8_id=m3u8_id,
note=note, errnote=errnote, fatal=fatal, live=live, data=data,
headers=headers, query=query, video_id=video_id)
def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
entry_protocol='m3u8', preference=None, quality=None,
m3u8_id=None, live=False):
m3u8_id=None, live=False, note=None, errnote=None,
fatal=True, data=None, headers={}, query={}, video_id=None):
if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
return []
@@ -1868,6 +1870,8 @@ class InfoExtractor(object):
if re.match(r'^https?://', u)
else compat_urlparse.urljoin(m3u8_url, u))
split_discontinuity = self._downloader.params.get('hls_split_discontinuity', False)
# References:
# 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
# 2. https://github.com/ytdl-org/youtube-dl/issues/12211
@@ -1884,15 +1888,67 @@ class InfoExtractor(object):
# media playlist and MUST NOT appear in master playlist thus we can
# clearly detect media playlist with this criterion.
def _extract_m3u8_playlist_formats(format_url, m3u8_doc=None):
    """Split one media playlist into per-discontinuity segments.

    If m3u8_doc is empty or not given, the playlist is downloaded from
    format_url first (using the enclosing scope's video_id, errnote,
    fatal, data, headers and query); a failed non-fatal download yields
    an empty list.

    Returns a list of dicts, one per segment, each with the keys:
        index:    0-based segment number when the enclosing
                  split_discontinuity flag is set, otherwise None
                  (and the list then holds a single entry)
        url:      URL of the playlist the segment belongs to
        key_data: always None here (placeholder kept from the first
                  segment of the previous implementation)
        files:    non-blank, non-tag lines that belong to the segment
    """
    if not m3u8_doc:
        res = self._download_webpage_handle(
            format_url, video_id,
            note=False,
            errnote=errnote or 'Failed to download m3u8 playlist information',
            fatal=fatal, data=data, headers=headers, query=query)
        if res is False:
            return []
        m3u8_doc, urlh = res
        # Use the post-redirect URL so every segment reports the same,
        # final playlist location
        format_url = urlh.geturl()

    def new_segment(index):
        # Every segment shares one schema; previously the first segment
        # lacked 'url' and the following ones lacked 'key_data'
        return {
            'index': index,
            'url': format_url,
            'key_data': None,
            'files': [],
        }

    index = 0 if split_discontinuity else None
    segment = new_segment(index)
    playlist_formats = [segment]

    for line in m3u8_doc.splitlines():
        # Blank lines are neither tags nor media URIs; do not record them
        if not line.strip():
            continue
        if not line.startswith('#'):
            segment['files'].append(line)
        # NOTE(review): this prefix test also matches
        # #EXT-X-DISCONTINUITY-SEQUENCE. It is left unchanged on purpose:
        # the HLS downloader counts discontinuities with the same prefix
        # test and both counters must agree for format_index to work.
        elif split_discontinuity and line.startswith('#EXT-X-DISCONTINUITY'):
            index += 1
            segment = new_segment(index)
            playlist_formats.append(segment)

    return playlist_formats
if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
return [{
'url': m3u8_url,
'format_id': m3u8_id,
'ext': ext,
'protocol': entry_protocol,
'preference': preference,
'quality': quality,
}]
playlist_formats = _extract_m3u8_playlist_formats(m3u8_doc, True)
for format in playlist_formats:
format_id = []
if m3u8_id:
format_id.append(m3u8_id)
format_index = format.get('index')
if format_index:
format_id.append(str(format_index))
f = {
'format_id': '-'.join(format_id),
'format_index': format_index,
'url': m3u8_url,
'ext': ext,
'protocol': entry_protocol,
'preference': preference,
'quality': quality,
}
formats.append(f)
return formats
groups = {}
last_stream_inf = {}
@@ -1908,23 +1964,31 @@ class InfoExtractor(object):
return
media_url = media.get('URI')
if media_url:
manifest_url = format_url(media_url)
format_id = []
for v in (m3u8_id, group_id, name):
if v:
format_id.append(v)
f = {
'format_id': '-'.join(format_id),
'url': format_url(media_url),
'manifest_url': m3u8_url,
'language': media.get('LANGUAGE'),
'ext': ext,
'protocol': entry_protocol,
'preference': preference,
'quality': quality,
}
if media_type == 'AUDIO':
f['vcodec'] = 'none'
formats.append(f)
playlist_formats = _extract_m3u8_playlist_formats(manifest_url)
for format in playlist_formats:
format_index = format.get('index')
for v in (m3u8_id, group_id, name):
if v:
format_id.append(v)
if format_index:
format_id.append(str(format_index))
f = {
'format_id': '-'.join(format_id),
'format_index': format_index,
'url': manifest_url,
'manifest_url': m3u8_url,
'language': media.get('LANGUAGE'),
'ext': ext,
'protocol': entry_protocol,
'preference': preference,
'quality': quality,
}
if media_type == 'AUDIO':
f['vcodec'] = 'none'
formats.append(f)
def build_stream_name():
# Despite specification does not mention NAME attribute for
@@ -1961,74 +2025,82 @@ class InfoExtractor(object):
tbr = float_or_none(
last_stream_inf.get('AVERAGE-BANDWIDTH')
or last_stream_inf.get('BANDWIDTH'), scale=1000)
format_id = []
if m3u8_id:
format_id.append(m3u8_id)
stream_name = build_stream_name()
# Bandwidth of live streams may differ over time thus making
# format_id unpredictable. So it's better to keep provided
# format_id intact.
if not live:
format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
manifest_url = format_url(line.strip())
f = {
'format_id': '-'.join(format_id),
'url': manifest_url,
'manifest_url': m3u8_url,
'tbr': tbr,
'ext': ext,
'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
'protocol': entry_protocol,
'preference': preference,
'quality': quality,
}
resolution = last_stream_inf.get('RESOLUTION')
if resolution:
mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
if mobj:
f['width'] = int(mobj.group('width'))
f['height'] = int(mobj.group('height'))
# Unified Streaming Platform
mobj = re.search(
r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
if mobj:
abr, vbr = mobj.groups()
abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
f.update({
'vbr': vbr,
'abr': abr,
})
codecs = parse_codecs(last_stream_inf.get('CODECS'))
f.update(codecs)
audio_group_id = last_stream_inf.get('AUDIO')
# As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
# references a rendition group MUST have a CODECS attribute.
# However, this is not always respected, for example, [2]
# contains EXT-X-STREAM-INF tag which references AUDIO
# rendition group but does not have CODECS and despite
# referencing an audio group it represents a complete
# (with audio and video) format. So, for such cases we will
# ignore references to rendition groups and treat them
# as complete formats.
if audio_group_id and codecs and f.get('vcodec') != 'none':
audio_group = groups.get(audio_group_id)
if audio_group and audio_group[0].get('URI'):
# TODO: update acodec for audio only formats with
# the same GROUP-ID
f['acodec'] = 'none'
formats.append(f)
# for DailyMotion
progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
if progressive_uri:
http_f = f.copy()
del http_f['manifest_url']
http_f.update({
'format_id': f['format_id'].replace('hls-', 'http-'),
'protocol': 'http',
'url': progressive_uri,
})
formats.append(http_f)
playlist_formats = _extract_m3u8_playlist_formats(manifest_url)
for format in playlist_formats:
format_id = []
if m3u8_id:
format_id.append(m3u8_id)
format_index = format.get('index')
stream_name = build_stream_name()
# Bandwidth of live streams may differ over time thus making
# format_id unpredictable. So it's better to keep provided
# format_id intact.
if not live:
format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
if format_index:
format_id.append(str(format_index))
f = {
'format_id': '-'.join(format_id),
'format_index': format_index,
'url': manifest_url,
'manifest_url': m3u8_url,
'tbr': tbr,
'ext': ext,
'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
'protocol': entry_protocol,
'preference': preference,
'quality': quality,
}
resolution = last_stream_inf.get('RESOLUTION')
if resolution:
mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
if mobj:
f['width'] = int(mobj.group('width'))
f['height'] = int(mobj.group('height'))
# Unified Streaming Platform
mobj = re.search(
r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
if mobj:
abr, vbr = mobj.groups()
abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
f.update({
'vbr': vbr,
'abr': abr,
})
codecs = parse_codecs(last_stream_inf.get('CODECS'))
f.update(codecs)
audio_group_id = last_stream_inf.get('AUDIO')
# As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
# references a rendition group MUST have a CODECS attribute.
# However, this is not always respected, for example, [2]
# contains EXT-X-STREAM-INF tag which references AUDIO
# rendition group but does not have CODECS and despite
# referencing an audio group it represents a complete
# (with audio and video) format. So, for such cases we will
# ignore references to rendition groups and treat them
# as complete formats.
if audio_group_id and codecs and f.get('vcodec') != 'none':
audio_group = groups.get(audio_group_id)
if audio_group and audio_group[0].get('URI'):
# TODO: update acodec for audio only formats with
# the same GROUP-ID
f['acodec'] = 'none'
formats.append(f)
# for DailyMotion
progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
if progressive_uri:
http_f = f.copy()
del http_f['manifest_url']
http_f.update({
'format_id': f['format_id'].replace('hls-', 'http-'),
'protocol': 'http',
'url': progressive_uri,
})
formats.append(http_f)
last_stream_inf = {}
return formats

View File

@@ -1226,6 +1226,15 @@ def parseOpts(overrideArguments=None):
'--ignore-dynamic-mpd', '--no-allow-dynamic-mpd',
action='store_false', dest='dynamic_mpd',
help='Do not process dynamic DASH manifests (Alias: --no-allow-dynamic-mpd)')
extractor.add_option(
'--hls-split-discontinuity',
dest='hls_split_discontinuity', action='store_true', default=False,
help='Split HLS playlists to different formats at discontinuities such as ad breaks'
)
extractor.add_option(
'--no-hls-split-discontinuity',
dest='hls_split_discontinuity', action='store_false',
help='Do not split HLS playlists to different formats at discontinuities such as ad breaks (default)')
parser.add_option_group(general)
parser.add_option_group(network)