Better support HLS media discontinuity and fully support media initialization (#105)

* Added options: `--hls-split-discontinuity` and `--no-hls-split-discontinuity`

Authored-by: shirtjs <2660574+shirtjs@users.noreply.github.com>
2026-03-01 03:40:10 +00:00 · 2021-02-24 09:47:53 -05:00
parent c8d83a22ef
commit 310c2ed2c6
13 changed files with 471 additions and 517 deletions
--- a/youtube_dlc/YoutubeDL.py
+++ b/youtube_dlc/YoutubeDL.py
@@ -364,7 +364,7 @@ class YoutubeDL(object):
    nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
    noresizebuffer, retries, continuedl, noprogress, consoletitle,
    xattr_set_filesize, external_downloader_args, hls_use_mpegts,
-    http_chunk_size.
+    hls_split_discontinuity, http_chunk_size.

    The following options are used by the post processors:
    prefer_ffmpeg:     If False, use avconv instead of ffmpeg if both are available,
--- a/youtube_dlc/init.py
+++ b/youtube_dlc/init.py
@@ -536,6 +536,7 @@ def _real_main(argv=None):
        'ffmpeg_location': opts.ffmpeg_location,
        'hls_prefer_native': opts.hls_prefer_native,
        'hls_use_mpegts': opts.hls_use_mpegts,
+        'hls_split_discontinuity': opts.hls_split_discontinuity,
        'external_downloader_args': opts.external_downloader_args,
        'postprocessor_args': opts.postprocessor_args,
        'cn_verification_proxy': opts.cn_verification_proxy,
--- a/youtube_dlc/downloader/hls.py
+++ b/youtube_dlc/downloader/hls.py
@@ -43,7 +43,6 @@ class HlsFD(FragmentFD):
            # r'#EXT-X-PLAYLIST-TYPE:EVENT',  # media segments may be appended to the end of
            #                                 # event media playlists [4]
            # r'#EXT-X-MAP:',  # media initialization [5]
-            r'^\s*(?:[^#\s]|#EXT-X-MAP:).+?\n\s*#EXT-X-MAP:',  # media initialization [5]
            # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4
            # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2
            # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2
@@ -129,6 +128,7 @@ class HlsFD(FragmentFD):
        skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)
        test = self.params.get('test', False)

+        format_index = info_dict.get('format_index')
        extra_query = None
        extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url')
        if extra_param_to_segment_url:
@@ -138,6 +138,7 @@ class HlsFD(FragmentFD):
        decrypt_info = {'METHOD': 'NONE'}
        key_list = []
        byte_range = {}
+        discontinuity_count = 0
        frag_index = 0
        ad_frag_next = False
        for line in s.splitlines():
@@ -145,6 +146,8 @@ class HlsFD(FragmentFD):
            download_frag = False
            if line:
                if not line.startswith('#'):
+                    if format_index and discontinuity_count != format_index:
+                        continue
                    if ad_frag_next:
                        continue
                    frag_index += 1
@@ -163,6 +166,8 @@ class HlsFD(FragmentFD):
                    download_frag = True

                elif line.startswith('#EXT-X-MAP'):
+                    if format_index and discontinuity_count != format_index:
+                        continue
                    if frag_index > 0:
                        self.report_error(
                            'initialization fragment found after media fragments, unable to download')
@@ -218,6 +223,8 @@ class HlsFD(FragmentFD):
                    ad_frag_next = True
                elif is_ad_fragment_end(line):
                    ad_frag_next = False
+                elif line.startswith('#EXT-X-DISCONTINUITY'):
+                    discontinuity_count += 1

                if download_frag:
                    count = 0
--- a/youtube_dlc/extractor/common.py
+++ b/youtube_dlc/extractor/common.py
@@ -1833,9 +1833,8 @@ class InfoExtractor(object):

    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None, quality=None,
-                              m3u8_id=None, note=None, errnote=None,
-                              fatal=True, live=False, data=None, headers={},
-                              query={}):
+                              m3u8_id=None, live=False, note=None, errnote=None,
+                              fatal=True, data=None, headers={}, query={}):
        res = self._download_webpage_handle(
            m3u8_url, video_id,
            note=note or 'Downloading m3u8 information',
@@ -1850,11 +1849,14 @@ class InfoExtractor(object):

        return self._parse_m3u8_formats(
            m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
-            preference=preference, quality=quality, m3u8_id=m3u8_id, live=live)
+            preference=preference, quality=quality, m3u8_id=m3u8_id,
+            note=note, errnote=errnote, fatal=fatal, live=live, data=data,
+            headers=headers, query=query, video_id=video_id)

    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None, quality=None,
-                            m3u8_id=None, live=False):
+                            m3u8_id=None, live=False, note=None, errnote=None,
+                            fatal=True, data=None, headers={}, query={}, video_id=None):
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

@@ -1868,6 +1870,8 @@ class InfoExtractor(object):
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

+        split_discontinuity = self._downloader.params.get('hls_split_discontinuity', False)
+
        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
@@ -1884,15 +1888,67 @@ class InfoExtractor(object):
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

+        def _extract_m3u8_playlist_formats(format_url, m3u8_doc=None):
+            if not m3u8_doc:
+                res = self._download_webpage_handle(
+                    format_url, video_id,
+                    note=False,
+                    errnote=errnote or 'Failed to download m3u8 playlist information',
+                    fatal=fatal, data=data, headers=headers, query=query)
+
+                if res is False:
+                    return []
+
+                m3u8_doc, urlh = res
+                format_url = urlh.geturl()
+
+            playlist_formats = []
+            i = (
+                0
+                if split_discontinuity
+                else None)
+            format_info = {
+                'index': i,
+                'key_data': None,
+                'files': [],
+            }
+            for line in m3u8_doc.splitlines():
+                if not line.startswith('#'):
+                    format_info['files'].append(line)
+                elif split_discontinuity and line.startswith('#EXT-X-DISCONTINUITY'):
+                    i += 1
+                    playlist_formats.append(format_info)
+                    format_info = {
+                        'index': i,
+                        'url': format_url,
+                        'files': [],
+                    }
+            playlist_formats.append(format_info)
+            return playlist_formats
+
        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
-            return [{
-                'url': m3u8_url,
-                'format_id': m3u8_id,
-                'ext': ext,
-                'protocol': entry_protocol,
-                'preference': preference,
-                'quality': quality,
-            }]
+
+            playlist_formats = _extract_m3u8_playlist_formats(m3u8_doc, True)
+
+            for format in playlist_formats:
+                format_id = []
+                if m3u8_id:
+                    format_id.append(m3u8_id)
+                format_index = format.get('index')
+                if format_index:
+                    format_id.append(str(format_index))
+                f = {
+                    'format_id': '-'.join(format_id),
+                    'format_index': format_index,
+                    'url': m3u8_url,
+                    'ext': ext,
+                    'protocol': entry_protocol,
+                    'preference': preference,
+                    'quality': quality,
+                }
+                formats.append(f)
+
+            return formats

        groups = {}
        last_stream_inf = {}
@@ -1908,23 +1964,31 @@ class InfoExtractor(object):
                return
            media_url = media.get('URI')
            if media_url:
+                manifest_url = format_url(media_url)
                format_id = []
-                for v in (m3u8_id, group_id, name):
-                    if v:
-                        format_id.append(v)
-                f = {
-                    'format_id': '-'.join(format_id),
-                    'url': format_url(media_url),
-                    'manifest_url': m3u8_url,
-                    'language': media.get('LANGUAGE'),
-                    'ext': ext,
-                    'protocol': entry_protocol,
-                    'preference': preference,
-                    'quality': quality,
-                }
-                if media_type == 'AUDIO':
-                    f['vcodec'] = 'none'
-                formats.append(f)
+                playlist_formats = _extract_m3u8_playlist_formats(manifest_url)
+
+                for format in playlist_formats:
+                    format_index = format.get('index')
+                    for v in (m3u8_id, group_id, name):
+                        if v:
+                            format_id.append(v)
+                    if format_index:
+                        format_id.append(str(format_index))
+                    f = {
+                        'format_id': '-'.join(format_id),
+                        'format_index': format_index,
+                        'url': manifest_url,
+                        'manifest_url': m3u8_url,
+                        'language': media.get('LANGUAGE'),
+                        'ext': ext,
+                        'protocol': entry_protocol,
+                        'preference': preference,
+                        'quality': quality,
+                    }
+                    if media_type == 'AUDIO':
+                        f['vcodec'] = 'none'
+                    formats.append(f)

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
@@ -1961,74 +2025,82 @@ class InfoExtractor(object):
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
-                format_id = []
-                if m3u8_id:
-                    format_id.append(m3u8_id)
-                stream_name = build_stream_name()
-                # Bandwidth of live streams may differ over time thus making
-                # format_id unpredictable. So it's better to keep provided
-                # format_id intact.
-                if not live:
-                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
-                f = {
-                    'format_id': '-'.join(format_id),
-                    'url': manifest_url,
-                    'manifest_url': m3u8_url,
-                    'tbr': tbr,
-                    'ext': ext,
-                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
-                    'protocol': entry_protocol,
-                    'preference': preference,
-                    'quality': quality,
-                }
-                resolution = last_stream_inf.get('RESOLUTION')
-                if resolution:
-                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
-                    if mobj:
-                        f['width'] = int(mobj.group('width'))
-                        f['height'] = int(mobj.group('height'))
-                # Unified Streaming Platform
-                mobj = re.search(
-                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
-                if mobj:
-                    abr, vbr = mobj.groups()
-                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
-                    f.update({
-                        'vbr': vbr,
-                        'abr': abr,
-                    })
-                codecs = parse_codecs(last_stream_inf.get('CODECS'))
-                f.update(codecs)
-                audio_group_id = last_stream_inf.get('AUDIO')
-                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
-                # references a rendition group MUST have a CODECS attribute.
-                # However, this is not always respected, for example, [2]
-                # contains EXT-X-STREAM-INF tag which references AUDIO
-                # rendition group but does not have CODECS and despite
-                # referencing an audio group it represents a complete
-                # (with audio and video) format. So, for such cases we will
-                # ignore references to rendition groups and treat them
-                # as complete formats.
-                if audio_group_id and codecs and f.get('vcodec') != 'none':
-                    audio_group = groups.get(audio_group_id)
-                    if audio_group and audio_group[0].get('URI'):
-                        # TODO: update acodec for audio only formats with
-                        # the same GROUP-ID
-                        f['acodec'] = 'none'
-                formats.append(f)

-                # for DailyMotion
-                progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
-                if progressive_uri:
-                    http_f = f.copy()
-                    del http_f['manifest_url']
-                    http_f.update({
-                        'format_id': f['format_id'].replace('hls-', 'http-'),
-                        'protocol': 'http',
-                        'url': progressive_uri,
-                    })
-                    formats.append(http_f)
+                playlist_formats = _extract_m3u8_playlist_formats(manifest_url)
+
+                for format in playlist_formats:
+                    format_id = []
+                    if m3u8_id:
+                        format_id.append(m3u8_id)
+                    format_index = format.get('index')
+                    stream_name = build_stream_name()
+                    # Bandwidth of live streams may differ over time thus making
+                    # format_id unpredictable. So it's better to keep provided
+                    # format_id intact.
+                    if not live:
+                        format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
+                    if format_index:
+                        format_id.append(str(format_index))
+                    f = {
+                        'format_id': '-'.join(format_id),
+                        'format_index': format_index,
+                        'url': manifest_url,
+                        'manifest_url': m3u8_url,
+                        'tbr': tbr,
+                        'ext': ext,
+                        'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
+                        'protocol': entry_protocol,
+                        'preference': preference,
+                        'quality': quality,
+                    }
+                    resolution = last_stream_inf.get('RESOLUTION')
+                    if resolution:
+                        mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
+                        if mobj:
+                            f['width'] = int(mobj.group('width'))
+                            f['height'] = int(mobj.group('height'))
+                    # Unified Streaming Platform
+                    mobj = re.search(
+                        r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
+                    if mobj:
+                        abr, vbr = mobj.groups()
+                        abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
+                        f.update({
+                            'vbr': vbr,
+                            'abr': abr,
+                        })
+                    codecs = parse_codecs(last_stream_inf.get('CODECS'))
+                    f.update(codecs)
+                    audio_group_id = last_stream_inf.get('AUDIO')
+                    # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
+                    # references a rendition group MUST have a CODECS attribute.
+                    # However, this is not always respected, for example, [2]
+                    # contains EXT-X-STREAM-INF tag which references AUDIO
+                    # rendition group but does not have CODECS and despite
+                    # referencing an audio group it represents a complete
+                    # (with audio and video) format. So, for such cases we will
+                    # ignore references to rendition groups and treat them
+                    # as complete formats.
+                    if audio_group_id and codecs and f.get('vcodec') != 'none':
+                        audio_group = groups.get(audio_group_id)
+                        if audio_group and audio_group[0].get('URI'):
+                            # TODO: update acodec for audio only formats with
+                            # the same GROUP-ID
+                            f['acodec'] = 'none'
+                    formats.append(f)
+
+                    # for DailyMotion
+                    progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
+                    if progressive_uri:
+                        http_f = f.copy()
+                        del http_f['manifest_url']
+                        http_f.update({
+                            'format_id': f['format_id'].replace('hls-', 'http-'),
+                            'protocol': 'http',
+                            'url': progressive_uri,
+                        })
+                        formats.append(http_f)

                last_stream_inf = {}
        return formats
--- a/youtube_dlc/options.py
+++ b/youtube_dlc/options.py
@@ -1226,6 +1226,15 @@ def parseOpts(overrideArguments=None):
        '--ignore-dynamic-mpd', '--no-allow-dynamic-mpd',
        action='store_false', dest='dynamic_mpd',
        help='Do not process dynamic DASH manifests (Alias: --no-allow-dynamic-mpd)')
+    extractor.add_option(
+        '--hls-split-discontinuity',
+        dest='hls_split_discontinuity', action='store_true', default=False,
+        help='Split HLS playlists to different formats at discontinuities such as ad breaks'
+    )
+    extractor.add_option(
+        '--no-hls-split-discontinuity',
+        dest='hls_split_discontinuity', action='store_false',
+        help='Do not split HLS playlists to different formats at discontinuities such as ad breaks (default)')

    parser.add_option_group(general)
    parser.add_option_group(network)