Merge branch 'master' into yt-live-from-start-range

2025-12-17 13:38:55 +00:00 · 2024-10-09 11:23:14 -05:00
parent 160d973aee 983c58fb7a
commit 9dd8574b68
109 changed files with 2392 additions and 757 deletions
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -35,6 +35,7 @@ from ..networking import HEADRequest, Request
 from ..networking.exceptions import (
    HTTPError,
    IncompleteRead,
+    TransportError,
    network_exceptions,
 )
 from ..networking.impersonate import ImpersonateTarget
@@ -965,6 +966,9 @@ class InfoExtractor:
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal,
                                             encoding=encoding, data=data)
+        if content is False:
+            assert not fatal
+            return False
        return (content, urlh)

    @staticmethod
@@ -1039,7 +1043,15 @@ class InfoExtractor:

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True,
                              prefix=None, encoding=None, data=None):
-        webpage_bytes = urlh.read()
+        try:
+            webpage_bytes = urlh.read()
+        except TransportError as err:
+            errmsg = f'{video_id}: Error reading response: {err.msg}'
+            if fatal:
+                raise ExtractorError(errmsg, cause=err)
+            self.report_warning(errmsg)
+            return False
+
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if self.get_param('dump_intermediate_pages', False):
@@ -1698,7 +1710,7 @@ class InfoExtractor:
                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
                if rating is not None:
                    info['average_rating'] = rating
-                if is_type(e, 'TVEpisode', 'Episode'):
+                if is_type(e, 'TVEpisode', 'Episode', 'PodcastEpisode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
@@ -2065,7 +2077,7 @@ class InfoExtractor:
        has_drm = HlsFD._has_drm(m3u8_doc)

        def format_url(url):
-            return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
+            return url if re.match(r'https?://', url) else urllib.parse.urljoin(m3u8_url, url)

        if self.get_param('hls_split_discontinuity', False):
            def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
@@ -2806,11 +2818,11 @@ class InfoExtractor:
                        base_url_e = element.find(_add_ns('BaseURL'))
                        if try_call(lambda: base_url_e.text) is not None:
                            base_url = base_url_e.text + base_url
-                            if re.match(r'^https?://', base_url):
+                            if re.match(r'https?://', base_url):
                                break
                    if mpd_base_url and base_url.startswith('/'):
                        base_url = urllib.parse.urljoin(mpd_base_url, base_url)
-                    elif mpd_base_url and not re.match(r'^https?://', base_url):
+                    elif mpd_base_url and not re.match(r'https?://', base_url):
                        if not mpd_base_url.endswith('/'):
                            mpd_base_url += '/'
                        base_url = mpd_base_url + base_url
@@ -2900,7 +2912,7 @@ class InfoExtractor:
                        }

                    def location_key(location):
-                        return 'url' if re.match(r'^https?://', location) else 'path'
+                        return 'url' if re.match(r'https?://', location) else 'path'

                    if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:

@@ -3503,7 +3515,7 @@ class InfoExtractor:
                continue
            urls.add(source_url)
            source_type = source.get('type') or ''
-            ext = mimetype2ext(source_type) or determine_ext(source_url)
+            ext = determine_ext(source_url, default_ext=mimetype2ext(source_type))
            if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',