Merge 744d239dd7 into 06c1a8cdff

2025-06-28 01:18:30 +00:00 · 2025-06-26 19:34:49 -07:00 · 2025-06-26 19:34:49 -07:00 · 5d1513c483
commit 5d1513c483
parent 06c1a8cdff 744d239dd7
2 changed files with 76 additions and 7 deletions
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -2241,6 +2241,7 @@
 from .tvplayer import TVPlayerIE
 from .tvw import (
    TvwIE,
    TvwNewsIE,
    TvwTvChannelsIE,
 )
 from .tweakers import TweakersIE
--- a/yt_dlp/extractor/tvw.py
+++ b/yt_dlp/extractor/tvw.py
@@ -1,6 +1,6 @@
 import json
-from .common import InfoExtractor
+from .common import ExtractorError, HTTPError, InfoExtractor
 from ..utils import (
    clean_html,
    extract_attributes,
@@ -10,12 +10,31 @@
    unified_timestamp,
    url_or_none,
 )
-from ..utils.traversal import find_element, traverse_obj
+from ..utils.traversal import find_element, find_elements, traverse_obj
-class TvwIE(InfoExtractor):
+class TvwBaseIE(InfoExtractor):
    def _download_tvw_webpage(self, url, video_id):
        """Download a tvw.org page, retrying once with impersonation on HTTP 403.

        The first request is sent with a hardcoded Firefox User-Agent. If it
        fails with an ExtractorError whose cause is an HTTPError with status
        403, a warning is reported and the page is requested again with
        ``impersonate=True``. Any other ExtractorError is re-raised unchanged.

        Returns whatever ``self._download_webpage`` returns for the request
        that succeeded.
        """
        # yt-dlp's default user-agents are too old and blocked by cloudflare
        browser_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0',
        }
        try:
            return self._download_webpage(url, video_id, headers=browser_headers)
        except ExtractorError as error:
            got_forbidden = isinstance(error.cause, HTTPError) and error.cause.status == 403
            if not got_forbidden:
                raise
            self.report_warning('Got HTTP Error 403, retrying')

        # Retry with impersonation if hardcoded UA is insufficient to bypass cloudflare.
        # Kept outside the except block so a failure here is not chained to the 403.
        return self._download_webpage(url, video_id, impersonate=True)
 class TvwIE(TvwBaseIE):
    IE_NAME = 'tvw'
-    _VALID_URL = r'https?://(?:www\.)?tvw\.org/video/(?P<id>[^/?#]+)'
+    _VALID_URL = [
        r'https?://(?:www\.)?tvw\.org/video/(?P<id>[^/?#]+)',
        r'https?://(?:www\.)?tvw\.org/watch/?\?(?:[^#]+&)?eventID=(?P<id>\d+)',
    ]
    _TESTS = [{
        'url': 'https://tvw.org/video/billy-frank-jr-statue-maquette-unveiling-ceremony-2024011211/',
        'md5': '9ceb94fe2bb7fd726f74f16356825703',
@@ -75,11 +94,25 @@ class TvwIE(InfoExtractor):
            'display_id': 'washington-to-washington-a-new-space-race-2022041111',
            'categories': ['Washington to Washington', 'General Interest'],
        },
    }, {
        'url': 'https://tvw.org/watch?eventID=2025041235',
        'md5': '7d697c02f110b37d6a47622ea608ca90',
        'info_dict': {
            'id': '2025041235',
            'ext': 'mp4',
            'title': 'Legislative Review -- April 18',
            'thumbnail': r're:^https?://.*\.(?:jpe?g|png)$',
            'description': 'Legislative Review features highlights from Friday\'s legislative activity (4/18/25).',
            'timestamp': 1745006400,
            'upload_date': '20250418',
            'location': 'Hayner Media Center',
            'categories': ['Legislative Review'],
        },
    }]
    def _real_extract(self, url):
        display_id = self._match_id(url)
-        webpage = self._download_webpage(url, display_id)
+        webpage = self._download_tvw_webpage(url, display_id)
        client_id = self._html_search_meta('clientID', webpage, fatal=True)
        video_id = self._html_search_meta('eventID', webpage, fatal=True)
@@ -125,7 +158,42 @@ def _real_extract(self, url):
        }
-class TvwTvChannelsIE(InfoExtractor):
+class TvwNewsIE(TvwBaseIE):
    IE_NAME = 'tvw:news'
    _VALID_URL = r'https?://(?:www\.)?tvw\.org/\d{4}/\d{2}/(?P<id>[^/?#]+)'
    _TESTS = [{
        'url': 'https://tvw.org/2024/01/the-impact-issues-to-watch-in-the-2024-legislative-session/',
        'info_dict': {
            'id': 'the-impact-issues-to-watch-in-the-2024-legislative-session',
            'title': 'The Impact - Issues to Watch in the 2024 Legislative Session',
            'description': 'md5:65f0b33ec8f18ff1cd401c5547aa5441',
        },
        'playlist_count': 6,
    }, {
        'url': 'https://tvw.org/2024/06/the-impact-water-rights-and-the-skookumchuck-dam-debate/',
        'info_dict': {
            'id': 'the-impact-water-rights-and-the-skookumchuck-dam-debate',
            'title': 'The Impact - Water Rights and the Skookumchuck Dam Debate',
            'description': 'md5:185f3a2350ef81e3fa159ac3e040a94b',
        },
        'playlist_count': 1,
    }]
    def _real_extract(self, url):
        """Build a playlist from the players embedded in a TVW news article.

        The ID matched from the URL is used as the playlist ID. Every element
        with class ``invintus-player`` contributes its ``data-eventid``
        attribute, and each event ID becomes a ``tvw.org/watch?eventID=...``
        entry handled by TvwIE.
        """
        playlist_id = self._match_id(url)
        webpage = self._download_tvw_webpage(url, playlist_id)

        # Path: all player elements -> each one -> its attributes -> the event ID
        event_id_path = (
            {find_elements(cls='invintus-player', html=True)},
            ...,
            {extract_attributes},
            'data-eventid',
        )
        event_ids = traverse_obj(webpage, event_id_path)

        # The Open Graph title carries a site suffix that is stripped for the playlist title
        og_title = self._og_search_title(webpage, default=None)
        playlist_metadata = {
            'playlist_title': remove_end(og_title, ' - TVW'),
            'playlist_description': self._og_search_description(webpage, default=None),
        }

        return self.playlist_from_matches(
            event_ids, playlist_id,
            getter=lambda x: f'https://tvw.org/watch?eventID={x}',
            ie=TvwIE, **playlist_metadata)
 class TvwTvChannelsIE(TvwBaseIE):
    IE_NAME = 'tvw:tvchannels'
    _VALID_URL = r'https?://(?:www\.)?tvw\.org/tvchannels/(?P<id>[^/?#]+)'
    _TESTS = [{
@@ -150,7 +218,7 @@ class TvwTvChannelsIE(InfoExtractor):
    def _real_extract(self, url):
        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
+        webpage = self._download_tvw_webpage(url, video_id)
        m3u8_url = traverse_obj(webpage, (
            {find_element(id='invintus-persistent-stream-frame', html=True)}, {extract_attributes},