From 9615ae99c0487d8bf8484d657f52a891cfe0a84f Mon Sep 17 00:00:00 2001
From: Fries <fries12420@gmail.com>
Date: Sun, 13 Apr 2025 20:27:40 -0700
Subject: [PATCH 1/4] [ie/tvw:News] Add extractor

---
 yt_dlp/extractor/_extractors.py |  2 +-
 yt_dlp/extractor/tvw.py         | 59 ++++++++++++++++++++++++++++++---
 2 files changed, 55 insertions(+), 6 deletions(-)
diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index f7e3f25c3..e8e0fdcdd 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -2237,7 +2237,7 @@
     TVPlayIE,
 )
 from .tvplayer import TVPlayerIE
-from .tvw import TvwIE
+from .tvw import TvwIE, TvwNewsIE
 from .tweakers import TweakersIE
 from .twentymin import TwentyMinutenIE
 from .twentythreevideo import TwentyThreeVideoIE
diff --git a/yt_dlp/extractor/tvw.py b/yt_dlp/extractor/tvw.py
index 1c060cd7a..6f924d7b1 100644
--- a/yt_dlp/extractor/tvw.py
+++ b/yt_dlp/extractor/tvw.py
@@ -1,11 +1,19 @@
 import json
 
 from .common import InfoExtractor
-from ..utils import clean_html, remove_end, unified_timestamp, url_or_none
-from ..utils.traversal import traverse_obj
+from ..utils import clean_html, extract_attributes, remove_end, unified_timestamp, url_or_none
+from ..utils.traversal import find_elements, traverse_obj
 
 
-class TvwIE(InfoExtractor):
+class TvwBaseIE(InfoExtractor):
+    def _get_title(self, webpage):
+        return remove_end(self._og_search_title(webpage, default=None), ' - TVW')
+
+    def _get_description(self, webpage):
+        return self._og_search_description(webpage, default=None)
+
+
+class TvwIE(TvwBaseIE):
     _VALID_URL = r'https?://(?:www\.)?tvw\.org/video/(?P<id>[^/?#]+)'
 
     _TESTS = [{
@@ -103,8 +111,8 @@ def _real_extract(self, url):
             'display_id': display_id,
             'formats': formats,
             'subtitles': subtitles,
-            'title': remove_end(self._og_search_title(webpage, default=None), ' - TVW'),
-            'description': self._og_search_description(webpage, default=None),
+            'title': self._get_title(webpage),
+            'description': self._get_description(webpage),
             **traverse_obj(video_data, {
                 'title': ('title', {str}),
                 'description': ('description', {clean_html}),
@@ -115,3 +123,44 @@ def _real_extract(self, url):
                 'is_live': ('eventStatus', {lambda x: x == 'live'}),
             }),
         }
+
+
+class TvwNewsIE(TvwBaseIE):
+    IE_NAME = 'Tvw:News'
+    _VALID_URL = r'https?://(?:www\.)?tvw\.org/(\d{4})/(0[1-9]|1[0-2])/(?P<id>[^/?#]+)'
+    _TESTS = [{
+        'url': 'https://tvw.org/2024/01/the-impact-issues-to-watch-in-the-2024-legislative-session/',
+        'info_dict': {
+            'id': 'the-impact-issues-to-watch-in-the-2024-legislative-session',
+            'title': 'The Impact - Issues to Watch in the 2024 Legislative Session',
+            'description': 'md5:65f0b33ec8f18ff1cd401c5547aa5441',
+        },
+        'playlist_count': 6,
+    }, {
+        'url': 'https://tvw.org/2024/06/the-impact-water-rights-and-the-skookumchuck-dam-debate/',
+        'info_dict': {
+            'id': 'the-impact-water-rights-and-the-skookumchuck-dam-debate',
+            'title': 'The Impact - Water Rights and the Skookumchuck Dam Debate',
+            'description': 'md5:185f3a2350ef81e3fa159ac3e040a94b',
+        },
+        'playlist_count': 1,
+    }, {
+        'url': 'https://tvw.org/2023/09/5th-annual-tvw-open-thank-you/',
+        'info_dict': {
+            'id': '5th-annual-tvw-open-thank-you',
+            'title': '5th Annual TVW Open THANK YOU!',
+            'description': 'md5:5306eef5b03c87108797cb6261c5f16c',
+        },
+        'playlist_count': 0,
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+        webpage = self._download_webpage(url, playlist_id)
+
+        video_ids = traverse_obj(webpage, (
+            {find_elements(cls='invintus-player', html=True)}, ..., {extract_attributes}, 'data-eventid'))
+
+        return self.playlist_result(
+            (self.url_result(f'https://tvw.org/watch?eventID={video_id}') for video_id in video_ids), playlist_id,
+            playlist_title=self._get_title(webpage), playlist_description=self._get_description(webpage))

From 76d1c63d443ee3732686ce44a4a5428db7c59965 Mon Sep 17 00:00:00 2001
From: Fries <fries12420@gmail.com>
Date: Fri, 18 Apr 2025 22:57:43 -0700
Subject: [PATCH 2/4] [ie/tvw] Make some updates from suggestions

---
 yt_dlp/extractor/tvw.py | 38 ++++++++++++++++++++++++++++----------
 1 file changed, 28 insertions(+), 10 deletions(-)

diff --git a/yt_dlp/extractor/tvw.py b/yt_dlp/extractor/tvw.py
index 89556cf98..9698f03f2 100644
--- a/yt_dlp/extractor/tvw.py
+++ b/yt_dlp/extractor/tvw.py
@@ -1,13 +1,21 @@
 import json
 
 from .common import InfoExtractor
-from ..utils import clean_html, extract_attributes, parse_qs, remove_end, require, unified_timestamp, url_or_none
+from ..utils import (
+    clean_html,
+    extract_attributes,
+    parse_qs,
+    remove_end,
+    require,
+    unified_timestamp,
+    url_or_none,
+)
 from ..utils.traversal import find_element, find_elements, traverse_obj
 
 
 class TvwIE(InfoExtractor):
     IE_NAME = 'tvw'
-    _VALID_URL = r'https?://(?:www\.)?tvw\.org/video/(?P<id>[^/?#]+)'
+    _VALID_URL = r'https?://(?:www\.)?tvw\.org/(?:video|watch)/?(?:\?eventID=)?(?P<id>[^/?#]+)'
     _TESTS = [{
         'url': 'https://tvw.org/video/billy-frank-jr-statue-maquette-unveiling-ceremony-2024011211/',
         'md5': '9ceb94fe2bb7fd726f74f16356825703',
@@ -71,7 +79,10 @@ class TvwIE(InfoExtractor):
 
     def _real_extract(self, url):
         display_id = self._match_id(url)
-        webpage = self._download_webpage(url, display_id)
+        # Use a newer user agent as the default yt-dlp one triggers the Cloudflare anti-bot challenge
+        webpage = self._download_webpage(url, display_id, headers={
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0',
+        })
 
         client_id = self._html_search_meta('clientID', webpage, fatal=True)
         video_id = self._html_search_meta('eventID', webpage, fatal=True)
@@ -118,8 +129,8 @@ def _real_extract(self, url):
 
 
 class TvwNewsIE(InfoExtractor):
-    IE_NAME = 'Tvw:News'
-    _VALID_URL = r'https?://(?:www\.)?tvw\.org/(\d{4})/(0[1-9]|1[0-2])/(?P<id>[^/?#]+)'
+    IE_NAME = 'tvw:News'
+    _VALID_URL = r'https?://(?:www\.)?tvw\.org/\d{4}/\d{2}/(?P<id>[^/?#]+)'
     _TESTS = [{
         'url': 'https://tvw.org/2024/01/the-impact-issues-to-watch-in-the-2024-legislative-session/',
         'info_dict': {
@@ -148,14 +159,18 @@ class TvwNewsIE(InfoExtractor):
 
     def _real_extract(self, url):
         playlist_id = self._match_id(url)
-        webpage = self._download_webpage(url, playlist_id)
+        # Use a newer user agent as the default yt-dlp one triggers the Cloudflare anti-bot challenge
+        webpage = self._download_webpage(url, playlist_id, headers={
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0',
+        })
 
         video_ids = traverse_obj(webpage, (
             {find_elements(cls='invintus-player', html=True)}, ..., {extract_attributes}, 'data-eventid'))
 
-        return self.playlist_result(
-            (self.url_result(f'https://tvw.org/watch?eventID={video_id}') for video_id in video_ids), playlist_id,
-            playlist_title=remove_end(self._og_search_title(webpage, default=None), ' - TVW'), playlist_description=self._og_search_description(webpage, default=None))
+        return self.playlist_from_matches(
+            (f'https://tvw.org/watch?eventID={video_id}' for video_id in video_ids), playlist_id,
+            playlist_title=remove_end(self._og_search_title(webpage, default=None), ' - TVW'),
+            playlist_description=self._og_search_description(webpage, default=None), ie=TvwIE)
 
 
 class TvwTvChannelsIE(InfoExtractor):
@@ -183,7 +198,10 @@ class TvwTvChannelsIE(InfoExtractor):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
+        # Use a newer user agent as the default yt-dlp one triggers the Cloudflare anti-bot challenge
+        webpage = self._download_webpage(url, video_id, headers={
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0',
+        })
 
         m3u8_url = traverse_obj(webpage, (
             {find_element(id='invintus-persistent-stream-frame', html=True)}, {extract_attributes},

From 8032ad0af53dc3edcba7f6ecdd8976fb6902b7be Mon Sep 17 00:00:00 2001
From: Fries <fries12420@gmail.com>
Date: Fri, 18 Apr 2025 23:06:20 -0700
Subject: [PATCH 3/4] [ie/tvw] Add a test for a watch? link

---
 yt_dlp/extractor/tvw.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/yt_dlp/extractor/tvw.py b/yt_dlp/extractor/tvw.py
index 9698f03f2..29e8ab5b5 100644
--- a/yt_dlp/extractor/tvw.py
+++ b/yt_dlp/extractor/tvw.py
@@ -75,6 +75,20 @@ class TvwIE(InfoExtractor):
             'display_id': 'washington-to-washington-a-new-space-race-2022041111',
             'categories': ['Washington to Washington', 'General Interest'],
         },
+    }, {
+        'url': 'https://tvw.org/watch?eventID=2025041235',
+        'md5': '7d697c02f110b37d6a47622ea608ca90',
+        'info_dict': {
+            'id': '2025041235',
+            'ext': 'mp4',
+            'title': 'Legislative Review -- April 18',
+            'thumbnail': r're:^https?://.*\.(?:jpe?g|png)$',
+            'description': 'Legislative Review features highlights from Friday\'s legislative activity (4/18/25).',
+            'timestamp': 1745006400,
+            'upload_date': '20250418',
+            'location': 'Hayner Media Center',
+            'categories': ['Legislative Review'],
+        },
     }]
 
     def _real_extract(self, url):

From 744d239dd70a4a3e8be6884fecabe50bee47867f Mon Sep 17 00:00:00 2001
From: Fries <fries12420@gmail.com>
Date: Sun, 11 May 2025 16:03:12 -0700
Subject: [PATCH 4/4] [ie/tvw] Make updates from suggestions

---
 yt_dlp/extractor/tvw.py | 59 ++++++++++++++++++++++-------------------
 1 file changed, 31 insertions(+), 28 deletions(-)

diff --git a/yt_dlp/extractor/tvw.py b/yt_dlp/extractor/tvw.py
index 29e8ab5b5..6cd3c715d 100644
--- a/yt_dlp/extractor/tvw.py
+++ b/yt_dlp/extractor/tvw.py
@@ -1,6 +1,6 @@
 import json
 
-from .common import InfoExtractor
+from .common import ExtractorError, HTTPError, InfoExtractor
 from ..utils import (
     clean_html,
     extract_attributes,
@@ -13,9 +13,28 @@
 from ..utils.traversal import find_element, find_elements, traverse_obj
 
 
-class TvwIE(InfoExtractor):
+class TvwBaseIE(InfoExtractor):
+    def _download_tvw_webpage(self, url, video_id):
+        try:
+            return self._download_webpage(url, video_id, headers={
+                # yt-dlp's default User-Agent values are too old and get blocked by Cloudflare
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0',
+            })
+        except ExtractorError as e:
+            if not isinstance(e.cause, HTTPError) or e.cause.status != 403:
+                raise
+            self.report_warning('Got HTTP Error 403, retrying')
+
+        # Only reached after a 403: retry with impersonation, as the hardcoded UA did not get past Cloudflare
+        return self._download_webpage(url, video_id, impersonate=True)
+
+
+class TvwIE(TvwBaseIE):
     IE_NAME = 'tvw'
-    _VALID_URL = r'https?://(?:www\.)?tvw\.org/(?:video|watch)/?(?:\?eventID=)?(?P<id>[^/?#]+)'
+    _VALID_URL = [
+        r'https?://(?:www\.)?tvw\.org/video/(?P<id>[^/?#]+)',
+        r'https?://(?:www\.)?tvw\.org/watch/?\?(?:[^#]+&)?eventID=(?P<id>\d+)',
+    ]
     _TESTS = [{
         'url': 'https://tvw.org/video/billy-frank-jr-statue-maquette-unveiling-ceremony-2024011211/',
         'md5': '9ceb94fe2bb7fd726f74f16356825703',
@@ -93,10 +112,7 @@ class TvwIE(InfoExtractor):
 
     def _real_extract(self, url):
         display_id = self._match_id(url)
-        # Use a newer user agent as the default yt-dlp one triggers the Cloudflare anti-bot challenge
-        webpage = self._download_webpage(url, display_id, headers={
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0',
-        })
+        webpage = self._download_tvw_webpage(url, display_id)
 
         client_id = self._html_search_meta('clientID', webpage, fatal=True)
         video_id = self._html_search_meta('eventID', webpage, fatal=True)
@@ -142,8 +158,8 @@ def _real_extract(self, url):
         }
 
 
-class TvwNewsIE(InfoExtractor):
-    IE_NAME = 'tvw:News'
+class TvwNewsIE(TvwBaseIE):
+    IE_NAME = 'tvw:news'
     _VALID_URL = r'https?://(?:www\.)?tvw\.org/\d{4}/\d{2}/(?P<id>[^/?#]+)'
     _TESTS = [{
         'url': 'https://tvw.org/2024/01/the-impact-issues-to-watch-in-the-2024-legislative-session/',
@@ -161,33 +177,23 @@ class TvwNewsIE(InfoExtractor):
             'description': 'md5:185f3a2350ef81e3fa159ac3e040a94b',
         },
         'playlist_count': 1,
-    }, {
-        'url': 'https://tvw.org/2023/09/5th-annual-tvw-open-thank-you/',
-        'info_dict': {
-            'id': '5th-annual-tvw-open-thank-you',
-            'title': '5th Annual TVW Open THANK YOU!',
-            'description': 'md5:5306eef5b03c87108797cb6261c5f16c',
-        },
-        'playlist_count': 0,
     }]
 
     def _real_extract(self, url):
         playlist_id = self._match_id(url)
-        # Use a newer user agent as the default yt-dlp one triggers the Cloudflare anti-bot challenge
-        webpage = self._download_webpage(url, playlist_id, headers={
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0',
-        })
+        webpage = self._download_tvw_webpage(url, playlist_id)
 
         video_ids = traverse_obj(webpage, (
             {find_elements(cls='invintus-player', html=True)}, ..., {extract_attributes}, 'data-eventid'))
 
         return self.playlist_from_matches(
-            (f'https://tvw.org/watch?eventID={video_id}' for video_id in video_ids), playlist_id,
+            video_ids, playlist_id,
             playlist_title=remove_end(self._og_search_title(webpage, default=None), ' - TVW'),
-            playlist_description=self._og_search_description(webpage, default=None), ie=TvwIE)
+            playlist_description=self._og_search_description(webpage, default=None),
+            getter=lambda x: f'https://tvw.org/watch?eventID={x}', ie=TvwIE)
 
 
-class TvwTvChannelsIE(InfoExtractor):
+class TvwTvChannelsIE(TvwBaseIE):
     IE_NAME = 'tvw:tvchannels'
     _VALID_URL = r'https?://(?:www\.)?tvw\.org/tvchannels/(?P<id>[^/?#]+)'
     _TESTS = [{
@@ -212,10 +218,7 @@ class TvwTvChannelsIE(InfoExtractor):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        # Use a newer user agent as the default yt-dlp one triggers the Cloudflare anti-bot challenge
-        webpage = self._download_webpage(url, video_id, headers={
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0',
-        })
+        webpage = self._download_tvw_webpage(url, video_id)
 
         m3u8_url = traverse_obj(webpage, (
             {find_element(id='invintus-persistent-stream-frame', html=True)}, {extract_attributes},