From 9615ae99c0487d8bf8484d657f52a891cfe0a84f Mon Sep 17 00:00:00 2001 From: Fries Date: Sun, 13 Apr 2025 20:27:40 -0700 Subject: [PATCH 1/4] [ie/tvw:News] Add extractor --- yt_dlp/extractor/_extractors.py | 2 +- yt_dlp/extractor/tvw.py | 59 ++++++++++++++++++++++++++++++--- 2 files changed, 55 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index f7e3f25c3..e8e0fdcdd 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2237,7 +2237,7 @@ TVPlayIE, ) from .tvplayer import TVPlayerIE -from .tvw import TvwIE +from .tvw import TvwIE, TvwNewsIE from .tweakers import TweakersIE from .twentymin import TwentyMinutenIE from .twentythreevideo import TwentyThreeVideoIE diff --git a/yt_dlp/extractor/tvw.py b/yt_dlp/extractor/tvw.py index 1c060cd7a..6f924d7b1 100644 --- a/yt_dlp/extractor/tvw.py +++ b/yt_dlp/extractor/tvw.py @@ -1,11 +1,19 @@ import json from .common import InfoExtractor -from ..utils import clean_html, remove_end, unified_timestamp, url_or_none -from ..utils.traversal import traverse_obj +from ..utils import clean_html, extract_attributes, remove_end, unified_timestamp, url_or_none +from ..utils.traversal import find_elements, traverse_obj -class TvwIE(InfoExtractor): +class TvwBaseIE(InfoExtractor): + def _get_title(self, webpage): + return remove_end(self._og_search_title(webpage, default=None), ' - TVW') + + def _get_description(self, webpage): + return self._og_search_description(webpage, default=None) + + +class TvwIE(TvwBaseIE): _VALID_URL = r'https?://(?:www\.)?tvw\.org/video/(?P[^/?#]+)' _TESTS = [{ @@ -103,8 +111,8 @@ def _real_extract(self, url): 'display_id': display_id, 'formats': formats, 'subtitles': subtitles, - 'title': remove_end(self._og_search_title(webpage, default=None), ' - TVW'), - 'description': self._og_search_description(webpage, default=None), + 'title': self._get_title(webpage), + 'description': self._get_description(webpage), **traverse_obj(video_data, { 'title': ('title', {str}), 'description': ('description', {clean_html}), @@ -115,3 +123,44 @@ def _real_extract(self, url): 'is_live': ('eventStatus', {lambda x: x == 'live'}), }), } + + +class TvwNewsIE(TvwBaseIE): + IE_NAME = 'Tvw:News' + _VALID_URL = r'https?://(?:www\.)?tvw\.org/(\d{4})/(0[1-9]|1[0-2])/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://tvw.org/2024/01/the-impact-issues-to-watch-in-the-2024-legislative-session/', + 'info_dict': { + 'id': 'the-impact-issues-to-watch-in-the-2024-legislative-session', + 'title': 'The Impact - Issues to Watch in the 2024 Legislative Session', + 'description': 'md5:65f0b33ec8f18ff1cd401c5547aa5441', + }, + 'playlist_count': 6, + }, { + 'url': 'https://tvw.org/2024/06/the-impact-water-rights-and-the-skookumchuck-dam-debate/', + 'info_dict': { + 'id': 'the-impact-water-rights-and-the-skookumchuck-dam-debate', + 'title': 'The Impact - Water Rights and the Skookumchuck Dam Debate', + 'description': 'md5:185f3a2350ef81e3fa159ac3e040a94b', + }, + 'playlist_count': 1, + }, { + 'url': 'https://tvw.org/2023/09/5th-annual-tvw-open-thank-you/', + 'info_dict': { + 'id': '5th-annual-tvw-open-thank-you', + 'title': '5th Annual TVW Open THANK YOU!', + 'description': 'md5:5306eef5b03c87108797cb6261c5f16c', + }, + 'playlist_count': 0, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + + video_ids = traverse_obj(webpage, ( + {find_elements(cls='invintus-player', html=True)}, ..., {extract_attributes}, 'data-eventid')) + + return self.playlist_result( + (self.url_result(f'https://tvw.org/watch?eventID={video_id}') for video_id in video_ids), playlist_id, + playlist_title=self._get_title(webpage), playlist_description=self._get_description(webpage)) From 76d1c63d443ee3732686ce44a4a5428db7c59965 Mon Sep 17 00:00:00 2001 From: Fries Date: Fri, 18 Apr 2025 22:57:43 -0700 Subject: [PATCH 2/4] [ie/tvw] Make some updates from suggestions --- yt_dlp/extractor/tvw.py | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/yt_dlp/extractor/tvw.py b/yt_dlp/extractor/tvw.py index 89556cf98..9698f03f2 100644 --- a/yt_dlp/extractor/tvw.py +++ b/yt_dlp/extractor/tvw.py @@ -1,13 +1,21 @@ import json from .common import InfoExtractor -from ..utils import clean_html, extract_attributes, parse_qs, remove_end, require, unified_timestamp, url_or_none +from ..utils import ( + clean_html, + extract_attributes, + parse_qs, + remove_end, + require, + unified_timestamp, + url_or_none, +) from ..utils.traversal import find_element, find_elements, traverse_obj class TvwIE(InfoExtractor): IE_NAME = 'tvw' - _VALID_URL = r'https?://(?:www\.)?tvw\.org/video/(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?tvw\.org/(?:video|watch)/?(?:\?eventID=)?(?P[^/?#]+)' _TESTS = [{ 'url': 'https://tvw.org/video/billy-frank-jr-statue-maquette-unveiling-ceremony-2024011211/', 'md5': '9ceb94fe2bb7fd726f74f16356825703', @@ -71,7 +79,10 @@ class TvwIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + # Use a newer user agent as the default yt-dlp one triggers the Cloudflare anti-bot challenge + webpage = self._download_webpage(url, display_id, headers={ + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0', + }) client_id = self._html_search_meta('clientID', webpage, fatal=True) video_id = self._html_search_meta('eventID', webpage, fatal=True) @@ -118,8 +129,8 @@ def _real_extract(self, url): class TvwNewsIE(InfoExtractor): - IE_NAME = 'Tvw:News' - _VALID_URL = r'https?://(?:www\.)?tvw\.org/(\d{4})/(0[1-9]|1[0-2])/(?P[^/?#]+)' + IE_NAME = 'tvw:News' + _VALID_URL = r'https?://(?:www\.)?tvw\.org/\d{4}/\d{2}/(?P[^/?#]+)' _TESTS = [{ 'url': 'https://tvw.org/2024/01/the-impact-issues-to-watch-in-the-2024-legislative-session/', 'info_dict': { @@ -148,14 +159,18 @@ class TvwNewsIE(InfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) + # Use a newer user agent as the default yt-dlp one triggers the Cloudflare anti-bot challenge + webpage = self._download_webpage(url, playlist_id, headers={ + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0', + }) video_ids = traverse_obj(webpage, ( {find_elements(cls='invintus-player', html=True)}, ..., {extract_attributes}, 'data-eventid')) - return self.playlist_result( - (self.url_result(f'https://tvw.org/watch?eventID={video_id}') for video_id in video_ids), playlist_id, - playlist_title=remove_end(self._og_search_title(webpage, default=None), ' - TVW'), playlist_description=self._og_search_description(webpage, default=None)) + return self.playlist_from_matches( + (f'https://tvw.org/watch?eventID={video_id}' for video_id in video_ids), playlist_id, + playlist_title=remove_end(self._og_search_title(webpage, default=None), ' - TVW'), + playlist_description=self._og_search_description(webpage, default=None), ie=TvwIE) class TvwTvChannelsIE(InfoExtractor): @@ -183,7 +198,10 @@ class TvwTvChannelsIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + # Use a newer user agent as the default yt-dlp one triggers the Cloudflare anti-bot challenge + webpage = self._download_webpage(url, video_id, headers={ + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0', + }) m3u8_url = traverse_obj(webpage, ( {find_element(id='invintus-persistent-stream-frame', html=True)}, {extract_attributes}, From 8032ad0af53dc3edcba7f6ecdd8976fb6902b7be Mon Sep 17 00:00:00 2001 From: Fries Date: Fri, 18 Apr 2025 23:06:20 -0700 Subject: [PATCH 3/4] [ie/tvw] Add a test for a watch? link --- yt_dlp/extractor/tvw.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/yt_dlp/extractor/tvw.py b/yt_dlp/extractor/tvw.py index 9698f03f2..29e8ab5b5 100644 --- a/yt_dlp/extractor/tvw.py +++ b/yt_dlp/extractor/tvw.py @@ -75,6 +75,20 @@ class TvwIE(InfoExtractor): 'display_id': 'washington-to-washington-a-new-space-race-2022041111', 'categories': ['Washington to Washington', 'General Interest'], }, + }, { + 'url': 'https://tvw.org/watch?eventID=2025041235', + 'md5': '7d697c02f110b37d6a47622ea608ca90', + 'info_dict': { + 'id': '2025041235', + 'ext': 'mp4', + 'title': 'Legislative Review -- April 18', + 'thumbnail': r're:^https?://.*\.(?:jpe?g|png)$', + 'description': 'Legislative Review features highlights from Friday\'s legislative activity (4/18/25).', + 'timestamp': 1745006400, + 'upload_date': '20250418', + 'location': 'Hayner Media Center', + 'categories': ['Legislative Review'], + }, }] def _real_extract(self, url): From 744d239dd70a4a3e8be6884fecabe50bee47867f Mon Sep 17 00:00:00 2001 From: Fries Date: Sun, 11 May 2025 16:03:12 -0700 Subject: [PATCH 4/4] [ie/tvw] Make updates from suggestions --- yt_dlp/extractor/tvw.py | 59 ++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/yt_dlp/extractor/tvw.py b/yt_dlp/extractor/tvw.py index 29e8ab5b5..6cd3c715d 100644 --- a/yt_dlp/extractor/tvw.py +++ b/yt_dlp/extractor/tvw.py @@ -1,6 +1,6 @@ import json -from .common import InfoExtractor +from .common import ExtractorError, HTTPError, InfoExtractor from ..utils import ( clean_html, extract_attributes, @@ -13,9 +13,28 @@ from ..utils.traversal import find_element, find_elements, traverse_obj -class TvwIE(InfoExtractor): +class TvwBaseIE(InfoExtractor): + def _download_tvw_webpage(self, url, video_id): + try: + return self._download_webpage(url, video_id, headers={ + # yt-dlp's default user-agents are too old and blocked by cloudflare + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0', + }) + except ExtractorError as e: + if not isinstance(e.cause, HTTPError) or e.cause.status != 403: + raise + self.report_warning('Got HTTP Error 403, retrying') + + # Retry with impersonation if hardcoded UA is insufficient to bypass cloudflare + return self._download_webpage(url, video_id, impersonate=True) + + +class TvwIE(TvwBaseIE): IE_NAME = 'tvw' - _VALID_URL = r'https?://(?:www\.)?tvw\.org/(?:video|watch)/?(?:\?eventID=)?(?P[^/?#]+)' + _VALID_URL = [ + r'https?://(?:www\.)?tvw\.org/video/(?P[^/?#]+)', + r'https?://(?:www\.)?tvw\.org/watch/?\?(?:[^#]+&)?eventID=(?P\d+)', + ] _TESTS = [{ 'url': 'https://tvw.org/video/billy-frank-jr-statue-maquette-unveiling-ceremony-2024011211/', 'md5': '9ceb94fe2bb7fd726f74f16356825703', @@ -93,10 +112,7 @@ class TvwIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - # Use a newer user agent as the default yt-dlp one triggers the Cloudflare anti-bot challenge - webpage = self._download_webpage(url, display_id, headers={ - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0', - }) + webpage = self._download_tvw_webpage(url, display_id) client_id = self._html_search_meta('clientID', webpage, fatal=True) video_id = self._html_search_meta('eventID', webpage, fatal=True) @@ -142,8 +158,8 @@ def _real_extract(self, url): } -class TvwNewsIE(InfoExtractor): - IE_NAME = 'tvw:News' +class TvwNewsIE(TvwBaseIE): + IE_NAME = 'tvw:news' _VALID_URL = r'https?://(?:www\.)?tvw\.org/\d{4}/\d{2}/(?P[^/?#]+)' _TESTS = [{ 'url': 'https://tvw.org/2024/01/the-impact-issues-to-watch-in-the-2024-legislative-session/', @@ -161,33 +177,23 @@ class TvwNewsIE(InfoExtractor): 'description': 'md5:185f3a2350ef81e3fa159ac3e040a94b', }, 'playlist_count': 1, - }, { - 'url': 'https://tvw.org/2023/09/5th-annual-tvw-open-thank-you/', - 'info_dict': { - 'id': '5th-annual-tvw-open-thank-you', - 'title': '5th Annual TVW Open THANK YOU!', - 'description': 'md5:5306eef5b03c87108797cb6261c5f16c', - }, - 'playlist_count': 0, }] def _real_extract(self, url): playlist_id = self._match_id(url) - # Use a newer user agent as the default yt-dlp one triggers the Cloudflare anti-bot challenge - webpage = self._download_webpage(url, playlist_id, headers={ - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0', - }) + webpage = self._download_tvw_webpage(url, playlist_id) video_ids = traverse_obj(webpage, ( {find_elements(cls='invintus-player', html=True)}, ..., {extract_attributes}, 'data-eventid')) return self.playlist_from_matches( - (f'https://tvw.org/watch?eventID={video_id}' for video_id in video_ids), playlist_id, + video_ids, playlist_id, playlist_title=remove_end(self._og_search_title(webpage, default=None), ' - TVW'), - playlist_description=self._og_search_description(webpage, default=None), ie=TvwIE) + playlist_description=self._og_search_description(webpage, default=None), + getter=lambda x: f'https://tvw.org/watch?eventID={x}', ie=TvwIE) -class TvwTvChannelsIE(InfoExtractor): +class TvwTvChannelsIE(TvwBaseIE): IE_NAME = 'tvw:tvchannels' _VALID_URL = r'https?://(?:www\.)?tvw\.org/tvchannels/(?P[^/?#]+)' _TESTS = [{ @@ -212,10 +218,7 @@ class TvwTvChannelsIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - # Use a newer user agent as the default yt-dlp one triggers the Cloudflare anti-bot challenge - webpage = self._download_webpage(url, video_id, headers={ - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0', - }) + webpage = self._download_tvw_webpage(url, video_id) m3u8_url = traverse_obj(webpage, ( {find_element(id='invintus-persistent-stream-frame', html=True)}, {extract_attributes},