From d6f717c1e8ebf208ca1c6c29f8fa69e290825748 Mon Sep 17 00:00:00 2001 From: Nikolay Fedorov <40500428+swayll@users.noreply.github.com> Date: Thu, 19 Jun 2025 23:20:12 +0300 Subject: [PATCH 1/8] [thehighwire.com] Create extractor's file --- yt_dlp/extractor/thehighwire.py | 42 +++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 yt_dlp/extractor/thehighwire.py diff --git a/yt_dlp/extractor/thehighwire.py b/yt_dlp/extractor/thehighwire.py new file mode 100644 index 000000000..ffd86f91e --- /dev/null +++ b/yt_dlp/extractor/thehighwire.py @@ -0,0 +1,42 @@ +from .common import InfoExtractor + +class TheHighWireIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?thehighwire\.com/ark-videos/(?P<id>[^/?#]+)' + _EMBED_URL = 'https://app.arkengine.com/embed/{id}' + _TESTS = [{ + 'url': 'https://thehighwire.com/ark-videos/the-deposition-of-stanley-plotkin/', + 'info_dict': { + 'id': 'clllgcra301z4ik01x8cwhfu2', + 'title': 'THE DEPOSITION OF STANLEY PLOTKIN', + 'ext': 'mp4', + }, + 'params': {'skip_download': True}, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + iframe_url = self._search_regex( + r'<iframe[^>]+src=["\'](https?://app\.arkengine\.com/embed/[^"\']+)', + webpage, 'iframe URL') + video_id = self._search_regex( + r'embed/([a-zA-Z0-9]+)', iframe_url, 'video ID') + + player_page = self._download_webpage( + self._EMBED_URL.format(id=video_id), video_id, + note='Downloading player page') + + m3u8_url = self._search_regex( + r']+src=["\']([^"\']+\.m3u8)', + player_page, 'm3u8 URL') + + title = self._og_search_title(webpage, default=None) or self._html_search_meta( + 'og:title', webpage, 'title', default=video_id) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'formats': self._extract_m3u8_formats(m3u8_url, video_id, 'mp4'), + } From f9f0c63fbce9ad72e2e369a745d811a83b7519a2 Mon Sep 17 00:00:00 2001 From: Nikolay Fedorov 
<40500428+swayll@users.noreply.github.com> Date: Thu, 19 Jun 2025 23:45:35 +0300 Subject: [PATCH 2/8] Added TheHighWireIE extractor to _extractors.py --- yt_dlp/extractor/_extractors.py | 1 + 1 file changed, 1 insertion(+) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 34c98b537..74d471058 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2100,6 +2100,7 @@ TheGuardianPodcastIE, TheGuardianPodcastPlaylistIE, ) +from .thehighwire import TheHighWireIE from .theholetv import TheHoleTvIE from .theintercept import TheInterceptIE from .theplatform import ( From 4ffba17e1a750d6be030ee4ea7b9d132f2ca0e74 Mon Sep 17 00:00:00 2001 From: Nikolay Fedorov <40500428+swayll@users.noreply.github.com> Date: Fri, 20 Jun 2025 23:02:43 +0300 Subject: [PATCH 3/8] A simpler and more compact way to get embed_url and embed_page at yt_dlp/extractor/thehighwire.py Co-authored-by: doe1080 <98906116+doe1080@users.noreply.github.com> --- yt_dlp/extractor/thehighwire.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/yt_dlp/extractor/thehighwire.py b/yt_dlp/extractor/thehighwire.py index ffd86f91e..9691910f8 100644 --- a/yt_dlp/extractor/thehighwire.py +++ b/yt_dlp/extractor/thehighwire.py @@ -17,15 +17,10 @@ def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - iframe_url = self._search_regex( - r'<iframe[^>]+src=["\'](https?://app\.arkengine\.com/embed/[^"\']+)', - webpage, 'iframe URL') - video_id = self._search_regex( - r'embed/([a-zA-Z0-9]+)', iframe_url, 'video ID') - - player_page = self._download_webpage( - self._EMBED_URL.format(id=video_id), video_id, - note='Downloading player page') + embed_url = traverse_obj(webpage, ( + {find_element(cls='ark-video-embed', html=True)}, + {extract_attributes}, 'src', {url_or_none}, {require('embed URL')})) + embed_page = self._download_webpage(embed_url, display_id) m3u8_url = 
self._search_regex( r']+src=["\']([^"\']+\.m3u8)', From 41a8d75c0ae652851b66437ca51f27dc44090c1b Mon Sep 17 00:00:00 2001 From: Nikolay Fedorov <40500428+swayll@users.noreply.github.com> Date: Fri, 20 Jun 2025 23:04:22 +0300 Subject: [PATCH 4/8] Use _parse_html5_media_entries for a more convenient and compact way of parsing html page at yt_dlp/extractor/thehighwire.py Co-authored-by: doe1080 <98906116+doe1080@users.noreply.github.com> --- yt_dlp/extractor/thehighwire.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/yt_dlp/extractor/thehighwire.py b/yt_dlp/extractor/thehighwire.py index 9691910f8..d1346a894 100644 --- a/yt_dlp/extractor/thehighwire.py +++ b/yt_dlp/extractor/thehighwire.py @@ -22,16 +22,12 @@ def _real_extract(self, url): {extract_attributes}, 'src', {url_or_none}, {require('embed URL')})) embed_page = self._download_webpage(embed_url, display_id) - m3u8_url = self._search_regex( - r']+src=["\']([^"\']+\.m3u8)', - player_page, 'm3u8 URL') - - title = self._og_search_title(webpage, default=None) or self._html_search_meta( - 'og:title', webpage, 'title', default=video_id) - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'formats': self._extract_m3u8_formats(m3u8_url, video_id, 'mp4'), + 'id': display_id, + **traverse_obj(webpage, { + 'title': ({find_element(cls='section-header')}, {clean_html}), + 'description': ({find_element(cls='episode-description__copy')}, {clean_html}), + }), + **self._parse_html5_media_entries(embed_url, embed_page, display_id, m3u8_id='hls')[0], } + From bb2fc0b4940ce1b5ac1d2b8639fc52af6644039c Mon Sep 17 00:00:00 2001 From: Nikolay Fedorov <40500428+swayll@users.noreply.github.com> Date: Fri, 20 Jun 2025 23:06:13 +0300 Subject: [PATCH 5/8] Added some utils at yt_dlp/extractor/thehighwire.py Co-authored-by: doe1080 <98906116+doe1080@users.noreply.github.com> --- yt_dlp/extractor/thehighwire.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git 
a/yt_dlp/extractor/thehighwire.py b/yt_dlp/extractor/thehighwire.py index d1346a894..8575b0c23 100644 --- a/yt_dlp/extractor/thehighwire.py +++ b/yt_dlp/extractor/thehighwire.py @@ -1,4 +1,15 @@ from .common import InfoExtractor +from ..utils import ( + clean_html, + extract_attributes, + url_or_none, +) +from ..utils.traversal import ( + find_element, + require, + traverse_obj, +) + class TheHighWireIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?thehighwire\.com/ark-videos/(?P<id>[^/?#]+)' From f2b38d7c38ff8dc3a4e5de7b962357d8b977d579 Mon Sep 17 00:00:00 2001 From: Nikolay Fedorov <40500428+swayll@users.noreply.github.com> Date: Fri, 20 Jun 2025 23:07:13 +0300 Subject: [PATCH 6/8] Removed unneeded _EMBED_URL from yt_dlp/extractor/thehighwire.py Co-authored-by: doe1080 <98906116+doe1080@users.noreply.github.com> --- yt_dlp/extractor/thehighwire.py | 1 - 1 file changed, 1 deletion(-) diff --git a/yt_dlp/extractor/thehighwire.py b/yt_dlp/extractor/thehighwire.py index 8575b0c23..6210af9b5 100644 --- a/yt_dlp/extractor/thehighwire.py +++ b/yt_dlp/extractor/thehighwire.py @@ -13,7 +13,6 @@ class TheHighWireIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?thehighwire\.com/ark-videos/(?P<id>[^/?#]+)' - _EMBED_URL = 'https://app.arkengine.com/embed/{id}' _TESTS = [{ 'url': 'https://thehighwire.com/ark-videos/the-deposition-of-stanley-plotkin/', 'info_dict': { From 156a0a26a652c2870d717dda531aa4703c4e4bb7 Mon Sep 17 00:00:00 2001 From: Nikolay Fedorov <40500428+swayll@users.noreply.github.com> Date: Fri, 20 Jun 2025 23:08:00 +0300 Subject: [PATCH 7/8] Some fixes for _TESTS at yt_dlp/extractor/thehighwire.py Co-authored-by: doe1080 <98906116+doe1080@users.noreply.github.com> --- yt_dlp/extractor/thehighwire.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/thehighwire.py b/yt_dlp/extractor/thehighwire.py index 6210af9b5..009e747e6 100644 --- a/yt_dlp/extractor/thehighwire.py +++ b/yt_dlp/extractor/thehighwire.py @@ -16,11 +16,12 
@@ class TheHighWireIE(InfoExtractor): _TESTS = [{ 'url': 'https://thehighwire.com/ark-videos/the-deposition-of-stanley-plotkin/', 'info_dict': { - 'id': 'clllgcra301z4ik01x8cwhfu2', - 'title': 'THE DEPOSITION OF STANLEY PLOTKIN', + 'id': 'the-deposition-of-stanley-plotkin', 'ext': 'mp4', + 'title': 'THE DEPOSITION OF STANLEY PLOTKIN', + 'description': 'md5:6d0be4f1181daaa10430fd8b945a5e54', + 'thumbnail': r're:https?://static\.arkengine\.com/video/.+\.jpg', }, - 'params': {'skip_download': True}, }] def _real_extract(self, url): From 5cbade42570706ba032173d0e893740a128ed9d7 Mon Sep 17 00:00:00 2001 From: Nikolay Fedorov <40500428+swayll@users.noreply.github.com> Date: Sat, 21 Jun 2025 00:54:39 +0300 Subject: [PATCH 8/8] Remove extra blank line at yt_dlp/extractor/thehighwire.py Co-authored-by: doe1080 <98906116+doe1080@users.noreply.github.com> --- yt_dlp/extractor/thehighwire.py | 1 - 1 file changed, 1 deletion(-) diff --git a/yt_dlp/extractor/thehighwire.py b/yt_dlp/extractor/thehighwire.py index 009e747e6..8b596143f 100644 --- a/yt_dlp/extractor/thehighwire.py +++ b/yt_dlp/extractor/thehighwire.py @@ -41,4 +41,3 @@ def _real_extract(self, url): }), **self._parse_html5_media_entries(embed_url, embed_page, display_id, m3u8_id='hls')[0], } -