From cdbfe3a79311078ec73753763b34570332448f06 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Mon, 7 Apr 2025 13:51:39 +0400 Subject: [PATCH] fix: custom regexs -> yt-dlp html helpers (for reliability) --- yt_dlp/extractor/vk.py | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index dad19d994..06f7849e5 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -13,6 +13,7 @@ ExtractorError, UserNotLive, clean_html, + extract_attributes, get_element_by_class, get_element_html_by_class, get_element_html_by_id, @@ -1083,6 +1084,7 @@ def _real_extract(self, url): playlist_id) del hash_in_url + # to remove big scripts and other elements not used by parser html = get_element_html_by_class('AudioPlaylistSnippet', webpage) del webpage @@ -1105,29 +1107,29 @@ def _real_extract(self, url): entries.append(self.url_result( audio_url, VKMusicTrackIE, track_id, title, **info)) - title = self._html_search_regex( - r'class="[^"]*AudioPlaylistSnippet__title--main[^"]*"[^>]*>([^<]+)', - html, 'playlist title', fatal=False, group=1) + header = get_element_html_by_class('AudioPlaylistSnippet__header', html) - artist = self._html_search_regex( - r'class="[^"]*AudioPlaylistSnippet__author[^"]*"[^>]*>\s*]*)?>([^<]+)', - html, 'playlist author', fatal=False, group=1) + title = clean_html(get_element_by_class('AudioPlaylistSnippet__title', header)) + artist = clean_html(get_element_by_class('AudioPlaylistSnippet__author', header)) - description = clean_html(get_element_by_class( - 'AudioPlaylistSnippet__description', html)) - # description = self._html_search_regex( - # r'div\s[^>]*class="[^"]*AudioPlaylistSnippet__description[^"]*">??????', - # html, 'playlist description', fatal=False, group=1) - - genre, year = self._html_search_regex( - r'class="[^"]*AudioPlaylistSnippet__info[^"]*"[^>]*>\s*(.+) .*;(\d+)\s*]*style="background-image\s*:\s*url\s*\(\s*\'([^\']+)', - html, 'playlist thumbnail', fatal=False, group=1)) + del header + + description = clean_html(get_element_by_class('AudioPlaylistSnippet__description', html)) + + thumbnail = url_or_none(self._search_regex( + r'background[^:;]*:\s*url\s*\(\s*\'([^\']+)', + extract_attributes( + get_element_html_by_class( + 'AudioPlaylistSnippet__cover', + html)).get('style'), + 'playlist thumbnail', fatal=False, group=1)) return self.playlist_result( entries, playlist_id, @@ -1138,7 +1140,7 @@ def _real_extract(self, url): artists=[artist] if is_album else None, thumbnails=[{'url': thumbnail}] if thumbnail else [], genres=[genre] if genre else None, - release_year=int_or_none(year)) + release_year=year) class VKPlayBaseIE(InfoExtractor):