fix: replace custom regexes with yt-dlp HTML helpers (for reliability)

2025-07-12 16:28:31 +00:00 · 2025-04-07 13:51:39 +04:00 · 2025-04-07 13:51:39 +04:00 · cdbfe3a793
commit cdbfe3a793
parent 70e0c591be
1 changed file with 21 additions and 19 deletions
--- a/yt_dlp/extractor/vk.py
+++ b/yt_dlp/extractor/vk.py
@ -13,6 +13,7 @@
    ExtractorError,
    UserNotLive,
    clean_html,
    extract_attributes,
    get_element_by_class,
    get_element_html_by_class,
    get_element_html_by_id,
@ -1083,6 +1084,7 @@ def _real_extract(self, url):
            playlist_id)
        del hash_in_url
        # to remove big scripts and other elements not used by parser
        html = get_element_html_by_class('AudioPlaylistSnippet', webpage)
        del webpage
@ -1105,29 +1107,29 @@ def _real_extract(self, url):
            entries.append(self.url_result(
                audio_url, VKMusicTrackIE, track_id, title, **info))
-        title = self._html_search_regex(
+        header = get_element_html_by_class('AudioPlaylistSnippet__header', html)
            r'class="[^"]*AudioPlaylistSnippet__title--main[^"]*"[^>]*>([^<]+)',
            html, 'playlist title', fatal=False, group=1)
-        artist = self._html_search_regex(
+        title = clean_html(get_element_by_class('AudioPlaylistSnippet__title', header))
-            r'class="[^"]*AudioPlaylistSnippet__author[^"]*"[^>]*>\s*<a(?:\s[^>]*)?>([^<]+)',
+        artist = clean_html(get_element_by_class('AudioPlaylistSnippet__author', header))
            html, 'playlist author', fatal=False, group=1)
-        description = clean_html(get_element_by_class(
+        info_text = clean_html(get_element_by_class('AudioPlaylistSnippet__info', header))
-            'AudioPlaylistSnippet__description', html))
+        info_sep = info_text.find('·')
        # description = self._html_search_regex(
        #     r'div\s[^>]*class="[^"]*AudioPlaylistSnippet__description[^"]*">??????',
        #     html, 'playlist description', fatal=False, group=1)
        genre, year = self._html_search_regex(
            r'class="[^"]*AudioPlaylistSnippet__info[^"]*"[^>]*>\s*(.+)&nbsp;.*;(\d+)\s*</',
            html, 'genre and release year', default=(None, None), group=(1, 2))
        year = int_or_none(info_text[info_sep + 1:]) if info_sep != -1 else None
        is_album = year is not None
        genre = info_text[:info_sep].rstrip() if is_album else None
-        thumbnail = url_or_none(self._html_search_regex(
+        del header
-            r'class="[^"]*AudioPlaylistSnippet__cover[^"]*"[^>]*style="background-image\s*:\s*url\s*\(\s*\'([^\']+)',
+
-            html, 'playlist thumbnail', fatal=False, group=1))
+        description = clean_html(get_element_by_class('AudioPlaylistSnippet__description', html))
        thumbnail = url_or_none(self._search_regex(
            r'background[^:;]*:\s*url\s*\(\s*\'([^\']+)',
            extract_attributes(
                get_element_html_by_class(
                    'AudioPlaylistSnippet__cover',
                    html)).get('style'),
            'playlist thumbnail', fatal=False, group=1))
        return self.playlist_result(
            entries, playlist_id,
@ -1138,7 +1140,7 @@ def _real_extract(self, url):
            artists=[artist] if is_album else None,
            thumbnails=[{'url': thumbnail}] if thumbnail else [],
            genres=[genre] if genre else None,
-            release_year=int_or_none(year))
+            release_year=year)
 class VKPlayBaseIE(InfoExtractor):