1
0
mirror of https://github.com/yt-dlp/yt-dlp.git synced 2025-07-12 16:28:31 +00:00

fix: custom regexes -> yt-dlp HTML helpers (for reliability)

This commit is contained in:
DarkCat09 2025-04-07 13:51:39 +04:00
parent 70e0c591be
commit cdbfe3a793
No known key found for this signature in database

View File

@ -13,6 +13,7 @@
ExtractorError, ExtractorError,
UserNotLive, UserNotLive,
clean_html, clean_html,
extract_attributes,
get_element_by_class, get_element_by_class,
get_element_html_by_class, get_element_html_by_class,
get_element_html_by_id, get_element_html_by_id,
@ -1083,6 +1084,7 @@ def _real_extract(self, url):
playlist_id) playlist_id)
del hash_in_url del hash_in_url
# to remove big scripts and other elements not used by parser
html = get_element_html_by_class('AudioPlaylistSnippet', webpage) html = get_element_html_by_class('AudioPlaylistSnippet', webpage)
del webpage del webpage
@ -1105,29 +1107,29 @@ def _real_extract(self, url):
entries.append(self.url_result( entries.append(self.url_result(
audio_url, VKMusicTrackIE, track_id, title, **info)) audio_url, VKMusicTrackIE, track_id, title, **info))
title = self._html_search_regex( header = get_element_html_by_class('AudioPlaylistSnippet__header', html)
r'class="[^"]*AudioPlaylistSnippet__title--main[^"]*"[^>]*>([^<]+)',
html, 'playlist title', fatal=False, group=1)
artist = self._html_search_regex( title = clean_html(get_element_by_class('AudioPlaylistSnippet__title', header))
r'class="[^"]*AudioPlaylistSnippet__author[^"]*"[^>]*>\s*<a(?:\s[^>]*)?>([^<]+)', artist = clean_html(get_element_by_class('AudioPlaylistSnippet__author', header))
html, 'playlist author', fatal=False, group=1)
description = clean_html(get_element_by_class( info_text = clean_html(get_element_by_class('AudioPlaylistSnippet__info', header))
'AudioPlaylistSnippet__description', html)) info_sep = info_text.find('·')
# description = self._html_search_regex(
# r'div\s[^>]*class="[^"]*AudioPlaylistSnippet__description[^"]*">??????',
# html, 'playlist description', fatal=False, group=1)
genre, year = self._html_search_regex(
r'class="[^"]*AudioPlaylistSnippet__info[^"]*"[^>]*>\s*(.+)&nbsp;.*;(\d+)\s*</',
html, 'genre and release year', default=(None, None), group=(1, 2))
year = int_or_none(info_text[info_sep + 1:]) if info_sep != -1 else None
is_album = year is not None is_album = year is not None
genre = info_text[:info_sep].rstrip() if is_album else None
thumbnail = url_or_none(self._html_search_regex( del header
r'class="[^"]*AudioPlaylistSnippet__cover[^"]*"[^>]*style="background-image\s*:\s*url\s*\(\s*\'([^\']+)',
html, 'playlist thumbnail', fatal=False, group=1)) description = clean_html(get_element_by_class('AudioPlaylistSnippet__description', html))
thumbnail = url_or_none(self._search_regex(
r'background[^:;]*:\s*url\s*\(\s*\'([^\']+)',
extract_attributes(
get_element_html_by_class(
'AudioPlaylistSnippet__cover',
html)).get('style'),
'playlist thumbnail', fatal=False, group=1))
return self.playlist_result( return self.playlist_result(
entries, playlist_id, entries, playlist_id,
@ -1138,7 +1140,7 @@ def _real_extract(self, url):
artists=[artist] if is_album else None, artists=[artist] if is_album else None,
thumbnails=[{'url': thumbnail}] if thumbnail else [], thumbnails=[{'url': thumbnail}] if thumbnail else [],
genres=[genre] if genre else None, genres=[genre] if genre else None,
release_year=int_or_none(year)) release_year=year)
class VKPlayBaseIE(InfoExtractor): class VKPlayBaseIE(InfoExtractor):