mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2025-06-28 09:28:33 +00:00
feat: rewrite playlist parser for html webpage, add test
This commit is contained in:
parent
8df9e488ee
commit
44ed9058e9
@ -14,6 +14,7 @@
|
|||||||
UserNotLive,
|
UserNotLive,
|
||||||
clean_html,
|
clean_html,
|
||||||
get_element_by_class,
|
get_element_by_class,
|
||||||
|
get_element_html_by_class,
|
||||||
get_element_html_by_id,
|
get_element_html_by_id,
|
||||||
int_or_none,
|
int_or_none,
|
||||||
join_nonempty,
|
join_nonempty,
|
||||||
@ -1028,16 +1029,12 @@ class VKMusicPlaylistIE(VKMusicBaseIE):
|
|||||||
'info_dict': {
|
'info_dict': {
|
||||||
'id': '-2000984503_984503',
|
'id': '-2000984503_984503',
|
||||||
'title': 'Linkin Park - One More Light',
|
'title': 'Linkin Park - One More Light',
|
||||||
'description': '',
|
|
||||||
'album': 'One More Light',
|
'album': 'One More Light',
|
||||||
'uploader': 'Linkin Park',
|
'uploader': 'Linkin Park',
|
||||||
'artists': ['Linkin Park'],
|
'artists': ['Linkin Park'],
|
||||||
'thumbnails': [{'url': r're:https?://.*\.jpg'}],
|
'thumbnails': [{'url': r're:https?://.*\.jpg'}],
|
||||||
'genres': ['Alternative'],
|
'genres': ['Alternative'],
|
||||||
'release_year': 2017,
|
'release_year': 2017,
|
||||||
'modified_timestamp': int,
|
|
||||||
'modified_date': str,
|
|
||||||
'view_count': int,
|
|
||||||
},
|
},
|
||||||
'playlist_count': 10,
|
'playlist_count': 10,
|
||||||
'params': {
|
'params': {
|
||||||
@ -1052,72 +1049,96 @@ class VKMusicPlaylistIE(VKMusicBaseIE):
|
|||||||
'description': 'md5:6d652551bb1faaddbcd46321a77fa8d0',
|
'description': 'md5:6d652551bb1faaddbcd46321a77fa8d0',
|
||||||
'uploader': 'VK Музыка',
|
'uploader': 'VK Музыка',
|
||||||
'thumbnails': [{'url': r're:https?://.*\.jpg'}],
|
'thumbnails': [{'url': r're:https?://.*\.jpg'}],
|
||||||
'modified_timestamp': int,
|
|
||||||
'modified_date': str,
|
|
||||||
'view_count': int,
|
|
||||||
},
|
},
|
||||||
'playlist_count': 18,
|
'playlist_count': 18,
|
||||||
'params': {
|
'params': {
|
||||||
'skip_download': True,
|
'skip_download': True,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
'url': 'https://vk.com/music/playlist/-147845620_2206_876b2b81e867a433c2',
|
||||||
|
'info_dict': {
|
||||||
|
'id': '-147845620_2206',
|
||||||
|
'title': 'Электроника: новинки',
|
||||||
|
'description': 're:^Актуальные новинки электронной музыки',
|
||||||
|
'uploader': 'VK Музыка',
|
||||||
|
'thumbnails': [{'url': r're:https?://.*\.jpg'}],
|
||||||
|
},
|
||||||
|
'playlist_mincount': 100,
|
||||||
|
'params': {
|
||||||
|
'extract_flat': True, # DO NOT process 150+ entries
|
||||||
|
'skip_download': True,
|
||||||
|
},
|
||||||
|
},
|
||||||
]
|
]
|
||||||
|
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
mobj = self._match_valid_url(url)
|
mobj = self._match_valid_url(url)
|
||||||
playlist_id = mobj.group('full_id')
|
playlist_id = mobj.group('full_id')
|
||||||
|
access_hash = mobj.group('hash')
|
||||||
|
|
||||||
meta = self._download_payload('al_audio', playlist_id, {
|
hash_in_url = f'_{access_hash}' if access_hash else ''
|
||||||
'act': 'load_section',
|
webpage = self._download_webpage(
|
||||||
'access_hash': mobj.group('hash') or '',
|
f'https://vk.com/music/album/{playlist_id}{hash_in_url}',
|
||||||
'claim': '0',
|
playlist_id)
|
||||||
'context': '',
|
del hash_in_url
|
||||||
'from_id': self._parse_vk_id(),
|
|
||||||
'is_loading_all': '1',
|
html = get_element_html_by_class('AudioPlaylistSnippet', webpage)
|
||||||
'is_preload': '0',
|
del webpage
|
||||||
'offset': '0',
|
|
||||||
'owner_id': mobj.group('oid'),
|
|
||||||
'playlist_id': mobj.group('id'),
|
|
||||||
'ref': '',
|
|
||||||
'type': 'playlist',
|
|
||||||
})[0]
|
|
||||||
tracks = meta['list']
|
|
||||||
|
|
||||||
entries = []
|
entries = []
|
||||||
for ent in tracks:
|
|
||||||
info = self._parse_track_meta(ent)
|
for mobj in re.finditer(r'data-audio="([^"]+)', html):
|
||||||
|
meta = self._parse_json(
|
||||||
|
unescapeHTML(mobj.group(1)),
|
||||||
|
playlist_id, fatal=False)
|
||||||
|
if not meta:
|
||||||
|
continue
|
||||||
|
|
||||||
|
info = self._parse_track_meta(meta)
|
||||||
track_id = info.pop('id')
|
track_id = info.pop('id')
|
||||||
title = info.pop('title')
|
title = info.pop('title')
|
||||||
|
|
||||||
ent_hash = f'_{ent[24]}' if len(ent) >= 24 and ent[24] else ''
|
hash_in_url = f'_{meta[24]}' if len(meta) >= 24 and meta[24] else ''
|
||||||
audio_url = f'https://vk.com/audio{track_id}{ent_hash}'
|
audio_url = f'https://vk.com/audio{track_id}{hash_in_url}'
|
||||||
|
|
||||||
entries.append(self.url_result(
|
entries.append(self.url_result(
|
||||||
audio_url, VKMusicTrackIE, track_id, title, **info))
|
audio_url, VKMusicTrackIE, track_id, title, **info))
|
||||||
|
|
||||||
title = unescapeHTML(meta.get('title'))
|
title = self._html_search_regex(
|
||||||
artist = unescapeHTML(meta.get('authorName'))
|
r'class="[^"]*AudioPlaylistSnippet__title--main[^"]*"[^>]*>([^<]+)',
|
||||||
|
html, 'playlist title', fatal=False, group=1)
|
||||||
|
|
||||||
|
artist = self._html_search_regex(
|
||||||
|
r'class="[^"]*AudioPlaylistSnippet__author[^"]*"[^>]*>\s*<a(?:\s[^>]*)?>([^<]+)',
|
||||||
|
html, 'playlist author', fatal=False, group=1)
|
||||||
|
|
||||||
|
description = clean_html(get_element_by_class(
|
||||||
|
'AudioPlaylistSnippet__description', html))
|
||||||
|
# description = self._html_search_regex(
|
||||||
|
# r'div\s[^>]*class="[^"]*AudioPlaylistSnippet__description[^"]*">??????',
|
||||||
|
# html, 'playlist description', fatal=False, group=1)
|
||||||
|
|
||||||
|
genre, year = self._html_search_regex(
|
||||||
|
r'class="[^"]*AudioPlaylistSnippet__info[^"]*"[^>]*>\s*(.+) .*;(\d+)\s*</',
|
||||||
|
html, 'genre and release year', default=(None, None), group=(1, 2))
|
||||||
|
|
||||||
genre, year = self._search_regex(
|
|
||||||
r'^([^<]+)<\s*span[^>]*>[^<]*</\s*span\s*>(\d+)$',
|
|
||||||
meta.get('infoLine1'), 'genre and release year',
|
|
||||||
default=(None, None), fatal=False, group=(1, 2))
|
|
||||||
is_album = year is not None
|
is_album = year is not None
|
||||||
|
|
||||||
thumbnail = url_or_none(meta.get('coverUrl'))
|
thumbnail = url_or_none(self._html_search_regex(
|
||||||
|
r'class="[^"]*AudioPlaylistSnippet__cover[^"]*"[^>]*style="background-image\s*:\s*url\s*\(\s*\'([^\']+)',
|
||||||
|
html, 'playlist thumbnail', fatal=False, group=1))
|
||||||
|
|
||||||
return self.playlist_result(
|
return self.playlist_result(
|
||||||
entries, playlist_id,
|
entries, playlist_id,
|
||||||
join_nonempty(artist, title, delim=' - ') if is_album else title,
|
join_nonempty(artist, title, delim=' - ') if is_album else title,
|
||||||
unescapeHTML(meta.get('rawDescription')),
|
description,
|
||||||
album=title if is_album else None,
|
album=title if is_album else None,
|
||||||
uploader=artist,
|
uploader=artist,
|
||||||
artists=[artist] if is_album else None,
|
artists=[artist] if is_album else None,
|
||||||
thumbnails=[{'url': thumbnail}] if thumbnail else [],
|
thumbnails=[{'url': thumbnail}] if thumbnail else [],
|
||||||
genres=[unescapeHTML(genre)] if genre else None,
|
genres=[genre] if genre else None,
|
||||||
release_year=int_or_none(year),
|
release_year=int_or_none(year))
|
||||||
modified_timestamp=int_or_none(meta.get('lastUpdated')),
|
|
||||||
view_count=int_or_none(meta.get('listens')))
|
|
||||||
|
|
||||||
|
|
||||||
class VKPlayBaseIE(InfoExtractor):
|
class VKPlayBaseIE(InfoExtractor):
|
||||||
|
Loading…
Reference in New Issue
Block a user