1
0
mirror of https://github.com/yt-dlp/yt-dlp.git synced 2025-06-28 09:28:33 +00:00

refactor: split into base class, track ie, playlist ie

This commit is contained in:
DarkCat09 2025-04-04 16:56:46 +04:00
parent 705bb0ba5f
commit cd282aae39
No known key found for this signature in database

View File

@ -756,11 +756,102 @@ def _real_extract(self, url):
clean_html(get_element_by_class('wall_post_text', webpage))) clean_html(get_element_by_class('wall_post_text', webpage)))
class VKMusicIE(VKBaseIE): class VKMusicBaseIE(VKBaseIE):
IE_NAME = 'vk:music' _BASE64_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN0PQRSTUVWXYZO123456789+/='
# Debug and test on https://regexr.com/8dlot def _b64_decode(self, enc):
_VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/(?:audio(?P<track_id>-?\d+_\d+)|(?:.*[\?&](?:act|z)=audio_playlist|music/[a-z]+/)(?P<playlist_id>(?P<pl_oid>-?\d+)_(?P<pl_id>\d+)))(?:(?:%2F|_|[?&]access_hash=)(?P<access_hash>[0-9a-f]+))?' dec = ''
e = n = 0
for c in enc:
r = self._BASE64_CHARS.index(c)
cond = n % 4
e = 64 * e + r if cond else r
n += 1
if cond:
dec += chr(255 & e >> (-2 * n & 6))
return dec
def _unmask_url(self, mask_url, vk_id):
    """Recover the real URL from a masked audio URL.

    A URL that does not contain 'audio_api_unavailable' is returned
    unchanged. Otherwise the text after '?extra=' is split on '#':
    the first piece decodes (via self._b64_decode) to the scrambled URL
    characters, the second decodes to two chr(11)-separated fields, of
    which the second is an integer that is XORed with ``vk_id`` to seed
    the unscrambling.

    Raises IndexError/ValueError if the '?extra=' payload does not have
    that shape.
    """
    if 'audio_api_unavailable' not in mask_url:
        return mask_url

    parts = mask_url.split('?extra=')[1].split('#')
    # the first decoded field is not used, only the numeric seed
    _, seed = self._b64_decode(parts[1]).split(chr(11))
    chars = list(self._b64_decode(parts[0]))
    size = len(chars)

    # Build the swap table from the last position down to the first;
    # every entry depends on the previously computed one.
    swap_with = [None] * size
    cursor = int(seed) ^ vk_id
    for pos in reversed(range(size)):
        # `^` binds looser than `*` and `+`; parentheses spell that out
        cursor = ((size * (pos + 1)) ^ (cursor + pos)) % size
        swap_with[pos] = cursor

    # Undo the shuffle: position n trades places with table entry size-1-n
    for pos in range(1, size):
        other = swap_with[size - 1 - pos]
        chars[pos], chars[other] = chars[other], chars[pos]

    return ''.join(chars)
def _parse_track_meta(self, meta, track_id=None):
    """Build an info dict from a positional track metadata sequence.

    Fields are read from fixed positions of ``meta``: 0 and 1 (joined as
    f'{meta[1]}_{meta[0]}' for the id), 3 (title), 4 (artist string),
    5 (duration), 14 (thumbnail URL), 17 and 18 (artist entries carrying
    a 'name' key) and 30 (flags). A position beyond the end of ``meta``
    is treated as absent instead of raising IndexError.

    ``track_id`` is the fallback id used when positions 0/1 are missing
    or falsy.

    Returns a dict with 'id', 'title', 'track', 'uploader', 'artists',
    'duration' and 'thumbnails', plus 'age_limit' when the flags at
    position 30 are truthy.
    """
    len_ = len(meta)
    info = {}

    info['id'] = (
        f'{meta[1]}_{meta[0]}'
        if len_ >= 2 and meta[1] and meta[0]
        else track_id)

    # meta[i] only exists when len_ > i, so every guard below is strict
    title = unescapeHTML(meta[3]) if len_ > 3 else None  # TODO: fallback
    artist = unescapeHTML(meta[4]) if len_ > 4 else None  # artists in one string, may include "feat."

    info['title'] = join_nonempty(artist, title, delim=' - ')
    info['track'] = title
    info['uploader'] = artist

    # artists as list
    info['artists'] = (
        # not htmlescaped unlike meta[4]
        # `or ()` keeps a falsy (missing/empty) artist list from breaking the unpacking
        traverse_obj((*(meta[17] or ()), *(meta[18] or ())), (..., 'name'))
        if len_ > 18 else None
    ) or [artist]

    info['duration'] = int_or_none(meta[5]) if len_ > 5 else None
    info['thumbnails'] = [{'url': meta[14]}] if len_ > 14 else []

    # meta[30] is 2 bits
    # most significant: isExplicit
    # least significant: isForeignAgent
    # i. e.
    # 00 = safe
    # 01 = marked by RKN as "foreign agent"
    # 10 = explicit lyrics
    # 11 = both E lyrics and "foreign agent"
    # NOTE(review): any non-zero value sets the limit, including 01 - confirm intended
    if len_ > 30 and meta[30]:
        info['age_limit'] = 18

    return info
def _raise_if_blocked(self, meta, track_id):
    """Raise if the track metadata carries a claim that blocks playback.

    ``meta[12]`` is parsed as JSON (non-fatally) and the value at
    ('claim', 'reason') is inspected. When ``meta`` has no position 12
    nothing is parsed and the track is treated as not blocked.

    Raises:
        whatever self.raise_geo_restricted() raises when the reason is 'geo';
        ExtractorError for any other reason that is not None.
    """
    # meta[12] exists only when len(meta) > 12; skip the parser entirely
    # for short sequences instead of handing it None
    claim_data = (
        self._parse_json(meta[12], track_id, fatal=False)
        if len(meta) > 12 else None)
    reason = traverse_obj(claim_data, ('claim', 'reason'))

    if reason == 'geo':
        self.raise_geo_restricted()
    # can be an empty string
    elif reason is not None:
        # `!r` is the repr conversion; `:r` would be an invalid format
        # spec and raise ValueError before ExtractorError is built
        raise ExtractorError(
            'This track is unavailable. '
            f'Reason code: {reason!r}')
class VKMusicTrackIE(VKMusicBaseIE):
IE_NAME = 'vkmusic:track'
_VALID_URL = r'''(?x)
https?://
(?:(?:m|new)\.)?vk\.(?:com|ru)/
audio(?P<id>-?\d+_\d+)
(?:(?:%2F|_)(?P<hash>[0-9a-f]+))?
'''
_TESTS = [ _TESTS = [
{ {
@ -814,27 +905,6 @@ class VKMusicIE(VKBaseIE):
'skip_download': True, 'skip_download': True,
}, },
}, },
{
'url': 'https://vk.com/artist/linkinpark/releases?z=audio_playlist-2000984503_984503%2Fc468f3a862b6f73b55',
'info_dict': {
'id': '-2000984503_984503',
'title': 'Linkin Park - One More Light',
'description': '',
'album': 'One More Light',
'uploader': 'Linkin Park',
'artists': ['Linkin Park'],
'thumbnail': r're:https?://.*\.jpg',
'genres': ['Alternative'],
'release_year': 2017,
'modified_timestamp': int,
'modified_date': str,
'view_count': int,
},
'playlist_count': 10,
'params': {
'skip_download': True,
},
},
{ {
'note': 'special symbols in title and artist must be unescaped', 'note': 'special symbols in title and artist must be unescaped',
'url': 'https://vk.com/audio-2001069891_6069891', 'url': 'https://vk.com/audio-2001069891_6069891',
@ -852,84 +922,13 @@ class VKMusicIE(VKBaseIE):
'skip_download': True, 'skip_download': True,
}, },
}, },
{
'url': 'https://vk.com/audios877252112?block=playlists&section=general&z=audio_playlist-147845620_2390',
'info_dict': {
'id': '-147845620_2390',
'title': 'VK Fest 2024: Белая сцена',
'description': 'md5:6d652551bb1faaddbcd46321a77fa8d0',
'uploader': 'VK Музыка',
'thumbnail': r're:https?://.*\.jpg',
'modified_timestamp': int,
'modified_date': str,
'view_count': int,
},
'playlist_count': 18,
'params': {
'skip_download': True,
},
},
] ]
def _parse_track_meta(self, meta, track_id=None):
    """Build an info dict from a positional track metadata sequence.

    Fields are read from fixed positions of ``meta``: 0 and 1 (joined as
    f'{meta[1]}_{meta[0]}' for the id), 3 (title), 4 (artist string),
    5 (duration), 14 (thumbnail URL), 17 and 18 (artist entries carrying
    a 'name' key) and 30 (flags). A position beyond the end of ``meta``
    is treated as absent instead of raising IndexError.

    ``track_id`` is the fallback id used when positions 0/1 are missing
    or falsy.

    Returns a dict with 'id', 'title', 'track', 'uploader', 'artists',
    'duration' and 'thumbnails', plus 'age_limit' when the flags at
    position 30 are truthy.
    """
    len_ = len(meta)
    info = {}

    info['id'] = (
        f'{meta[1]}_{meta[0]}'
        if len_ >= 2 and meta[1] and meta[0]
        else track_id)

    # meta[i] only exists when len_ > i, so every guard below is strict
    title = unescapeHTML(meta[3]) if len_ > 3 else None  # TODO: fallback
    artist = unescapeHTML(meta[4]) if len_ > 4 else None  # artists in one string, may include "feat."

    info['title'] = join_nonempty(artist, title, delim=' - ')
    info['track'] = title
    info['uploader'] = artist

    # artists as list
    info['artists'] = (
        # not htmlescaped unlike meta[4]
        # `or ()` keeps a falsy (missing/empty) artist list from breaking the unpacking
        traverse_obj((*(meta[17] or ()), *(meta[18] or ())), (..., 'name'))
        if len_ > 18 else None
    ) or [artist]

    info['duration'] = int_or_none(meta[5]) if len_ > 5 else None
    info['thumbnails'] = [{'url': meta[14]}] if len_ > 14 else []

    # meta[30] is 2 bits
    # most significant: isExplicit
    # least significant: isForeignAgent
    # i. e.
    # 00 = safe
    # 01 = marked by RKN as "foreign agent"
    # 10 = explicit lyrics
    # 11 = both E lyrics and "foreign agent"
    # NOTE(review): any non-zero value sets the limit, including 01 - confirm intended
    if len_ > 30 and meta[30]:
        info['age_limit'] = 18

    return info
def _raise_if_blocked(self, meta, track_id):
    """Raise if the track metadata carries a claim that blocks playback.

    ``meta[12]`` is parsed as JSON (non-fatally) and the value at
    ('claim', 'reason') is inspected. When ``meta`` has no position 12
    nothing is parsed and the track is treated as not blocked.

    Raises:
        whatever self.raise_geo_restricted() raises when the reason is 'geo';
        ExtractorError for any other reason that is not None.
    """
    # meta[12] exists only when len(meta) > 12; skip the parser entirely
    # for short sequences instead of handing it None
    claim_data = (
        self._parse_json(meta[12], track_id, fatal=False)
        if len(meta) > 12 else None)
    reason = traverse_obj(claim_data, ('claim', 'reason'))

    if reason == 'geo':
        self.raise_geo_restricted()
    # can be an empty string
    elif reason is not None:
        # `!r` is the repr conversion; `:r` would be an invalid format
        # spec and raise ValueError before ExtractorError is built
        raise ExtractorError(
            'This track is unavailable. '
            f'Reason code: {reason!r}')
def _real_extract(self, url): def _real_extract(self, url):
mobj = self._match_valid_url(url) mobj = self._match_valid_url(url)
track_id = mobj.group('track_id') track_id = mobj.group('id')
playlist_id = mobj.group('playlist_id') access_hash = mobj.group('hash')
access_hash = mobj.group('access_hash')
if track_id:
if not access_hash: if not access_hash:
webpage = self._download_webpage(url, track_id) webpage = self._download_webpage(url, track_id)
@ -972,10 +971,8 @@ def _real_extract(self, url):
self.raise_login_required() self.raise_login_required()
meta = meta[0] meta = meta[0]
self._raise_if_blocked(meta, track_id) self._raise_if_blocked(meta, track_id)
url = self._unmask_url(meta[2], self._parse_vk_id())
url = _unmask_url(meta[2], self._parse_vk_id())
return { return {
**self._parse_track_meta(meta, track_id), **self._parse_track_meta(meta, track_id),
@ -988,35 +985,93 @@ def _real_extract(self, url):
}], }],
} }
elif playlist_id:
class VKMusicPlaylistIE(VKMusicBaseIE):
IE_NAME = 'vkmusic:playlist'
_VALID_URL = r'''(?x)
https?://
(?:(?:m|new)\.)?vk\.(?:com|ru)/
(?:
music/(?:album|playlist)/|
.*[?&](?:act|z)=audio_playlist
)
(?P<full_id>(?P<oid>-?\d+)_(?P<id>\d+))
(?:(?:%2F|_|[?&]access_hash=)(?P<hash>[0-9a-f]+))?
'''
_TESTS = [
{
'url': 'https://vk.com/artist/linkinpark/releases?z=audio_playlist-2000984503_984503%2Fc468f3a862b6f73b55',
'info_dict': {
'id': '-2000984503_984503',
'title': 'Linkin Park - One More Light',
'description': '',
'album': 'One More Light',
'uploader': 'Linkin Park',
'artists': ['Linkin Park'],
'thumbnail': r're:https?://.*\.jpg',
'genres': ['Alternative'],
'release_year': 2017,
'modified_timestamp': int,
'modified_date': str,
'view_count': int,
},
'playlist_count': 10,
'params': {
'skip_download': True,
},
},
{
'url': 'https://vk.com/audios877252112?block=playlists&section=general&z=audio_playlist-147845620_2390',
'info_dict': {
'id': '-147845620_2390',
'title': 'VK Fest 2024: Белая сцена',
'description': 'md5:6d652551bb1faaddbcd46321a77fa8d0',
'uploader': 'VK Музыка',
'thumbnail': r're:https?://.*\.jpg',
'modified_timestamp': int,
'modified_date': str,
'view_count': int,
},
'playlist_count': 18,
'params': {
'skip_download': True,
},
},
]
def _real_extract(self, url):
mobj = self._match_valid_url(url)
playlist_id = mobj.group('full_id')
meta = self._download_payload('al_audio', playlist_id, { meta = self._download_payload('al_audio', playlist_id, {
'act': 'load_section', 'act': 'load_section',
'access_hash': access_hash or '', 'access_hash': mobj.group('hash') or '',
'claim': '0', 'claim': '0',
'context': '', 'context': '',
'from_id': self._parse_vk_id(), 'from_id': self._parse_vk_id(),
'is_loading_all': '1', 'is_loading_all': '1',
'is_preload': '0', 'is_preload': '0',
'offset': '0', 'offset': '0',
'owner_id': mobj.group('pl_oid'), 'owner_id': mobj.group('oid'),
'playlist_id': mobj.group('pl_id'), 'playlist_id': mobj.group('id'),
'ref': '', 'ref': '',
'type': 'playlist', 'type': 'playlist',
})[0] })[0]
tracks = meta['list'] tracks = meta['list']
entries = [] entries = []
for ent in tracks: for ent in tracks:
info = self._parse_track_meta(ent) info = self._parse_track_meta(ent)
ent_access = f'_{ent[24]}' if len(ent) >= 24 and ent[24] else ''
track_id = info.pop('id') track_id = info.pop('id')
title = info.pop('title') title = info.pop('title')
audio_url = f'https://vk.com/audio{track_id}{ent_access}'
ent_hash = f'_{ent[24]}' if len(ent) >= 24 and ent[24] else ''
audio_url = f'https://vk.com/audio{track_id}{ent_hash}'
entries.append(self.url_result( entries.append(self.url_result(
audio_url, VKMusicIE, track_id, title, **info)) audio_url, VKMusicTrackIE, track_id, title, **info))
title = unescapeHTML(meta.get('title')) # TODO: fallback title = unescapeHTML(meta.get('title')) # TODO: fallback
artist = unescapeHTML(meta.get('authorName')) artist = unescapeHTML(meta.get('authorName'))
@ -1187,39 +1242,3 @@ def _real_extract(self, url):
**self._extract_common_meta(stream_info), **self._extract_common_meta(stream_info),
'formats': formats, 'formats': formats,
} }
# 65-character alphabet used by _b64_decode; note that '0' and 'O' sit in
# each other's usual places and '=' is a regular member (value 64).
_BASE64_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN0PQRSTUVWXYZO123456789+/='


def _b64_decode(enc):
    """Decode ``enc`` using the custom _BASE64_CHARS alphabet.

    Characters are consumed in groups of four, each contributing six
    bits; every character after the first of a group emits one output
    character taken from the low eight bits of the shifted accumulator.

    Returns the decoded text as a str.
    Raises ValueError if ``enc`` contains a character outside the alphabet.
    """
    decoded = []  # collected pieces, joined once at the end
    acc = 0
    for pos, char in enumerate(enc):
        value = _BASE64_CHARS.index(char)
        if pos % 4 == 0:
            # first character of a group only (re)starts the accumulator
            acc = value
            continue
        acc = 64 * acc + value
        # shift is 4, 2, 0 for the 2nd, 3rd, 4th character of a group;
        # `>>` binds tighter than `&`, so the mask applies after shifting
        decoded.append(chr(255 & acc >> (-2 * (pos + 1) & 6)))
    return ''.join(decoded)
def _unmask_url(mask_url, vk_id):
    """Recover the real URL from a masked audio URL.

    A URL that does not contain 'audio_api_unavailable' is returned
    unchanged. Otherwise the text after '?extra=' is split on '#':
    the first piece decodes (via _b64_decode) to the scrambled URL
    characters, the second decodes to two chr(11)-separated fields, of
    which the second is an integer that is XORed with ``vk_id`` to seed
    the unscrambling.

    Raises IndexError/ValueError if the '?extra=' payload does not have
    that shape.
    """
    if 'audio_api_unavailable' not in mask_url:
        return mask_url

    parts = mask_url.split('?extra=')[1].split('#')
    # the first decoded field is not used, only the numeric seed
    _, seed = _b64_decode(parts[1]).split(chr(11))
    chars = list(_b64_decode(parts[0]))
    size = len(chars)

    # Build the swap table from the last position down to the first;
    # every entry depends on the previously computed one.
    swap_with = [None] * size
    cursor = int(seed) ^ vk_id
    for pos in reversed(range(size)):
        # `^` binds looser than `*` and `+`; parentheses spell that out
        cursor = ((size * (pos + 1)) ^ (cursor + pos)) % size
        swap_with[pos] = cursor

    # Undo the shuffle: position n trades places with table entry size-1-n
    for pos in range(1, size):
        other = swap_with[size - 1 - pos]
        chars[pos], chars[other] = chars[other], chars[pos]

    return ''.join(chars)