From 66e6485bc9430ebed6c6f406d16c7e936338a447 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Fri, 21 Mar 2025 22:42:22 +0400 Subject: [PATCH 01/51] feat: implement vkmusic extractor for tracks --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/vk.py | 77 +++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 74a043b9c8..3c9299a922 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2382,6 +2382,7 @@ ) from .vk import ( VKIE, + VKMusicIE, VKPlayIE, VKPlayLiveIE, VKUserVideosIE, diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index faf3e60b0b..141200393e 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -13,6 +13,7 @@ ExtractorError, UserNotLive, clean_html, + extract_attributes, get_element_by_class, get_element_html_by_id, int_or_none, @@ -712,6 +713,9 @@ def _decode(self, enc): dec += chr(255 & e >> (-2 * n & 6)) return dec + # source: + # https://st7-20.vk.com/dist/web/chunks/common.bd7ad7e2.js + # search here for: e.split('?extra=') [1].split('#') def _unmask_url(self, mask_url, vk_id): if 'audio_api_unavailable' in mask_url: extra = mask_url.split('?extra=')[1].split('#') @@ -775,6 +779,79 @@ def _real_extract(self, url): clean_html(get_element_by_class('wall_post_text', webpage))) +class VKMusicIE(VKBaseIE): + IE_NAME = 'vk:music' + _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/(?:audio(?P-?\d+_\d+)|(?:.*\?z=audio_playlist|music/playlist/)(?P-?\d+_\d+))' + _TESTS = [ + { + 'url': 'https://vk.com/audio-2001746599_34746599', + 'info_dict': { + 'id': '-2001746599_34746599', + 'ext': 'm4a', + 'title': 'Skillet - Feel Invincible', + 'duration': 230, + 'uploader': 'Skillet', + 'artist': 'Skillet', + 'track': 'Feel Invincible', + }, + 'params': { + 'skip_download': True, + } + } + ] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + track_id = mobj.group('track_id') + playlist_id = mobj.group('playlist_id') + + sui = self._get_cookies('https://login.vk.com')['sui'].value + vk_id = re.match(r'^\d+', sui)[0] + + # TODO: playlist + + if track_id: + webpage = self._download_webpage(url, track_id) + data_exec = extract_attributes( + get_element_by_class('AudioPlayerBlock__root', webpage), + )['data-exec'] + + meta = self._parse_json(data_exec, track_id)['AudioPlayerBlock/init']['firstAudio'] + one_more_id = meta[24] + + del data_exec + del webpage + + track = self._download_payload('al_audio', track_id, { + 'act': 'reload_audios', + 'audio_ids': f'{track_id}_{one_more_id}' + }) + + meta = self._parse_json(track, track_id)[0][0] + url = VKWallPostIE()._unmask_url(meta[2], vk_id) + title = meta[3] + artist = meta[4] + thumbnail = meta[14] + + return { + 'id': track_id, + 'title': join_nonempty(artist, title, delim=' - '), + 'thumbnails': [thumbnail], + 'duration': int_or_none(meta[5]), + 'uploader': artist, # XXX: we don't have an uploader in player meta + 'artist': artist, + 'track': title, + 'formats': [{ + 'url': url, + # XXX: copied from VKWallPostIE._real_extract + 'ext': 'm4a', + 'vcodec': 'none', + 'acodec': 'mp3', + 'container': 'm4a_dash', + }], + } + + class VKPlayBaseIE(InfoExtractor): _BASE_URL_RE = r'https?://(?:vkplay\.live|live\.vk(?:play|video)\.ru)/' _RESOLUTIONS = { From 66994d979e3a42a84b942cf8540a2c115632206a Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Fri, 21 Mar 2025 23:01:23 +0400 Subject: [PATCH 02/51] refactor: move vk_id to base class, audio url decoder to module --- yt_dlp/extractor/vk.py | 80 ++++++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 39 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 141200393e..dbc6bada4c 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -74,6 +74,12 @@ def _perform_login(self, username, password): raise ExtractorError( 'Unable to login, incorrect username and/or password', expected=True) + def _parse_vk_id(self): + sui = self._get_cookies('https://login.vk.com')['sui'].value + # example of what `sui` cookie contains: + # 123456789%2CSaCxka2wNY7OZKE5QkmtVTxCxg6Ftgb-zVgNXvMVWQH + return re.match(r'^\d+', sui)[0] + def _download_payload(self, path, video_id, data, fatal=True): endpoint = f'https://vk.com/{path}.php' data['al'] = 1 @@ -698,43 +704,8 @@ class VKWallPostIE(VKBaseIE): 'url': 'https://m.vk.com/wall-23538238_35', 'only_matching': True, }] - _BASE64_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN0PQRSTUVWXYZO123456789+/=' _AUDIO = collections.namedtuple('Audio', ['id', 'owner_id', 'url', 'title', 'performer', 'duration', 'album_id', 'unk', 'author_link', 'lyrics', 'flags', 'context', 'extra', 'hashes', 'cover_url', 'ads']) - def _decode(self, enc): - dec = '' - e = n = 0 - for c in enc: - r = self._BASE64_CHARS.index(c) - cond = n % 4 - e = 64 * e + r if cond else r - n += 1 - if cond: - dec += chr(255 & e >> (-2 * n & 6)) - return dec - - # source: - # https://st7-20.vk.com/dist/web/chunks/common.bd7ad7e2.js - # search here for: e.split('?extra=') [1].split('#') - def _unmask_url(self, mask_url, vk_id): - if 'audio_api_unavailable' in mask_url: - extra = mask_url.split('?extra=')[1].split('#') - func, base = self._decode(extra[1]).split(chr(11)) - mask_url = list(self._decode(extra[0])) - url_len = len(mask_url) - indexes = [None] * url_len - index = int(base) ^ vk_id - for n in range(url_len - 1, -1, -1): - index = (url_len * (n + 1) ^ index + n) % url_len - indexes[n] = index - for n in range(1, url_len): - c = mask_url[n] - index = indexes[url_len - 1 - n] - mask_url[n] = mask_url[index] - mask_url[index] = c - mask_url = ''.join(mask_url) - return mask_url - def _real_extract(self, url): post_id = self._match_id(url) @@ -805,9 +776,6 @@ def _real_extract(self, url): track_id = mobj.group('track_id') playlist_id = mobj.group('playlist_id') - sui = self._get_cookies('https://login.vk.com')['sui'].value - vk_id = re.match(r'^\d+', sui)[0] - # TODO: playlist if track_id: @@ -828,7 +796,7 @@ def _real_extract(self, url): }) meta = self._parse_json(track, track_id)[0][0] - url = VKWallPostIE()._unmask_url(meta[2], vk_id) + url = _unmask_url(meta[2], self._parse_vk_id()) title = meta[3] artist = meta[4] thumbnail = meta[14] @@ -999,3 +967,37 @@ def _real_extract(self, url): **self._extract_common_meta(stream_info), 'formats': formats, } + + +_BASE64_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN0PQRSTUVWXYZO123456789+/=' + +def _b64_decode(enc): + dec = '' + e = n = 0 + for c in enc: + r = _BASE64_CHARS.index(c) + cond = n % 4 + e = 64 * e + r if cond else r + n += 1 + if cond: + dec += chr(255 & e >> (-2 * n & 6)) + return dec + +def _unmask_url(mask_url, vk_id): + if 'audio_api_unavailable' in mask_url: + extra = mask_url.split('?extra=')[1].split('#') + func, base = _b64_decode(extra[1]).split(chr(11)) + mask_url = list(_b64_decode(extra[0])) + url_len = len(mask_url) + indexes = [None] * url_len + index = int(base) ^ vk_id + for n in range(url_len - 1, -1, -1): + index = (url_len * (n + 1) ^ index + n) % url_len + indexes[n] = index + for n in range(1, url_len): + c = mask_url[n] + index = indexes[url_len - 1 - n] + mask_url[n] = mask_url[index] + mask_url[index] = c + mask_url = ''.join(mask_url) + return mask_url From 2d246669155ab6f03eb52a1293c534b2154a83d0 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Fri, 21 Mar 2025 23:50:30 +0400 Subject: [PATCH 03/51] feat: playlist extractor --- yt_dlp/extractor/vk.py | 51 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index dbc6bada4c..c9f6563482 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -776,8 +776,6 @@ def _real_extract(self, url): track_id = mobj.group('track_id') playlist_id = mobj.group('playlist_id') - # TODO: playlist - if track_id: webpage = self._download_webpage(url, track_id) data_exec = extract_attributes( @@ -819,6 +817,55 @@ def _real_extract(self, url): }], } + elif playlist_id: + playlist = self._download_payload('al_audio', playlist_id, { + 'act': 'load_section', + 'access_hash': '', # TODO: unnecessary, but it's better to parse from url if access_hash is present + 'claim': '0', + 'context': '', + 'from_id': self._parse_vk_id(), # TODO: or '0' + 'is_loading_all': '1', + 'is_preload': '0', + 'offset': '0', + 'owner_id': '', + 'playlist_id': '', + 'ref': '', + 'type': 'playlist', + }) + + meta = self._parse_json(playlist, playlist_id)[0] + tracks = meta['list'] + + entries = [] + for ent in tracks: + # XXX: repeating code + # meta-parsers for track and playlist items should be unified + + title = ent[3] + artist = ent[4] + + track_id = f'{ent[1]}_{ent[0]}' + audio_url = f'https://vk.com/audio{track_id}' + + entries.append(self.url_result( + audio_url, VKMusicIE, track_id, + join_nonempty(artist, title, delim=' - '), + track=title, artist=artist, uploader=artist, + duration=int_or_none(ent[5]), + thumbnails=[meta[14]] + )) + + artist = meta.get('authorName') + thumbnail = meta.get('coverUrl') + return self.playlist_result( + entries, playlist_id, + meta.get('title'), # TODO: maybe also "artist - title"? + meta.get('description'), + uploader=artist, artist=artist, + thumbnails=[thumbnail] if thumbnail else None, + # TODO: there are even more useful metadata + ) + class VKPlayBaseIE(InfoExtractor): _BASE_URL_RE = r'https?://(?:vkplay\.live|live\.vk(?:play|video)\.ru)/' From 9d2ea17fda915e2bfcf8092638992dde51dc4db0 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Sat, 22 Mar 2025 00:07:28 +0400 Subject: [PATCH 04/51] fix: when not authenticated, html is different --- yt_dlp/extractor/vk.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index c9f6563482..a7d24e4b24 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -778,14 +778,14 @@ def _real_extract(self, url): if track_id: webpage = self._download_webpage(url, track_id) - data_exec = extract_attributes( - get_element_by_class('AudioPlayerBlock__root', webpage), - )['data-exec'] - meta = self._parse_json(data_exec, track_id)['AudioPlayerBlock/init']['firstAudio'] + # copied regex from VKWallPostIE + # XXX: common code should be unified, moved to a class + data_audio = re.search(r'data-audio="([^"]+)', webpage)[1] + meta = self._parse_json(unescapeHTML(data_audio), track_id) one_more_id = meta[24] - del data_exec + del data_audio del webpage track = self._download_payload('al_audio', track_id, { From 415b5c60655e5911ea0aab5e0e06e124b106d364 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Sat, 22 Mar 2025 00:08:35 +0400 Subject: [PATCH 05/51] fix: _download_payload already gives json --- yt_dlp/extractor/vk.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index a7d24e4b24..8b87d9308c 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -788,12 +788,11 @@ def _real_extract(self, url): del data_audio del webpage - track = self._download_payload('al_audio', track_id, { + meta = self._download_payload('al_audio', track_id, { 'act': 'reload_audios', 'audio_ids': f'{track_id}_{one_more_id}' - }) + })[0][0] - meta = self._parse_json(track, track_id)[0][0] url = _unmask_url(meta[2], self._parse_vk_id()) title = meta[3] artist = meta[4] From 69660817088b298decd205a1f44443b6b1244ec8 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Sat, 22 Mar 2025 00:12:43 +0400 Subject: [PATCH 06/51] chore: cleanup imports --- yt_dlp/extractor/vk.py | 1 - 1 file changed, 1 deletion(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 8b87d9308c..b420f7a1ed 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -13,7 +13,6 @@ ExtractorError, UserNotLive, clean_html, - extract_attributes, get_element_by_class, get_element_html_by_id, int_or_none, From 5dd61b95b899b458e46d8bc92ccc82d4bba70356 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Sat, 22 Mar 2025 00:14:28 +0400 Subject: [PATCH 07/51] fix: vk_id is 0 when not authenticated --- yt_dlp/extractor/vk.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index b420f7a1ed..e68cf7082c 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -74,10 +74,17 @@ def _perform_login(self, username, password): 'Unable to login, incorrect username and/or password', expected=True) def _parse_vk_id(self): - sui = self._get_cookies('https://login.vk.com')['sui'].value + sui = self._get_cookies('https://login.vk.com').get('sui') + if not sui: + return 0 + # example of what `sui` cookie contains: # 123456789%2CSaCxka2wNY7OZKE5QkmtVTxCxg6Ftgb-zVgNXvMVWQH - return re.match(r'^\d+', sui)[0] + mobj = re.match(r'^\d+', sui.value) + if not mobj: + return 0 + + return int(mobj[1]) def _download_payload(self, path, video_id, data, fatal=True): endpoint = f'https://vk.com/{path}.php' @@ -821,7 +828,7 @@ def _real_extract(self, url): 'access_hash': '', # TODO: unnecessary, but it's better to parse from url if access_hash is present 'claim': '0', 'context': '', - 'from_id': self._parse_vk_id(), # TODO: or '0' + 'from_id': self._parse_vk_id(), 'is_loading_all': '1', 'is_preload': '0', 'offset': '0', From 04111cb6b50dd1f992d4d90055680d72d27d34d5 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Sat, 22 Mar 2025 00:16:37 +0400 Subject: [PATCH 08/51] fix: make test pass --- yt_dlp/extractor/vk.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index e68cf7082c..ae7db78799 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -769,6 +769,7 @@ class VKMusicIE(VKBaseIE): 'duration': 230, 'uploader': 'Skillet', 'artist': 'Skillet', + 'artists': ['Skillet'], 'track': 'Feel Invincible', }, 'params': { @@ -807,7 +808,7 @@ def _real_extract(self, url): return { 'id': track_id, 'title': join_nonempty(artist, title, delim=' - '), - 'thumbnails': [thumbnail], + # 'thumbnails': [thumbnail], 'duration': int_or_none(meta[5]), 'uploader': artist, # XXX: we don't have an uploader in player meta 'artist': artist, @@ -857,7 +858,7 @@ def _real_extract(self, url): join_nonempty(artist, title, delim=' - '), track=title, artist=artist, uploader=artist, duration=int_or_none(ent[5]), - thumbnails=[meta[14]] + # thumbnails=[meta[14]] )) artist = meta.get('authorName') From 6f0c64b148994f096cf7c631dd2721c14fc5fbb8 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Sat, 22 Mar 2025 00:28:12 +0400 Subject: [PATCH 09/51] feat: parse access hash from url --- yt_dlp/extractor/vk.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index ae7db78799..0c85725b66 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -758,7 +758,7 @@ def _real_extract(self, url): class VKMusicIE(VKBaseIE): IE_NAME = 'vk:music' - _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/(?:audio(?P-?\d+_\d+)|(?:.*\?z=audio_playlist|music/playlist/)(?P-?\d+_\d+))' + _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/(?:audio(?P-?\d+_\d+)|(?:.*\?z=audio_playlist|music/[a-z]+/)(?P-?\d+_\d+)(?:(?:%2F|_)(?P[0-9a-f]+))?)' _TESTS = [ { 'url': 'https://vk.com/audio-2001746599_34746599', @@ -782,6 +782,7 @@ def _real_extract(self, url): mobj = self._match_valid_url(url) track_id = mobj.group('track_id') playlist_id = mobj.group('playlist_id') + access_hash = mobj.group('access_hash') or '' if track_id: webpage = self._download_webpage(url, track_id) @@ -826,7 +827,7 @@ def _real_extract(self, url): elif playlist_id: playlist = self._download_payload('al_audio', playlist_id, { 'act': 'load_section', - 'access_hash': '', # TODO: unnecessary, but it's better to parse from url if access_hash is present + 'access_hash': access_hash, 'claim': '0', 'context': '', 'from_id': self._parse_vk_id(), From 0d9e92a4560e4263f8935d59376fc3e67175cc4b Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Sat, 22 Mar 2025 14:19:26 +0400 Subject: [PATCH 10/51] style: hatch fmt --- yt_dlp/extractor/vk.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 0c85725b66..888ffabb01 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -774,8 +774,8 @@ class VKMusicIE(VKBaseIE): }, 'params': { 'skip_download': True, - } - } + }, + }, ] def _real_extract(self, url): @@ -798,7 +798,7 @@ def _real_extract(self, url): meta = self._download_payload('al_audio', track_id, { 'act': 'reload_audios', - 'audio_ids': f'{track_id}_{one_more_id}' + 'audio_ids': f'{track_id}_{one_more_id}', })[0][0] url = _unmask_url(meta[2], self._parse_vk_id()) @@ -1025,6 +1025,7 @@ def _real_extract(self, url): _BASE64_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN0PQRSTUVWXYZO123456789+/=' + def _b64_decode(enc): dec = '' e = n = 0 @@ -1037,6 +1038,7 @@ def _b64_decode(enc): dec += chr(255 & e >> (-2 * n & 6)) return dec + def _unmask_url(mask_url, vk_id): if 'audio_api_unavailable' in mask_url: extra = mask_url.split('?extra=')[1].split('#') From 65724fe40899f7d0636403b0382a3047793703ba Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Sat, 29 Mar 2025 14:57:33 +0400 Subject: [PATCH 11/51] style: re.search -> self._search_regex --- yt_dlp/extractor/vk.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 888ffabb01..4be84b2807 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -789,7 +789,9 @@ def _real_extract(self, url): # copied regex from VKWallPostIE # XXX: common code should be unified, moved to a class - data_audio = re.search(r'data-audio="([^"]+)', webpage)[1] + data_audio = self._search_regex( + r'data-audio="([^"]+)', webpage, 'data-audio attr', group=1) + meta = self._parse_json(unescapeHTML(data_audio), track_id) one_more_id = meta[24] From 84c4dc1b2019b1e97975aeefd4a2bf825d75f1b0 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Sat, 29 Mar 2025 14:57:59 +0400 Subject: [PATCH 12/51] fix: vk_id cookie regex group --- yt_dlp/extractor/vk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 4be84b2807..d65cc5cf2d 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -84,7 +84,7 @@ def _parse_vk_id(self): if not mobj: return 0 - return int(mobj[1]) + return int(mobj[0]) def _download_payload(self, path, video_id, data, fatal=True): endpoint = f'https://vk.com/{path}.php' From 5fb99a09ea1617159af993e452fadd04dc3ffa08 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Sat, 29 Mar 2025 15:42:21 +0400 Subject: [PATCH 13/51] refactor: move meta parser to separate function --- yt_dlp/extractor/vk.py | 47 ++++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index d65cc5cf2d..239411fc11 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -778,6 +778,27 @@ class VKMusicIE(VKBaseIE): }, ] + def _parse_track_meta(self, meta, track_id=None): + len_ = len(meta) + info = {} + + info['id'] = track_id \ + if len_ < 2 or not meta[1] or not meta[0] \ + else f'{meta[1]}_{meta[0]}' + + title = meta[3] if len_ >= 3 else None + artist = meta[4] if len_ >= 4 else None + info['title'] = join_nonempty(artist, title, delim=' - ') + if title: + info['track'] = title + if artist: + info['artist'] = info['uploader'] = artist + + info['duration'] = int_or_none(meta[5]) if len_ >= 5 else None + # info['thumbnail'] = meta[14] if len_ >= 14 else None + + return info + def _real_extract(self, url): mobj = self._match_valid_url(url) track_id = mobj.group('track_id') @@ -804,18 +825,9 @@ def _real_extract(self, url): })[0][0] url = _unmask_url(meta[2], self._parse_vk_id()) - title = meta[3] - artist = meta[4] - thumbnail = meta[14] return { - 'id': track_id, - 'title': join_nonempty(artist, title, delim=' - '), - # 'thumbnails': [thumbnail], - 'duration': int_or_none(meta[5]), - 'uploader': artist, # XXX: we don't have an uploader in player meta - 'artist': artist, - 'track': title, + **self._parse_track_meta(meta, track_id), 'formats': [{ 'url': url, # XXX: copied from VKWallPostIE._real_extract @@ -850,19 +862,14 @@ def _real_extract(self, url): # XXX: repeating code # meta-parsers for track and playlist items should be unified - title = ent[3] - artist = ent[4] - - track_id = f'{ent[1]}_{ent[0]}' + info = self._parse_track_meta(ent) + track_id = info.pop('id') + title = info.pop('title') audio_url = f'https://vk.com/audio{track_id}' entries.append(self.url_result( - audio_url, VKMusicIE, track_id, - join_nonempty(artist, title, delim=' - '), - track=title, artist=artist, uploader=artist, - duration=int_or_none(ent[5]), - # thumbnails=[meta[14]] - )) + audio_url, VKMusicIE, track_id, title, + **info)) artist = meta.get('authorName') thumbnail = meta.get('coverUrl') From 2e5faa1540e6e9fa4424985d4602035e60c2e8f2 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Sat, 29 Mar 2025 21:10:40 +0400 Subject: [PATCH 14/51] feat: a bit better url regex --- yt_dlp/extractor/vk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 239411fc11..c57714ff5c 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -758,7 +758,7 @@ def _real_extract(self, url): class VKMusicIE(VKBaseIE): IE_NAME = 'vk:music' - _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/(?:audio(?P-?\d+_\d+)|(?:.*\?z=audio_playlist|music/[a-z]+/)(?P-?\d+_\d+)(?:(?:%2F|_)(?P[0-9a-f]+))?)' + _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/(?:audio(?P-?\d+_\d+)|(?:.*\?(?:act|z)=audio_playlist|music/[a-z]+/)(?P-?\d+_\d+)(?:(?:%2F|_|[?&]access_hash=)(?P[0-9a-f]+))?)' _TESTS = [ { 'url': 'https://vk.com/audio-2001746599_34746599', From 84ef9661aa346d6bfb7c45b7ecc0e2ce799db4fd Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Sat, 29 Mar 2025 22:19:36 +0400 Subject: [PATCH 15/51] feat: artist list, thumbnail, tests for them --- yt_dlp/extractor/vk.py | 45 +++++++++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index c57714ff5c..397b8b0240 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -768,9 +768,26 @@ class VKMusicIE(VKBaseIE): 'title': 'Skillet - Feel Invincible', 'duration': 230, 'uploader': 'Skillet', - 'artist': 'Skillet', 'artists': ['Skillet'], 'track': 'Feel Invincible', + 'thumbnail': r're:https?://.*\.jpg', + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'note': 'artists are in meta[17], 18th item contains empty string', + 'url': 'https://vk.com/audio-2001844083_29844083', + 'info_dict': { + 'id': '-2001844083_29844083', + 'ext': 'm4a', + 'title': 'Pusha T, Stormzy - Good Goodbye (feat. Pusha T and Stormzy)', + 'duration': 211, + 'uploader': 'Pusha T, Stormzy', + 'artists': ['Pusha T', 'Stormzy'], + 'track': 'Good Goodbye (feat. Pusha T and Stormzy)', + 'thumbnail': r're:https?://.*\.jpg', }, 'params': { 'skip_download': True, @@ -782,20 +799,24 @@ def _parse_track_meta(self, meta, track_id=None): len_ = len(meta) info = {} - info['id'] = track_id \ - if len_ < 2 or not meta[1] or not meta[0] \ - else f'{meta[1]}_{meta[0]}' + info['id'] = f'{meta[1]}_{meta[0]}' \ + if len_ >= 2 and meta[1] and meta[0] \ + else track_id title = meta[3] if len_ >= 3 else None - artist = meta[4] if len_ >= 4 else None + artist = meta[4] if len_ >= 4 else None # artists in one string, may include "feat." info['title'] = join_nonempty(artist, title, delim=' - ') - if title: - info['track'] = title - if artist: - info['artist'] = info['uploader'] = artist + info['track'] = title + info['uploader'] = artist + + # artists as list + info['artists'] = ( + traverse_obj((*meta[17], *meta[18]), ({dict}, 'name', ...)) + if len_ >= 18 else None + ) or [artist] info['duration'] = int_or_none(meta[5]) if len_ >= 5 else None - # info['thumbnail'] = meta[14] if len_ >= 14 else None + info['thumbnails'] = [{'url': meta[14]}] if len_ >= 14 else [] return info @@ -830,7 +851,6 @@ def _real_extract(self, url): **self._parse_track_meta(meta, track_id), 'formats': [{ 'url': url, - # XXX: copied from VKWallPostIE._real_extract 'ext': 'm4a', 'vcodec': 'none', 'acodec': 'mp3', @@ -859,9 +879,6 @@ def _real_extract(self, url): entries = [] for ent in tracks: - # XXX: repeating code - # meta-parsers for track and playlist items should be unified - info = self._parse_track_meta(ent) track_id = info.pop('id') title = info.pop('title') From c0fd87eb4b4d72fdda10038dd3a6b9a29cf3fedc Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Sat, 29 Mar 2025 22:24:09 +0400 Subject: [PATCH 16/51] style: line breaks --- yt_dlp/extractor/vk.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 397b8b0240..41a4874444 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -832,7 +832,8 @@ def _real_extract(self, url): # copied regex from VKWallPostIE # XXX: common code should be unified, moved to a class data_audio = self._search_regex( - r'data-audio="([^"]+)', webpage, 'data-audio attr', group=1) + r'data-audio="([^"]+)', + webpage, 'data-audio attr', group=1) meta = self._parse_json(unescapeHTML(data_audio), track_id) one_more_id = meta[24] @@ -885,8 +886,7 @@ def _real_extract(self, url): audio_url = f'https://vk.com/audio{track_id}' entries.append(self.url_result( - audio_url, VKMusicIE, track_id, title, - **info)) + audio_url, VKMusicIE, track_id, title, **info)) artist = meta.get('authorName') thumbnail = meta.get('coverUrl') From 457479a845347fcc765b1d0eb4ba5c8142015e03 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Sat, 29 Mar 2025 23:05:30 +0400 Subject: [PATCH 17/51] feat: albums --- yt_dlp/extractor/vk.py | 53 ++++++++++++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 12 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 41a4874444..88a278e4f1 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -766,10 +766,10 @@ class VKMusicIE(VKBaseIE): 'id': '-2001746599_34746599', 'ext': 'm4a', 'title': 'Skillet - Feel Invincible', - 'duration': 230, + 'track': 'Feel Invincible', 'uploader': 'Skillet', 'artists': ['Skillet'], - 'track': 'Feel Invincible', + 'duration': 230, 'thumbnail': r're:https?://.*\.jpg', }, 'params': { @@ -783,16 +783,35 @@ class VKMusicIE(VKBaseIE): 'id': '-2001844083_29844083', 'ext': 'm4a', 'title': 'Pusha T, Stormzy - Good Goodbye (feat. Pusha T and Stormzy)', - 'duration': 211, + 'track': 'Good Goodbye (feat. Pusha T and Stormzy)', 'uploader': 'Pusha T, Stormzy', 'artists': ['Pusha T', 'Stormzy'], - 'track': 'Good Goodbye (feat. Pusha T and Stormzy)', + 'duration': 211, 'thumbnail': r're:https?://.*\.jpg', }, 'params': { 'skip_download': True, }, }, + { + 'url': 'https://vk.com/artist/linkinpark/releases?z=audio_playlist-2000984503_984503%2Fc468f3a862b6f73b55', + 'info_dict': { + 'id': '-2000984503_984503', + 'title': 'Linkin Park - One More Light', + 'album': 'One More Light', + 'uploader': 'Linkin Park', + 'artist': 'Linkin Park', + 'thumbnail': r're:https?://.*\.jpg', + 'genre': 'Альтернатива', + 'release_year': 2017, + 'modified_timestamp': int, + 'view_count': int, + }, + 'playlist_count': 10, + 'params': { + 'skip_download': True, + }, + }, ] def _parse_track_meta(self, meta, track_id=None): @@ -803,7 +822,7 @@ def _parse_track_meta(self, meta, track_id=None): if len_ >= 2 and meta[1] and meta[0] \ else track_id - title = meta[3] if len_ >= 3 else None + title = meta[3] if len_ >= 3 else None # TODO: fallback artist = meta[4] if len_ >= 4 else None # artists in one string, may include "feat." info['title'] = join_nonempty(artist, title, delim=' - ') info['track'] = title @@ -888,16 +907,26 @@ def _real_extract(self, url): entries.append(self.url_result( audio_url, VKMusicIE, track_id, title, **info)) + title = meta.get('title') # TODO: fallback artist = meta.get('authorName') - thumbnail = meta.get('coverUrl') + genre, year = self._search_regex( + r'^([^<]+)<\s*span[^>]*>[^<]*(\d+)$', + meta.get('infoLine1'), 'genre and release year', + default=(None, None), fatal=False, group=(1, 2)) + return self.playlist_result( - entries, playlist_id, - meta.get('title'), # TODO: maybe also "artist - title"? + entries, + playlist_id, + join_nonempty(artist, title, delim=' - '), meta.get('description'), - uploader=artist, artist=artist, - thumbnails=[thumbnail] if thumbnail else None, - # TODO: there are even more useful metadata - ) + album=title, + uploader=artist, + artists=[artist], + thumbnails=traverse_obj(meta, ({'url': 'coverUrl'}, ...)), + genres=[genre] if genre else [], + release_year=int_or_none(year), # XXX: is None ok here? + modified_timestamp=int_or_none(meta.get('lastUpdated')), + view_count=int_or_none(meta.get('listens'))) class VKPlayBaseIE(InfoExtractor): From 04c6819d99230f945964f7fbaea7fa9f437705d8 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Sat, 29 Mar 2025 23:25:57 +0400 Subject: [PATCH 18/51] fix: album/playlist meta downloading --- yt_dlp/extractor/vk.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 88a278e4f1..2f91fa2a84 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -758,7 +758,7 @@ def _real_extract(self, url): class VKMusicIE(VKBaseIE): IE_NAME = 'vk:music' - _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/(?:audio(?P-?\d+_\d+)|(?:.*\?(?:act|z)=audio_playlist|music/[a-z]+/)(?P-?\d+_\d+)(?:(?:%2F|_|[?&]access_hash=)(?P[0-9a-f]+))?)' + _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/(?:audio(?P-?\d+_\d+)|(?:.*\?(?:act|z)=audio_playlist|music/[a-z]+/)(?P(?P-?\d+)_(?P\d+))(?:(?:%2F|_|[?&]access_hash=)(?P[0-9a-f]+))?)' _TESTS = [ { 'url': 'https://vk.com/audio-2001746599_34746599', @@ -843,7 +843,6 @@ def _real_extract(self, url): mobj = self._match_valid_url(url) track_id = mobj.group('track_id') playlist_id = mobj.group('playlist_id') - access_hash = mobj.group('access_hash') or '' if track_id: webpage = self._download_webpage(url, track_id) @@ -879,25 +878,25 @@ def _real_extract(self, url): } elif playlist_id: - playlist = self._download_payload('al_audio', playlist_id, { + meta = self._download_payload('al_audio', playlist_id, { 'act': 'load_section', - 'access_hash': access_hash, + 'access_hash': mobj.group('access_hash') or '', 'claim': '0', 'context': '', 'from_id': self._parse_vk_id(), 'is_loading_all': '1', 'is_preload': '0', 'offset': '0', - 'owner_id': '', - 'playlist_id': '', + 'owner_id': mobj.group('pl_oid'), + 'playlist_id': mobj.group('pl_id'), 'ref': '', 'type': 'playlist', - }) + })[0] - meta = self._parse_json(playlist, playlist_id)[0] tracks = meta['list'] entries = [] + for ent in tracks: info = self._parse_track_meta(ent) track_id = info.pop('id') From cf21dc8b937ebb1072dbf1e4b6362409d45fa139 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Sat, 29 Mar 2025 23:29:11 +0400 Subject: [PATCH 19/51] fix: playlist thumbnail --- yt_dlp/extractor/vk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 2f91fa2a84..7b118b0bd3 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -921,7 +921,7 @@ def _real_extract(self, url): album=title, uploader=artist, artists=[artist], - thumbnails=traverse_obj(meta, ({'url': 'coverUrl'}, ...)), + thumbnails=[traverse_obj(meta, {'url': 'coverUrl'})], genres=[genre] if genre else [], release_year=int_or_none(year), # XXX: is None ok here? modified_timestamp=int_or_none(meta.get('lastUpdated')), From 7a369b1bf67485dcaee25b08dfe802d2cbf8d8b7 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Sat, 29 Mar 2025 23:34:41 +0400 Subject: [PATCH 20/51] fix: test --- yt_dlp/extractor/vk.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 7b118b0bd3..07383ad682 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -798,13 +798,15 @@ class VKMusicIE(VKBaseIE): 'info_dict': { 'id': '-2000984503_984503', 'title': 'Linkin Park - One More Light', + 'description': '', 'album': 'One More Light', 'uploader': 'Linkin Park', - 'artist': 'Linkin Park', + 'artists': ['Linkin Park'], 'thumbnail': r're:https?://.*\.jpg', - 'genre': 'Альтернатива', + 'genres': ['Alternative'], 'release_year': 2017, 'modified_timestamp': int, + 'modified_date': str, 'view_count': int, }, 'playlist_count': 10, @@ -921,7 +923,7 @@ def _real_extract(self, url): album=title, uploader=artist, artists=[artist], - thumbnails=[traverse_obj(meta, {'url': 'coverUrl'})], + thumbnail=meta.get('coverUrl'), # XXX: should i also specify `thumbnails`? genres=[genre] if genre else [], release_year=int_or_none(year), # XXX: is None ok here? modified_timestamp=int_or_none(meta.get('lastUpdated')), From a38798b479e999271ada332c40cf3ba6ba645d39 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Sat, 29 Mar 2025 23:42:23 +0400 Subject: [PATCH 21/51] fix: artist list --- yt_dlp/extractor/vk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 07383ad682..2eddbacd97 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -832,7 +832,7 @@ def _parse_track_meta(self, meta, track_id=None): # artists as list info['artists'] = ( - traverse_obj((*meta[17], *meta[18]), ({dict}, 'name', ...)) + traverse_obj((*meta[17], *meta[18]), (..., 'name')) if len_ >= 18 else None ) or [artist] From fb342c0a662f02f91f769df2ece83f40e8de6dfd Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Sun, 30 Mar 2025 14:07:07 +0400 Subject: [PATCH 22/51] fix: tracks with meta in data-exec --- yt_dlp/extractor/vk.py | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 2eddbacd97..e29ad2b0de 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -14,7 +14,9 @@ UserNotLive, clean_html, get_element_by_class, + get_element_html_by_class, get_element_html_by_id, + extract_attributes, int_or_none, join_nonempty, parse_qs, @@ -793,6 +795,23 @@ class VKMusicIE(VKBaseIE): 'skip_download': True, }, }, + { + 'note': 'meta is AudioPlayerBlock__root[data-exec], no artists in 17/18', + 'url': 'https://vk.com/audio-26549346_456239443', + 'info_dict': { + 'id': '-26549346_456239443', + 'ext': 'm4a', + 'title': 'Fairie\'s Death Waltz - Still to Wake', + 'track': 'Still to Wake', + 'uploader': 'Fairie\'s Death Waltz', + 'artists': ['Fairie\'s Death Waltz'], + 'duration': 349, + 'thumbnail': r're:https?://.*\.jpg', + }, + 'params': { + 'skip_download': True, + } + }, { 'url': 'https://vk.com/artist/linkinpark/releases?z=audio_playlist-2000984503_984503%2Fc468f3a862b6f73b55', 'info_dict': { @@ -852,10 +871,19 @@ def _real_extract(self, url): # copied regex from VKWallPostIE # XXX: common code should be unified, moved to a class data_audio = self._search_regex( - r'data-audio="([^"]+)', - webpage, 'data-audio attr', group=1) + r'data-audio="([^"]+)', webpage, 'data-audio attr', + default=None, group=1) + + if data_audio: + meta = self._parse_json(unescapeHTML(data_audio), track_id) + else: + player = get_element_html_by_class('AudioPlayerBlock__root', webpage) + meta = traverse_obj( + self._parse_json( + extract_attributes(player).get('data-exec'), + track_id), + ('AudioPlayerBlock/init', 'firstAudio')) - meta = self._parse_json(unescapeHTML(data_audio), track_id) one_more_id = meta[24] del data_audio From 95db8d961066f980dac63f31e1f001585cba32fd Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Sun, 30 Mar 2025 14:29:12 +0400 Subject: [PATCH 23/51] perf: simplify data-exec extraction (switch to regex) --- yt_dlp/extractor/vk.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index e29ad2b0de..cdbd554ef4 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -877,11 +877,11 @@ def _real_extract(self, url): if data_audio: meta = self._parse_json(unescapeHTML(data_audio), track_id) else: - player = get_element_html_by_class('AudioPlayerBlock__root', webpage) + data_exec = self._search_regex( + r'class="AudioPlayerBlock__root"[^>]+data-exec="([^"]+)', + webpage, 'AudioPlayerBlock data-exec', group=1) meta = traverse_obj( - self._parse_json( - extract_attributes(player).get('data-exec'), - track_id), + self._parse_json(unescapeHTML(data_exec), track_id), ('AudioPlayerBlock/init', 'firstAudio')) one_more_id = meta[24] From ea0ecd3f3ce4cfffe3bc02a1498f851988d88570 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Sun, 30 Mar 2025 14:32:19 +0400 Subject: [PATCH 24/51] test: skip requiring auth --- yt_dlp/extractor/vk.py | 1 + 1 file changed, 1 insertion(+) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index cdbd554ef4..09b9a80b39 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -797,6 +797,7 @@ class VKMusicIE(VKBaseIE): }, { 'note': 'meta is AudioPlayerBlock__root[data-exec], no artists in 17/18', + 'skip': 'authentication required', 'url': 'https://vk.com/audio-26549346_456239443', 'info_dict': { 'id': '-26549346_456239443', From 40b039df2cb504ae29963b6f733890f54bda4b18 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Sun, 30 Mar 2025 15:17:59 +0400 Subject: [PATCH 25/51] fix: html escaping in response json --- yt_dlp/extractor/vk.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 09b9a80b39..216a0c29d5 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -834,6 +834,23 @@ class VKMusicIE(VKBaseIE): 'skip_download': True, }, }, + { + 'note': 'special symbols in title and artist must be unescaped', + 'url': 'https://vk.com/audio-2001069891_6069891', + 'info_dict': { + 'id': '-2001069891_6069891', + 'ext': 'm4a', + 'title': 'Jack Thomas feat. Nico & Vinz - Rivers (feat. Nico & Vinz)', + 'track': 'Rivers (feat. Nico & Vinz)', + 'uploader': 'Jack Thomas feat. Nico & Vinz', + 'artists': ['Jack Thomas', 'Nico & Vinz'], + 'duration': 207, + 'thumbnail': r're:https?://.*\.jpg', + }, + 'params': { + 'skip_download': True, + } + } ] def _parse_track_meta(self, meta, track_id=None): @@ -844,14 +861,15 @@ def _parse_track_meta(self, meta, track_id=None): if len_ >= 2 and meta[1] and meta[0] \ else track_id - title = meta[3] if len_ >= 3 else None # TODO: fallback - artist = meta[4] if len_ >= 4 else None # artists in one string, may include "feat." + title = unescapeHTML(meta[3]) if len_ >= 3 else None # TODO: fallback + artist = unescapeHTML(meta[4]) if len_ >= 4 else None # artists in one string, may include "feat." info['title'] = join_nonempty(artist, title, delim=' - ') info['track'] = title info['uploader'] = artist # artists as list info['artists'] = ( + # not htmlescaped unlike meta[4] traverse_obj((*meta[17], *meta[18]), (..., 'name')) if len_ >= 18 else None ) or [artist] From 90a026e648325f67665cb4f731a31b73a7a88600 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Sun, 30 Mar 2025 15:41:45 +0400 Subject: [PATCH 26/51] fix(playlist): description, htmlescape, url regex, +test --- yt_dlp/extractor/vk.py | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 216a0c29d5..2708da6495 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -760,7 +760,7 @@ def _real_extract(self, url): class VKMusicIE(VKBaseIE): IE_NAME = 'vk:music' - _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/(?:audio(?P-?\d+_\d+)|(?:.*\?(?:act|z)=audio_playlist|music/[a-z]+/)(?P(?P-?\d+)_(?P\d+))(?:(?:%2F|_|[?&]access_hash=)(?P[0-9a-f]+))?)' + _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/(?:audio(?P-?\d+_\d+)|(?:.*[\?&](?:act|z)=audio_playlist|music/[a-z]+/)(?P(?P-?\d+)_(?P\d+))(?:(?:%2F|_|[?&]access_hash=)(?P[0-9a-f]+))?)' _TESTS = [ { 'url': 'https://vk.com/audio-2001746599_34746599', @@ -850,6 +850,25 @@ class VKMusicIE(VKBaseIE): 'params': { 'skip_download': True, } + }, + { + 'url': 'https://vk.com/audios877252112?block=playlists§ion=general&z=audio_playlist-147845620_2390', + 'info_dict': { + 'id': '-147845620_2390', + 'title': 'VK Музыка - VK Fest 2024: Белая сцена', + 'description': 'md5:6d652551bb1faaddbcd46321a77fa8d0', + 'album': 'VK Fest 2024: Белая сцена', # XXX: not an album (but who cares actually) + 'uploader': 'VK Музыка', + 'artists': ['VK Музыка'], # XXX: not actually a list of all artists + 'thumbnail': r're:https?://.*\.jpg', + 'modified_timestamp': int, + 'modified_date': str, + 'view_count': int, + }, + 'playlist_count': 18, + 'params': { + 'skip_download': True, + } } ] @@ -955,8 +974,8 @@ def _real_extract(self, url): entries.append(self.url_result( audio_url, VKMusicIE, track_id, title, **info)) - title = meta.get('title') # TODO: fallback - artist = meta.get('authorName') + title = unescapeHTML(meta.get('title')) # TODO: fallback + artist = unescapeHTML(meta.get('authorName')) genre, year = self._search_regex( r'^([^<]+)<\s*span[^>]*>[^<]*(\d+)$', meta.get('infoLine1'), 'genre and release year', @@ -966,13 +985,13 @@ def _real_extract(self, url): entries, playlist_id, join_nonempty(artist, title, delim=' - '), - meta.get('description'), + unescapeHTML(meta.get('rawDescription')), album=title, uploader=artist, artists=[artist], thumbnail=meta.get('coverUrl'), # XXX: should i also specify `thumbnails`? - genres=[genre] if genre else [], - release_year=int_or_none(year), # XXX: is None ok here? + genres=[unescapeHTML(genre)] if genre else None, + release_year=int_or_none(year), modified_timestamp=int_or_none(meta.get('lastUpdated')), view_count=int_or_none(meta.get('listens'))) From 8678b341168432d02388fb244adeee6afa93aa2f Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Sun, 30 Mar 2025 15:55:27 +0400 Subject: [PATCH 27/51] feat: parse age limit --- yt_dlp/extractor/vk.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 2708da6495..c35d123536 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -896,6 +896,17 @@ def _parse_track_meta(self, meta, track_id=None): info['duration'] = int_or_none(meta[5]) if len_ >= 5 else None info['thumbnails'] = [{'url': meta[14]}] if len_ >= 14 else [] + # meta[30] is 2 bits + # most significant: isExplicit + # least significant: isForeignAgent + # i. e. + # 00 = safe + # 01 = marked by RKN as "foreign agent" + # 10 = explicit lyrics + # 11 = both E lyrics and "foreign agent" + if len_ >= 30 and meta[30]: + info['age_limit'] = 18 + return info def _real_extract(self, url): From 932ebf43df43fdb5db242498319e2ed0671ed451 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Sun, 30 Mar 2025 15:56:20 +0400 Subject: [PATCH 28/51] style: hatch fmt --- yt_dlp/extractor/vk.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index c35d123536..35a0042d3c 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -14,9 +14,7 @@ UserNotLive, clean_html, get_element_by_class, - get_element_html_by_class, get_element_html_by_id, - extract_attributes, int_or_none, join_nonempty, parse_qs, @@ -811,7 +809,7 @@ class VKMusicIE(VKBaseIE): }, 'params': { 'skip_download': True, - } + }, }, { 'url': 'https://vk.com/artist/linkinpark/releases?z=audio_playlist-2000984503_984503%2Fc468f3a862b6f73b55', @@ -849,7 +847,7 @@ class VKMusicIE(VKBaseIE): }, 'params': { 'skip_download': True, - } + }, }, { 'url': 'https://vk.com/audios877252112?block=playlists§ion=general&z=audio_playlist-147845620_2390', @@ -868,8 +866,8 @@ class VKMusicIE(VKBaseIE): 'playlist_count': 18, 'params': { 'skip_download': True, - } - } + }, + }, ] def _parse_track_meta(self, meta, track_id=None): From 013cf4368f1bdc941fdd84e0d86cc6c6a78c1a5b Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Sun, 30 Mar 2025 16:29:11 +0400 Subject: [PATCH 29/51] feat: album&artist only for albums, pass title as is --- yt_dlp/extractor/vk.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 35a0042d3c..ff8c962cd9 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -815,7 +815,7 @@ class VKMusicIE(VKBaseIE): 'url': 'https://vk.com/artist/linkinpark/releases?z=audio_playlist-2000984503_984503%2Fc468f3a862b6f73b55', 'info_dict': { 'id': '-2000984503_984503', - 'title': 'Linkin Park - One More Light', + 'title': 'One More Light', 'description': '', 'album': 'One More Light', 'uploader': 'Linkin Park', @@ -853,11 +853,9 @@ class VKMusicIE(VKBaseIE): 'url': 'https://vk.com/audios877252112?block=playlists§ion=general&z=audio_playlist-147845620_2390', 'info_dict': { 'id': '-147845620_2390', - 'title': 'VK Музыка - VK Fest 2024: Белая сцена', + 'title': 'VK Fest 2024: Белая сцена', 'description': 'md5:6d652551bb1faaddbcd46321a77fa8d0', - 'album': 'VK Fest 2024: Белая сцена', # XXX: not an album (but who cares actually) 'uploader': 'VK Музыка', - 'artists': ['VK Музыка'], # XXX: not actually a list of all artists 'thumbnail': r're:https?://.*\.jpg', 'modified_timestamp': int, 'modified_date': str, @@ -989,15 +987,14 @@ def _real_extract(self, url): r'^([^<]+)<\s*span[^>]*>[^<]*(\d+)$', meta.get('infoLine1'), 'genre and release year', default=(None, None), fatal=False, group=(1, 2)) + is_album = year is not None return self.playlist_result( - entries, - playlist_id, - join_nonempty(artist, title, delim=' - '), + entries, playlist_id, title, unescapeHTML(meta.get('rawDescription')), - album=title, + album=title if is_album else None, uploader=artist, - artists=[artist], + artists=[artist] if is_album else None, thumbnail=meta.get('coverUrl'), # XXX: should i also specify `thumbnails`? genres=[unescapeHTML(genre)] if genre else None, release_year=int_or_none(year), From cab6eeeae27f27511c72e098cc8dfb6d5ad81f81 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Tue, 1 Apr 2025 15:56:51 +0400 Subject: [PATCH 30/51] feat: "artist - album" but only for albums --- yt_dlp/extractor/vk.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index ff8c962cd9..4ca2476b73 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -815,7 +815,7 @@ class VKMusicIE(VKBaseIE): 'url': 'https://vk.com/artist/linkinpark/releases?z=audio_playlist-2000984503_984503%2Fc468f3a862b6f73b55', 'info_dict': { 'id': '-2000984503_984503', - 'title': 'One More Light', + 'title': 'Linkin Park - One More Light', 'description': '', 'album': 'One More Light', 'uploader': 'Linkin Park', @@ -990,7 +990,8 @@ def _real_extract(self, url): is_album = year is not None return self.playlist_result( - entries, playlist_id, title, + entries, playlist_id, + join_nonempty(artist, title, delim=' - ') if is_album else title, unescapeHTML(meta.get('rawDescription')), album=title if is_album else None, uploader=artist, From c71008ed60b370b5827a3a5ebe0514d6dce0593d Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Tue, 1 Apr 2025 16:44:30 +0400 Subject: [PATCH 31/51] feat: correct errs when auth required or geoblocked --- yt_dlp/extractor/vk.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 4ca2476b73..8771f74300 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -922,15 +922,28 @@ def _real_extract(self, url): if data_audio: meta = self._parse_json(unescapeHTML(data_audio), track_id) else: + if self._parse_vk_id() == 0: + self.raise_login_required( + 'This track is unavailable. ' + 'Log in or provide a link with access hash') + data_exec = self._search_regex( r'class="AudioPlayerBlock__root"[^>]+data-exec="([^"]+)', webpage, 'AudioPlayerBlock data-exec', group=1) + meta = traverse_obj( self._parse_json(unescapeHTML(data_exec), track_id), ('AudioPlayerBlock/init', 'firstAudio')) one_more_id = meta[24] + block_reason = traverse_obj( + self._parse_json(meta[12], track_id, fatal=False), + ('claim', 'reason')) + + if block_reason == 'geo': + self.raise_geo_restricted() + del data_audio del webpage From df89f7643dddd9c3631f0b2690b541f288e45787 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Tue, 1 Apr 2025 16:55:50 +0400 Subject: [PATCH 32/51] feat: parse access_hash from url for tracks too --- yt_dlp/extractor/vk.py | 65 +++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 30 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 8771f74300..3b49bef814 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -758,7 +758,10 @@ def _real_extract(self, url): class VKMusicIE(VKBaseIE): IE_NAME = 'vk:music' - _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/(?:audio(?P-?\d+_\d+)|(?:.*[\?&](?:act|z)=audio_playlist|music/[a-z]+/)(?P(?P-?\d+)_(?P\d+))(?:(?:%2F|_|[?&]access_hash=)(?P[0-9a-f]+))?)' + + # Debug and test on https://regexr.com/8dlot + _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/(?:audio(?P-?\d+_\d+)|(?:.*[\?&](?:act|z)=audio_playlist|music/[a-z]+/)(?P(?P-?\d+)_(?P\d+)))(?:(?:%2F|_|[?&]access_hash=)(?P[0-9a-f]+))?' + _TESTS = [ { 'url': 'https://vk.com/audio-2001746599_34746599', @@ -909,47 +912,49 @@ def _real_extract(self, url): mobj = self._match_valid_url(url) track_id = mobj.group('track_id') playlist_id = mobj.group('playlist_id') + access_hash = mobj.group('access_hash') if track_id: - webpage = self._download_webpage(url, track_id) + if not access_hash: + webpage = self._download_webpage(url, track_id) - # copied regex from VKWallPostIE - # XXX: common code should be unified, moved to a class - data_audio = self._search_regex( - r'data-audio="([^"]+)', webpage, 'data-audio attr', - default=None, group=1) + data_audio = self._search_regex( + r'data-audio="([^"]+)', webpage, 'data-audio attr', + default=None, group=1) - if data_audio: - meta = self._parse_json(unescapeHTML(data_audio), track_id) - else: - if self._parse_vk_id() == 0: - self.raise_login_required( - 'This track is unavailable. ' - 'Log in or provide a link with access hash') + if data_audio: + meta = self._parse_json(unescapeHTML(data_audio), track_id) + else: + if self._parse_vk_id() == 0: + self.raise_login_required( + 'This track is unavailable. ' + 'Log in or provide a link with access hash') - data_exec = self._search_regex( - r'class="AudioPlayerBlock__root"[^>]+data-exec="([^"]+)', - webpage, 'AudioPlayerBlock data-exec', group=1) + data_exec = self._search_regex( + r'class="AudioPlayerBlock__root"[^>]+data-exec="([^"]+)', + webpage, 'AudioPlayerBlock data-exec', group=1) - meta = traverse_obj( - self._parse_json(unescapeHTML(data_exec), track_id), - ('AudioPlayerBlock/init', 'firstAudio')) + meta = traverse_obj( + self._parse_json(unescapeHTML(data_exec), track_id), + ('AudioPlayerBlock/init', 'firstAudio')) - one_more_id = meta[24] + del data_exec - block_reason = traverse_obj( - self._parse_json(meta[12], track_id, fatal=False), - ('claim', 'reason')) + del data_audio + del webpage - if block_reason == 'geo': - self.raise_geo_restricted() + access_hash = meta[24] - del data_audio - del webpage + block_reason = traverse_obj( + self._parse_json(meta[12], track_id, fatal=False), + ('claim', 'reason')) + + if block_reason == 'geo': + self.raise_geo_restricted() meta = self._download_payload('al_audio', track_id, { 'act': 'reload_audios', - 'audio_ids': f'{track_id}_{one_more_id}', + 'audio_ids': f'{track_id}_{access_hash}', })[0][0] url = _unmask_url(meta[2], self._parse_vk_id()) @@ -968,7 +973,7 @@ def _real_extract(self, url): elif playlist_id: meta = self._download_payload('al_audio', playlist_id, { 'act': 'load_section', - 'access_hash': mobj.group('access_hash') or '', + 'access_hash': access_hash or '', 'claim': '0', 'context': '', 'from_id': self._parse_vk_id(), From af005abdb3d833c53444d6899d3b9aeeedd8204a Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Tue, 1 Apr 2025 17:05:09 +0400 Subject: [PATCH 33/51] feat: add access hash when parsing playlist entries --- yt_dlp/extractor/vk.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 3b49bef814..0782c501c6 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -992,9 +992,10 @@ def _real_extract(self, url): for ent in tracks: info = self._parse_track_meta(ent) + ent_access = f'_{ent[24]}' if len(ent) >= 24 and ent[24] else '' track_id = info.pop('id') title = info.pop('title') - audio_url = f'https://vk.com/audio{track_id}' + audio_url = f'https://vk.com/audio{track_id}{ent_access}' entries.append(self.url_result( audio_url, VKMusicIE, track_id, title, **info)) From 705bb0ba5f29992e970095012de7beeed5f259e0 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Tue, 1 Apr 2025 21:48:01 +0400 Subject: [PATCH 34/51] feat: better unavailability check --- yt_dlp/extractor/vk.py | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 0782c501c6..17cdcab4cb 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -908,6 +908,21 @@ def _parse_track_meta(self, meta, track_id=None): return info + def _raise_if_blocked(self, meta, track_id): + reason = traverse_obj( + self._parse_json( + meta[12] if len(meta) >= 12 else None, + track_id, fatal=False), + ('claim', 'reason')) + + if reason == 'geo': + self.raise_geo_restricted() + # can be an empty string + elif reason is not None: + raise ExtractorError( + 'This track is unavailable. ' + f'Reason code: {reason:r}') + def _real_extract(self, url): mobj = self._match_valid_url(url) track_id = mobj.group('track_id') @@ -943,19 +958,22 @@ def _real_extract(self, url): del data_audio del webpage + self._raise_if_blocked(meta, track_id) + access_hash = meta[24] - block_reason = traverse_obj( - self._parse_json(meta[12], track_id, fatal=False), - ('claim', 'reason')) - - if block_reason == 'geo': - self.raise_geo_restricted() - meta = self._download_payload('al_audio', track_id, { 'act': 'reload_audios', 'audio_ids': f'{track_id}_{access_hash}', - })[0][0] + })[0] + + # vk sends an empty list when auth required + if not meta: + self.raise_login_required() + + meta = meta[0] + + self._raise_if_blocked(meta, track_id) url = _unmask_url(meta[2], self._parse_vk_id()) From cd282aae39aaf8c6e62113a87c335f2701a8db11 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Fri, 4 Apr 2025 16:56:46 +0400 Subject: [PATCH 35/51] refactor: split into base class, track ie, playlist ie --- yt_dlp/extractor/vk.py | 451 +++++++++++++++++++++-------------------- 1 file changed, 235 insertions(+), 216 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 17cdcab4cb..3831f04260 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -756,11 +756,102 @@ def _real_extract(self, url): clean_html(get_element_by_class('wall_post_text', webpage))) -class VKMusicIE(VKBaseIE): - IE_NAME = 'vk:music' +class VKMusicBaseIE(VKBaseIE): + _BASE64_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN0PQRSTUVWXYZO123456789+/=' - # Debug and test on https://regexr.com/8dlot - _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/(?:audio(?P-?\d+_\d+)|(?:.*[\?&](?:act|z)=audio_playlist|music/[a-z]+/)(?P(?P-?\d+)_(?P\d+)))(?:(?:%2F|_|[?&]access_hash=)(?P[0-9a-f]+))?' + def _b64_decode(self, enc): + dec = '' + e = n = 0 + for c in enc: + r = self._BASE64_CHARS.index(c) + cond = n % 4 + e = 64 * e + r if cond else r + n += 1 + if cond: + dec += chr(255 & e >> (-2 * n & 6)) + return dec + + def _unmask_url(self, mask_url, vk_id): + if 'audio_api_unavailable' in mask_url: + extra = mask_url.split('?extra=')[1].split('#') + _, base = self._b64_decode(extra[1]).split(chr(11)) + mask_url = list(self._b64_decode(extra[0])) + url_len = len(mask_url) + indexes = [None] * url_len + index = int(base) ^ vk_id + for n in range(url_len - 1, -1, -1): + index = (url_len * (n + 1) ^ index + n) % url_len + indexes[n] = index + for n in range(1, url_len): + c = mask_url[n] + index = indexes[url_len - 1 - n] + mask_url[n] = mask_url[index] + mask_url[index] = c + mask_url = ''.join(mask_url) + return mask_url + + def _parse_track_meta(self, meta, track_id=None): + len_ = len(meta) + info = {} + + info['id'] = f'{meta[1]}_{meta[0]}' \ + if len_ >= 2 and meta[1] and meta[0] \ + else track_id + + title = unescapeHTML(meta[3]) if len_ >= 3 else None # TODO: fallback + artist = unescapeHTML(meta[4]) if len_ >= 4 else None # artists in one string, may include "feat." + info['title'] = join_nonempty(artist, title, delim=' - ') + info['track'] = title + info['uploader'] = artist + + # artists as list + info['artists'] = ( + # not htmlescaped unlike meta[4] + traverse_obj((*meta[17], *meta[18]), (..., 'name')) + if len_ >= 18 else None + ) or [artist] + + info['duration'] = int_or_none(meta[5]) if len_ >= 5 else None + info['thumbnails'] = [{'url': meta[14]}] if len_ >= 14 else [] + + # meta[30] is 2 bits + # most significant: isExplicit + # least significant: isForeignAgent + # i. e. + # 00 = safe + # 01 = marked by RKN as "foreign agent" + # 10 = explicit lyrics + # 11 = both E lyrics and "foreign agent" + if len_ >= 30 and meta[30]: + info['age_limit'] = 18 + + return info + + def _raise_if_blocked(self, meta, track_id): + reason = traverse_obj( + self._parse_json( + meta[12] if len(meta) >= 12 else None, + track_id, fatal=False), + ('claim', 'reason')) + + if reason == 'geo': + self.raise_geo_restricted() + # can be an empty string + elif reason is not None: + raise ExtractorError( + 'This track is unavailable. ' + f'Reason code: {reason:r}') + + +class VKMusicTrackIE(VKMusicBaseIE): + IE_NAME = 'vkmusic:track' + + _VALID_URL = r'''(?x) + https?:// + (?:(?:m|new)\.)?vk\.(?:com|ru)/ + audio(?P-?\d+_\d+) + (?:(?:%2F|_)(?P[0-9a-f]+))? + ''' _TESTS = [ { @@ -814,6 +905,102 @@ class VKMusicIE(VKBaseIE): 'skip_download': True, }, }, + { + 'note': 'special symbols in title and artist must be unescaped', + 'url': 'https://vk.com/audio-2001069891_6069891', + 'info_dict': { + 'id': '-2001069891_6069891', + 'ext': 'm4a', + 'title': 'Jack Thomas feat. Nico & Vinz - Rivers (feat. Nico & Vinz)', + 'track': 'Rivers (feat. Nico & Vinz)', + 'uploader': 'Jack Thomas feat. Nico & Vinz', + 'artists': ['Jack Thomas', 'Nico & Vinz'], + 'duration': 207, + 'thumbnail': r're:https?://.*\.jpg', + }, + 'params': { + 'skip_download': True, + }, + }, + ] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + track_id = mobj.group('id') + access_hash = mobj.group('hash') + + if not access_hash: + webpage = self._download_webpage(url, track_id) + + data_audio = self._search_regex( + r'data-audio="([^"]+)', webpage, 'data-audio attr', + default=None, group=1) + + if data_audio: + meta = self._parse_json(unescapeHTML(data_audio), track_id) + else: + if self._parse_vk_id() == 0: + self.raise_login_required( + 'This track is unavailable. ' + 'Log in or provide a link with access hash') + + data_exec = self._search_regex( + r'class="AudioPlayerBlock__root"[^>]+data-exec="([^"]+)', + webpage, 'AudioPlayerBlock data-exec', group=1) + + meta = traverse_obj( + self._parse_json(unescapeHTML(data_exec), track_id), + ('AudioPlayerBlock/init', 'firstAudio')) + + del data_exec + + del data_audio + del webpage + + self._raise_if_blocked(meta, track_id) + + access_hash = meta[24] + + meta = self._download_payload('al_audio', track_id, { + 'act': 'reload_audios', + 'audio_ids': f'{track_id}_{access_hash}', + })[0] + + # vk sends an empty list when auth required + if not meta: + self.raise_login_required() + + meta = meta[0] + self._raise_if_blocked(meta, track_id) + url = self._unmask_url(meta[2], self._parse_vk_id()) + + return { + **self._parse_track_meta(meta, track_id), + 'formats': [{ + 'url': url, + 'ext': 'm4a', + 'vcodec': 'none', + 'acodec': 'mp3', + 'container': 'm4a_dash', + }], + } + + +class VKMusicPlaylistIE(VKMusicBaseIE): + IE_NAME = 'vkmusic:playlist' + + _VALID_URL = r'''(?x) + https?:// + (?:(?:m|new)\.)?vk\.(?:com|ru)/ + (?: + music/(?:album|playlist)/| + .*[?&](?:act|z)=audio_playlist + ) + (?P(?P-?\d+)_(?P\d+)) + (?:(?:%2F|_|[?&]access_hash=)(?P[0-9a-f]+))? + ''' + + _TESTS = [ { 'url': 'https://vk.com/artist/linkinpark/releases?z=audio_playlist-2000984503_984503%2Fc468f3a862b6f73b55', 'info_dict': { @@ -835,23 +1022,6 @@ class VKMusicIE(VKBaseIE): 'skip_download': True, }, }, - { - 'note': 'special symbols in title and artist must be unescaped', - 'url': 'https://vk.com/audio-2001069891_6069891', - 'info_dict': { - 'id': '-2001069891_6069891', - 'ext': 'm4a', - 'title': 'Jack Thomas feat. Nico & Vinz - Rivers (feat. Nico & Vinz)', - 'track': 'Rivers (feat. Nico & Vinz)', - 'uploader': 'Jack Thomas feat. Nico & Vinz', - 'artists': ['Jack Thomas', 'Nico & Vinz'], - 'duration': 207, - 'thumbnail': r're:https?://.*\.jpg', - }, - 'params': { - 'skip_download': True, - }, - }, { 'url': 'https://vk.com/audios877252112?block=playlists§ion=general&z=audio_playlist-147845620_2390', 'info_dict': { @@ -871,173 +1041,58 @@ class VKMusicIE(VKBaseIE): }, ] - def _parse_track_meta(self, meta, track_id=None): - len_ = len(meta) - info = {} - - info['id'] = f'{meta[1]}_{meta[0]}' \ - if len_ >= 2 and meta[1] and meta[0] \ - else track_id - - title = unescapeHTML(meta[3]) if len_ >= 3 else None # TODO: fallback - artist = unescapeHTML(meta[4]) if len_ >= 4 else None # artists in one string, may include "feat." - info['title'] = join_nonempty(artist, title, delim=' - ') - info['track'] = title - info['uploader'] = artist - - # artists as list - info['artists'] = ( - # not htmlescaped unlike meta[4] - traverse_obj((*meta[17], *meta[18]), (..., 'name')) - if len_ >= 18 else None - ) or [artist] - - info['duration'] = int_or_none(meta[5]) if len_ >= 5 else None - info['thumbnails'] = [{'url': meta[14]}] if len_ >= 14 else [] - - # meta[30] is 2 bits - # most significant: isExplicit - # least significant: isForeignAgent - # i. e. - # 00 = safe - # 01 = marked by RKN as "foreign agent" - # 10 = explicit lyrics - # 11 = both E lyrics and "foreign agent" - if len_ >= 30 and meta[30]: - info['age_limit'] = 18 - - return info - - def _raise_if_blocked(self, meta, track_id): - reason = traverse_obj( - self._parse_json( - meta[12] if len(meta) >= 12 else None, - track_id, fatal=False), - ('claim', 'reason')) - - if reason == 'geo': - self.raise_geo_restricted() - # can be an empty string - elif reason is not None: - raise ExtractorError( - 'This track is unavailable. ' - f'Reason code: {reason:r}') - def _real_extract(self, url): mobj = self._match_valid_url(url) - track_id = mobj.group('track_id') - playlist_id = mobj.group('playlist_id') - access_hash = mobj.group('access_hash') + playlist_id = mobj.group('full_id') - if track_id: - if not access_hash: - webpage = self._download_webpage(url, track_id) + meta = self._download_payload('al_audio', playlist_id, { + 'act': 'load_section', + 'access_hash': mobj.group('hash') or '', + 'claim': '0', + 'context': '', + 'from_id': self._parse_vk_id(), + 'is_loading_all': '1', + 'is_preload': '0', + 'offset': '0', + 'owner_id': mobj.group('oid'), + 'playlist_id': mobj.group('id'), + 'ref': '', + 'type': 'playlist', + })[0] + tracks = meta['list'] - data_audio = self._search_regex( - r'data-audio="([^"]+)', webpage, 'data-audio attr', - default=None, group=1) + entries = [] + for ent in tracks: + info = self._parse_track_meta(ent) + track_id = info.pop('id') + title = info.pop('title') - if data_audio: - meta = self._parse_json(unescapeHTML(data_audio), track_id) - else: - if self._parse_vk_id() == 0: - self.raise_login_required( - 'This track is unavailable. ' - 'Log in or provide a link with access hash') + ent_hash = f'_{ent[24]}' if len(ent) >= 24 and ent[24] else '' + audio_url = f'https://vk.com/audio{track_id}{ent_hash}' - data_exec = self._search_regex( - r'class="AudioPlayerBlock__root"[^>]+data-exec="([^"]+)', - webpage, 'AudioPlayerBlock data-exec', group=1) + entries.append(self.url_result( + audio_url, VKMusicTrackIE, track_id, title, **info)) - meta = traverse_obj( - self._parse_json(unescapeHTML(data_exec), track_id), - ('AudioPlayerBlock/init', 'firstAudio')) + title = unescapeHTML(meta.get('title')) # TODO: fallback + artist = unescapeHTML(meta.get('authorName')) + genre, year = self._search_regex( + r'^([^<]+)<\s*span[^>]*>[^<]*(\d+)$', + meta.get('infoLine1'), 'genre and release year', + default=(None, None), fatal=False, group=(1, 2)) + is_album = year is not None - del data_exec - - del data_audio - del webpage - - self._raise_if_blocked(meta, track_id) - - access_hash = meta[24] - - meta = self._download_payload('al_audio', track_id, { - 'act': 'reload_audios', - 'audio_ids': f'{track_id}_{access_hash}', - })[0] - - # vk sends an empty list when auth required - if not meta: - self.raise_login_required() - - meta = meta[0] - - self._raise_if_blocked(meta, track_id) - - url = _unmask_url(meta[2], self._parse_vk_id()) - - return { - **self._parse_track_meta(meta, track_id), - 'formats': [{ - 'url': url, - 'ext': 'm4a', - 'vcodec': 'none', - 'acodec': 'mp3', - 'container': 'm4a_dash', - }], - } - - elif playlist_id: - meta = self._download_payload('al_audio', playlist_id, { - 'act': 'load_section', - 'access_hash': access_hash or '', - 'claim': '0', - 'context': '', - 'from_id': self._parse_vk_id(), - 'is_loading_all': '1', - 'is_preload': '0', - 'offset': '0', - 'owner_id': mobj.group('pl_oid'), - 'playlist_id': mobj.group('pl_id'), - 'ref': '', - 'type': 'playlist', - })[0] - - tracks = meta['list'] - - entries = [] - - for ent in tracks: - info = self._parse_track_meta(ent) - ent_access = f'_{ent[24]}' if len(ent) >= 24 and ent[24] else '' - track_id = info.pop('id') - title = info.pop('title') - audio_url = f'https://vk.com/audio{track_id}{ent_access}' - - entries.append(self.url_result( - audio_url, VKMusicIE, track_id, title, **info)) - - title = unescapeHTML(meta.get('title')) # TODO: fallback - artist = unescapeHTML(meta.get('authorName')) - genre, year = self._search_regex( - r'^([^<]+)<\s*span[^>]*>[^<]*(\d+)$', - meta.get('infoLine1'), 'genre and release year', - default=(None, None), fatal=False, group=(1, 2)) - is_album = year is not None - - return self.playlist_result( - entries, playlist_id, - join_nonempty(artist, title, delim=' - ') if is_album else title, - unescapeHTML(meta.get('rawDescription')), - album=title if is_album else None, - uploader=artist, - artists=[artist] if is_album else None, - thumbnail=meta.get('coverUrl'), # XXX: should i also specify `thumbnails`? - genres=[unescapeHTML(genre)] if genre else None, - release_year=int_or_none(year), - modified_timestamp=int_or_none(meta.get('lastUpdated')), - view_count=int_or_none(meta.get('listens'))) + return self.playlist_result( + entries, playlist_id, + join_nonempty(artist, title, delim=' - ') if is_album else title, + unescapeHTML(meta.get('rawDescription')), + album=title if is_album else None, + uploader=artist, + artists=[artist] if is_album else None, + thumbnail=meta.get('coverUrl'), # XXX: should i also specify `thumbnails`? + genres=[unescapeHTML(genre)] if genre else None, + release_year=int_or_none(year), + modified_timestamp=int_or_none(meta.get('lastUpdated')), + view_count=int_or_none(meta.get('listens'))) class VKPlayBaseIE(InfoExtractor): @@ -1187,39 +1242,3 @@ def _real_extract(self, url): **self._extract_common_meta(stream_info), 'formats': formats, } - - -_BASE64_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN0PQRSTUVWXYZO123456789+/=' - - -def _b64_decode(enc): - dec = '' - e = n = 0 - for c in enc: - r = _BASE64_CHARS.index(c) - cond = n % 4 - e = 64 * e + r if cond else r - n += 1 - if cond: - dec += chr(255 & e >> (-2 * n & 6)) - return dec - - -def _unmask_url(mask_url, vk_id): - if 'audio_api_unavailable' in mask_url: - extra = mask_url.split('?extra=')[1].split('#') - func, base = _b64_decode(extra[1]).split(chr(11)) - mask_url = list(_b64_decode(extra[0])) - url_len = len(mask_url) - indexes = [None] * url_len - index = int(base) ^ vk_id - for n in range(url_len - 1, -1, -1): - index = (url_len * (n + 1) ^ index + n) % url_len - indexes[n] = index - for n in range(1, url_len): - c = mask_url[n] - index = indexes[url_len - 1 - n] - mask_url[n] = mask_url[index] - mask_url[index] = c - mask_url = ''.join(mask_url) - return mask_url From 27f0964061e83b8899caa0650fa0c21cae294298 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Fri, 4 Apr 2025 17:02:07 +0400 Subject: [PATCH 36/51] fix: ie list --- yt_dlp/extractor/_extractors.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 3c9299a922..6db04c98e2 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2382,7 +2382,8 @@ ) from .vk import ( VKIE, - VKMusicIE, + VKMusicPlaylistIE, + VKMusicTrackIE, VKPlayIE, VKPlayLiveIE, VKUserVideosIE, From 5f30070405f4c59a26556571fe621b09dbd0b7d3 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Fri, 4 Apr 2025 17:22:06 +0400 Subject: [PATCH 37/51] fix: playlist thumbnails --- yt_dlp/extractor/vk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 3831f04260..a61d3b4761 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -1088,7 +1088,7 @@ def _real_extract(self, url): album=title if is_album else None, uploader=artist, artists=[artist] if is_album else None, - thumbnail=meta.get('coverUrl'), # XXX: should i also specify `thumbnails`? + thumbnails=traverse_obj(meta, ({'url': 'coverUrl'}, {lambda obj: [obj]})), genres=[unescapeHTML(genre)] if genre else None, release_year=int_or_none(year), modified_timestamp=int_or_none(meta.get('lastUpdated')), From 45ca66f40d2bd0aa8c8a3aeecf59bc8b83147320 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Fri, 4 Apr 2025 17:54:57 +0400 Subject: [PATCH 38/51] style: return info dict instead of dynamically filling --- yt_dlp/extractor/vk.py | 55 +++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index a61d3b4761..4f16797371 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -792,40 +792,39 @@ def _unmask_url(self, mask_url, vk_id): def _parse_track_meta(self, meta, track_id=None): len_ = len(meta) - info = {} - info['id'] = f'{meta[1]}_{meta[0]}' \ - if len_ >= 2 and meta[1] and meta[0] \ - else track_id + # track title + title = unescapeHTML(meta[3]) if len_ >= 3 else None + # artists in one string, may include "feat." + artist = unescapeHTML(meta[4]) if len_ >= 4 else None - title = unescapeHTML(meta[3]) if len_ >= 3 else None # TODO: fallback - artist = unescapeHTML(meta[4]) if len_ >= 4 else None # artists in one string, may include "feat." - info['title'] = join_nonempty(artist, title, delim=' - ') - info['track'] = title - info['uploader'] = artist + return { + 'id': (f'{meta[1]}_{meta[0]}' + if len_ >= 2 and meta[1] and meta[0] + else track_id), - # artists as list - info['artists'] = ( - # not htmlescaped unlike meta[4] - traverse_obj((*meta[17], *meta[18]), (..., 'name')) - if len_ >= 18 else None - ) or [artist] + 'title': join_nonempty(artist, title, delim=' - '), + 'track': title, + 'uploader': artist, - info['duration'] = int_or_none(meta[5]) if len_ >= 5 else None - info['thumbnails'] = [{'url': meta[14]}] if len_ >= 14 else [] + # ['Main Artist', 'Feat. Artist'] + 'artists': traverse_obj( + (*meta[17], *meta[18]) if len_ >= 18 else None, + (..., 'name'), default=[artist]), - # meta[30] is 2 bits - # most significant: isExplicit - # least significant: isForeignAgent - # i. e. - # 00 = safe - # 01 = marked by RKN as "foreign agent" - # 10 = explicit lyrics - # 11 = both E lyrics and "foreign agent" - if len_ >= 30 and meta[30]: - info['age_limit'] = 18 + 'duration': int_or_none(meta[5]) if len_ >= 5 else None, + 'thumbnails': [{'url': meta[14]}] if len_ >= 14 else [], - return info + # meta[30] is 2 bits + # most significant: isExplicit + # least significant: isForeignAgent + # i. e. + # 00 = safe + # 01 = marked by RKN as "foreign agent" + # 10 = explicit lyrics + # 11 = both E lyrics and "foreign agent" + 'age_limit': 18 if len_ >= 30 and meta[30] else None, + } def _raise_if_blocked(self, meta, track_id): reason = traverse_obj( From ae8e14dcc310ffbd0010303a781b5c51c7346d59 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Fri, 4 Apr 2025 18:04:59 +0400 Subject: [PATCH 39/51] fix: age_limit, tests --- yt_dlp/extractor/vk.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 4f16797371..0942750b9b 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -823,7 +823,7 @@ def _parse_track_meta(self, meta, track_id=None): # 01 = marked by RKN as "foreign agent" # 10 = explicit lyrics # 11 = both E lyrics and "foreign agent" - 'age_limit': 18 if len_ >= 30 and meta[30] else None, + 'age_limit': 18 if len_ >= 30 and meta[30] else 0, } def _raise_if_blocked(self, meta, track_id): @@ -864,6 +864,7 @@ class VKMusicTrackIE(VKMusicBaseIE): 'artists': ['Skillet'], 'duration': 230, 'thumbnail': r're:https?://.*\.jpg', + 'age_limit': 0, }, 'params': { 'skip_download': True, @@ -881,11 +882,29 @@ class VKMusicTrackIE(VKMusicBaseIE): 'artists': ['Pusha T', 'Stormzy'], 'duration': 211, 'thumbnail': r're:https?://.*\.jpg', + 'age_limit': 0, }, 'params': { 'skip_download': True, }, }, + { + 'url': 'https://vk.com/audio-2001533203_5533203', + 'info_dict': { + 'id': '-2001533203_5533203', + 'ext': 'm4a', + 'title': 'Linkin Park feat. Page Hamilton - All for Nothing (feat. Page Hamilton)', + 'track': 'All for Nothing (feat. Page Hamilton)', + 'uploader': 'Linkin Park feat. Page Hamilton', + 'artists': ['Linkin Park', 'Page Hamilton'], + 'duration': 213, + 'thumbnail': r're:https?://.*\.jpg', + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + } + }, { 'note': 'meta is AudioPlayerBlock__root[data-exec], no artists in 17/18', 'skip': 'authentication required', @@ -916,6 +935,7 @@ class VKMusicTrackIE(VKMusicBaseIE): 'artists': ['Jack Thomas', 'Nico & Vinz'], 'duration': 207, 'thumbnail': r're:https?://.*\.jpg', + 'age_limit': 0, }, 'params': { 'skip_download': True, From ccf39d3c1d5eb28ce6520ac480023d3361ba9d0f Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Fri, 4 Apr 2025 18:07:18 +0400 Subject: [PATCH 40/51] fix: build a correct url instead of requesting with original --- yt_dlp/extractor/vk.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 0942750b9b..3468c4b464 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -872,7 +872,7 @@ class VKMusicTrackIE(VKMusicBaseIE): }, { 'note': 'artists are in meta[17], 18th item contains empty string', - 'url': 'https://vk.com/audio-2001844083_29844083', + 'url': 'https://m.vk.com/audio-2001844083_29844083', 'info_dict': { 'id': '-2001844083_29844083', 'ext': 'm4a', @@ -889,7 +889,7 @@ class VKMusicTrackIE(VKMusicBaseIE): }, }, { - 'url': 'https://vk.com/audio-2001533203_5533203', + 'url': 'https://m.vk.com/audio-2001533203_5533203', 'info_dict': { 'id': '-2001533203_5533203', 'ext': 'm4a', @@ -949,7 +949,9 @@ def _real_extract(self, url): access_hash = mobj.group('hash') if not access_hash: - webpage = self._download_webpage(url, track_id) + webpage = self._download_webpage( + f'https://vk.com/audio{track_id}', + track_id) data_audio = self._search_regex( r'data-audio="([^"]+)', webpage, 'data-audio attr', From 97703b4aaa464e8a7ab66c4ed7facccbbcc7cc6b Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Fri, 4 Apr 2025 18:12:57 +0400 Subject: [PATCH 41/51] fix(test): playlist thumbnails info dict key --- yt_dlp/extractor/vk.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 3468c4b464..db1b5afff8 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -1031,7 +1031,7 @@ class VKMusicPlaylistIE(VKMusicBaseIE): 'album': 'One More Light', 'uploader': 'Linkin Park', 'artists': ['Linkin Park'], - 'thumbnail': r're:https?://.*\.jpg', + 'thumbnails': [{'url': r're:https?://.*\.jpg'}], 'genres': ['Alternative'], 'release_year': 2017, 'modified_timestamp': int, @@ -1050,7 +1050,7 @@ class VKMusicPlaylistIE(VKMusicBaseIE): 'title': 'VK Fest 2024: Белая сцена', 'description': 'md5:6d652551bb1faaddbcd46321a77fa8d0', 'uploader': 'VK Музыка', - 'thumbnail': r're:https?://.*\.jpg', + 'thumbnails': [{'url': r're:https?://.*\.jpg'}], 'modified_timestamp': int, 'modified_date': str, 'view_count': int, From 96e51391952584821ea7e289bdfdc48f5ffb6a2d Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Fri, 4 Apr 2025 18:17:44 +0400 Subject: [PATCH 42/51] test: add access_hash to previously skipped test --- yt_dlp/extractor/vk.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index db1b5afff8..2fe9500afc 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -907,8 +907,7 @@ class VKMusicTrackIE(VKMusicBaseIE): }, { 'note': 'meta is AudioPlayerBlock__root[data-exec], no artists in 17/18', - 'skip': 'authentication required', - 'url': 'https://vk.com/audio-26549346_456239443', + 'url': 'https://vk.com/audio-26549346_456239443_59159cef5d080f5450', 'info_dict': { 'id': '-26549346_456239443', 'ext': 'm4a', @@ -917,7 +916,8 @@ class VKMusicTrackIE(VKMusicBaseIE): 'uploader': 'Fairie\'s Death Waltz', 'artists': ['Fairie\'s Death Waltz'], 'duration': 349, - 'thumbnail': r're:https?://.*\.jpg', + 'thumbnail': '', # TODO: skip incorrect URLs + 'age_limit': 0, }, 'params': { 'skip_download': True, From da6fd0d32bcf4bd1674cb7b16f771f2404423764 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Fri, 4 Apr 2025 18:27:43 +0400 Subject: [PATCH 43/51] fix: url_or_none for thumbnails --- yt_dlp/extractor/vk.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 2fe9500afc..4689622f08 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -798,6 +798,9 @@ def _parse_track_meta(self, meta, track_id=None): # artists in one string, may include "feat." artist = unescapeHTML(meta[4]) if len_ >= 4 else None + # album cover art url + thumbnail = url_or_none(meta[14] if len_ >= 14 else None) + return { 'id': (f'{meta[1]}_{meta[0]}' if len_ >= 2 and meta[1] and meta[0] @@ -813,7 +816,7 @@ def _parse_track_meta(self, meta, track_id=None): (..., 'name'), default=[artist]), 'duration': int_or_none(meta[5]) if len_ >= 5 else None, - 'thumbnails': [{'url': meta[14]}] if len_ >= 14 else [], + 'thumbnails': [{'url': thumbnail}] if thumbnail else [], # meta[30] is 2 bits # most significant: isExplicit @@ -906,7 +909,6 @@ class VKMusicTrackIE(VKMusicBaseIE): } }, { - 'note': 'meta is AudioPlayerBlock__root[data-exec], no artists in 17/18', 'url': 'https://vk.com/audio-26549346_456239443_59159cef5d080f5450', 'info_dict': { 'id': '-26549346_456239443', @@ -916,7 +918,6 @@ class VKMusicTrackIE(VKMusicBaseIE): 'uploader': 'Fairie\'s Death Waltz', 'artists': ['Fairie\'s Death Waltz'], 'duration': 349, - 'thumbnail': '', # TODO: skip incorrect URLs 'age_limit': 0, }, 'params': { @@ -1094,14 +1095,17 @@ def _real_extract(self, url): entries.append(self.url_result( audio_url, VKMusicTrackIE, track_id, title, **info)) - title = unescapeHTML(meta.get('title')) # TODO: fallback + title = unescapeHTML(meta.get('title')) artist = unescapeHTML(meta.get('authorName')) + genre, year = self._search_regex( r'^([^<]+)<\s*span[^>]*>[^<]*(\d+)$', meta.get('infoLine1'), 'genre and release year', default=(None, None), fatal=False, group=(1, 2)) is_album = year is not None + thumbnail = url_or_none(meta.get('coverUrl')) + return self.playlist_result( entries, playlist_id, join_nonempty(artist, title, delim=' - ') if is_album else title, @@ -1109,7 +1113,7 @@ def _real_extract(self, url): album=title if is_album else None, uploader=artist, artists=[artist] if is_album else None, - thumbnails=traverse_obj(meta, ({'url': 'coverUrl'}, {lambda obj: [obj]})), + thumbnails=[{'url': thumbnail}] if thumbnail else [], genres=[unescapeHTML(genre)] if genre else None, release_year=int_or_none(year), modified_timestamp=int_or_none(meta.get('lastUpdated')), From 8df9e488ee7049ba5891d3dd21c8c549d1821095 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Fri, 4 Apr 2025 18:35:17 +0400 Subject: [PATCH 44/51] style: hatch fix --- yt_dlp/extractor/vk.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 4689622f08..05b9bfa4a7 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -803,8 +803,8 @@ def _parse_track_meta(self, meta, track_id=None): return { 'id': (f'{meta[1]}_{meta[0]}' - if len_ >= 2 and meta[1] and meta[0] - else track_id), + if len_ >= 2 and meta[1] and meta[0] + else track_id), 'title': join_nonempty(artist, title, delim=' - '), 'track': title, @@ -906,7 +906,7 @@ class VKMusicTrackIE(VKMusicBaseIE): }, 'params': { 'skip_download': True, - } + }, }, { 'url': 'https://vk.com/audio-26549346_456239443_59159cef5d080f5450', From 44ed9058e9145f3a346768eeec48c62cc922d3d9 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Fri, 4 Apr 2025 23:21:15 +0400 Subject: [PATCH 45/51] feat: rewrite playlist parser for html webpage, add test --- yt_dlp/extractor/vk.py | 97 +++++++++++++++++++++++++----------------- 1 file changed, 59 insertions(+), 38 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 05b9bfa4a7..95d08425f2 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -14,6 +14,7 @@ UserNotLive, clean_html, get_element_by_class, + get_element_html_by_class, get_element_html_by_id, int_or_none, join_nonempty, @@ -1028,16 +1029,12 @@ class VKMusicPlaylistIE(VKMusicBaseIE): 'info_dict': { 'id': '-2000984503_984503', 'title': 'Linkin Park - One More Light', - 'description': '', 'album': 'One More Light', 'uploader': 'Linkin Park', 'artists': ['Linkin Park'], 'thumbnails': [{'url': r're:https?://.*\.jpg'}], 'genres': ['Alternative'], 'release_year': 2017, - 'modified_timestamp': int, - 'modified_date': str, - 'view_count': int, }, 'playlist_count': 10, 'params': { @@ -1052,72 +1049,96 @@ class VKMusicPlaylistIE(VKMusicBaseIE): 'description': 'md5:6d652551bb1faaddbcd46321a77fa8d0', 'uploader': 'VK Музыка', 'thumbnails': [{'url': r're:https?://.*\.jpg'}], - 'modified_timestamp': int, - 'modified_date': str, - 'view_count': int, }, 'playlist_count': 18, 'params': { 'skip_download': True, }, }, + { + 'url': 'https://vk.com/music/playlist/-147845620_2206_876b2b81e867a433c2', + 'info_dict': { + 'id': '-147845620_2206', + 'title': 'Электроника: новинки', + 'description': 're:^Актуальные новинки электронной музыки', + 'uploader': 'VK Музыка', + 'thumbnails': [{'url': r're:https?://.*\.jpg'}], + }, + 'playlist_mincount': 100, + 'params': { + 'extract_flat': True, # DO NOT process 150+ entries + 'skip_download': True, + }, + }, ] def _real_extract(self, url): mobj = self._match_valid_url(url) playlist_id = mobj.group('full_id') + access_hash = mobj.group('hash') - meta = self._download_payload('al_audio', playlist_id, { - 'act': 'load_section', - 'access_hash': mobj.group('hash') or '', - 'claim': '0', - 'context': '', - 'from_id': self._parse_vk_id(), - 'is_loading_all': '1', - 'is_preload': '0', - 'offset': '0', - 'owner_id': mobj.group('oid'), - 'playlist_id': mobj.group('id'), - 'ref': '', - 'type': 'playlist', - })[0] - tracks = meta['list'] + hash_in_url = f'_{access_hash}' if access_hash else '' + webpage = self._download_webpage( + f'https://vk.com/music/album/{playlist_id}{hash_in_url}', + playlist_id) + del hash_in_url + + html = get_element_html_by_class('AudioPlaylistSnippet', webpage) + del webpage entries = [] - for ent in tracks: - info = self._parse_track_meta(ent) + + for mobj in re.finditer(r'data-audio="([^"]+)', html): + meta = self._parse_json( + unescapeHTML(mobj.group(1)), + playlist_id, fatal=False) + if not meta: + continue + + info = self._parse_track_meta(meta) track_id = info.pop('id') title = info.pop('title') - ent_hash = f'_{ent[24]}' if len(ent) >= 24 and ent[24] else '' - audio_url = f'https://vk.com/audio{track_id}{ent_hash}' + hash_in_url = f'_{meta[24]}' if len(meta) >= 24 and meta[24] else '' + audio_url = f'https://vk.com/audio{track_id}{hash_in_url}' entries.append(self.url_result( audio_url, VKMusicTrackIE, track_id, title, **info)) - title = unescapeHTML(meta.get('title')) - artist = unescapeHTML(meta.get('authorName')) + title = self._html_search_regex( + r'class="[^"]*AudioPlaylistSnippet__title--main[^"]*"[^>]*>([^<]+)', + html, 'playlist title', fatal=False, group=1) + + artist = self._html_search_regex( + r'class="[^"]*AudioPlaylistSnippet__author[^"]*"[^>]*>\s*]*)?>([^<]+)', + html, 'playlist author', fatal=False, group=1) + + description = clean_html(get_element_by_class( + 'AudioPlaylistSnippet__description', html)) + # description = self._html_search_regex( + # r'div\s[^>]*class="[^"]*AudioPlaylistSnippet__description[^"]*">??????', + # html, 'playlist description', fatal=False, group=1) + + genre, year = self._html_search_regex( + r'class="[^"]*AudioPlaylistSnippet__info[^"]*"[^>]*>\s*(.+) .*;(\d+)\s*]*>[^<]*(\d+)$', - meta.get('infoLine1'), 'genre and release year', - default=(None, None), fatal=False, group=(1, 2)) is_album = year is not None - thumbnail = url_or_none(meta.get('coverUrl')) + thumbnail = url_or_none(self._html_search_regex( + r'class="[^"]*AudioPlaylistSnippet__cover[^"]*"[^>]*style="background-image\s*:\s*url\s*\(\s*\'([^\']+)', + html, 'playlist thumbnail', fatal=False, group=1)) return self.playlist_result( entries, playlist_id, join_nonempty(artist, title, delim=' - ') if is_album else title, - unescapeHTML(meta.get('rawDescription')), + description, album=title if is_album else None, uploader=artist, artists=[artist] if is_album else None, thumbnails=[{'url': thumbnail}] if thumbnail else [], - genres=[unescapeHTML(genre)] if genre else None, - release_year=int_or_none(year), - modified_timestamp=int_or_none(meta.get('lastUpdated')), - view_count=int_or_none(meta.get('listens'))) + genres=[genre] if genre else None, + release_year=int_or_none(year)) class VKPlayBaseIE(InfoExtractor): From 70e0c591be87a511578e9a1f5d475fbe9a170dcf Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Mon, 7 Apr 2025 13:19:30 +0400 Subject: [PATCH 46/51] perf: `default=`->`or` to avoid creating useless lists --- yt_dlp/extractor/vk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 95d08425f2..dad19d9944 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -814,7 +814,7 @@ def _parse_track_meta(self, meta, track_id=None): # ['Main Artist', 'Feat. Artist'] 'artists': traverse_obj( (*meta[17], *meta[18]) if len_ >= 18 else None, - (..., 'name'), default=[artist]), + (..., 'name')) or [artist], 'duration': int_or_none(meta[5]) if len_ >= 5 else None, 'thumbnails': [{'url': thumbnail}] if thumbnail else [], From cdbfe3a79311078ec73753763b34570332448f06 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Mon, 7 Apr 2025 13:51:39 +0400 Subject: [PATCH 47/51] fix: custom regexs -> yt-dlp html helpers (for reliability) --- yt_dlp/extractor/vk.py | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index dad19d9944..06f7849e5c 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -13,6 +13,7 @@ ExtractorError, UserNotLive, clean_html, + extract_attributes, get_element_by_class, get_element_html_by_class, get_element_html_by_id, @@ -1083,6 +1084,7 @@ def _real_extract(self, url): playlist_id) del hash_in_url + # to remove big scripts and other elements not used by parser html = get_element_html_by_class('AudioPlaylistSnippet', webpage) del webpage @@ -1105,29 +1107,29 @@ def _real_extract(self, url): entries.append(self.url_result( audio_url, VKMusicTrackIE, track_id, title, **info)) - title = self._html_search_regex( - r'class="[^"]*AudioPlaylistSnippet__title--main[^"]*"[^>]*>([^<]+)', - html, 'playlist title', fatal=False, group=1) + header = get_element_html_by_class('AudioPlaylistSnippet__header', html) - artist = self._html_search_regex( - r'class="[^"]*AudioPlaylistSnippet__author[^"]*"[^>]*>\s*]*)?>([^<]+)', - html, 'playlist author', fatal=False, group=1) + title = clean_html(get_element_by_class('AudioPlaylistSnippet__title', header)) + artist = clean_html(get_element_by_class('AudioPlaylistSnippet__author', header)) - description = clean_html(get_element_by_class( - 'AudioPlaylistSnippet__description', html)) - # description = self._html_search_regex( - # r'div\s[^>]*class="[^"]*AudioPlaylistSnippet__description[^"]*">??????', - # html, 'playlist description', fatal=False, group=1) - - genre, year = self._html_search_regex( - r'class="[^"]*AudioPlaylistSnippet__info[^"]*"[^>]*>\s*(.+) .*;(\d+)\s*]*style="background-image\s*:\s*url\s*\(\s*\'([^\']+)', - html, 'playlist thumbnail', fatal=False, group=1)) + del header + + description = clean_html(get_element_by_class('AudioPlaylistSnippet__description', html)) + + thumbnail = url_or_none(self._search_regex( + r'background[^:;]*:\s*url\s*\(\s*\'([^\']+)', + extract_attributes( + get_element_html_by_class( + 'AudioPlaylistSnippet__cover', + html)).get('style'), + 'playlist thumbnail', fatal=False, group=1)) return self.playlist_result( entries, playlist_id, @@ -1138,7 +1140,7 @@ def _real_extract(self, url): artists=[artist] if is_album else None, thumbnails=[{'url': thumbnail}] if thumbnail else [], genres=[genre] if genre else None, - release_year=int_or_none(year)) + release_year=year) class VKPlayBaseIE(InfoExtractor): From 82b6c1948b19f1f40fcb1f89cbcbc7f1db056537 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Mon, 7 Apr 2025 14:00:12 +0400 Subject: [PATCH 48/51] perf: parse vk_id once --- yt_dlp/extractor/vk.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 06f7849e5c..728f423b2f 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -950,6 +950,7 @@ def _real_extract(self, url): mobj = self._match_valid_url(url) track_id = mobj.group('id') access_hash = mobj.group('hash') + vk_id = self._parse_vk_id() if not access_hash: webpage = self._download_webpage( @@ -963,7 +964,7 @@ def _real_extract(self, url): if data_audio: meta = self._parse_json(unescapeHTML(data_audio), track_id) else: - if self._parse_vk_id() == 0: + if vk_id == 0: self.raise_login_required( 'This track is unavailable. ' 'Log in or provide a link with access hash') @@ -996,7 +997,7 @@ def _real_extract(self, url): meta = meta[0] self._raise_if_blocked(meta, track_id) - url = self._unmask_url(meta[2], self._parse_vk_id()) + url = self._unmask_url(meta[2], vk_id) return { **self._parse_track_meta(meta, track_id), From f8e8a31bfec475240840c6b2483eaa68d705008d Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Mon, 7 Apr 2025 14:02:38 +0400 Subject: [PATCH 49/51] fix(test): ensure thumbnail url starts with http --- yt_dlp/extractor/vk.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 728f423b2f..3a179d68d3 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -868,7 +868,7 @@ class VKMusicTrackIE(VKMusicBaseIE): 'uploader': 'Skillet', 'artists': ['Skillet'], 'duration': 230, - 'thumbnail': r're:https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'age_limit': 0, }, 'params': { @@ -886,7 +886,7 @@ class VKMusicTrackIE(VKMusicBaseIE): 'uploader': 'Pusha T, Stormzy', 'artists': ['Pusha T', 'Stormzy'], 'duration': 211, - 'thumbnail': r're:https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'age_limit': 0, }, 'params': { @@ -903,7 +903,7 @@ class VKMusicTrackIE(VKMusicBaseIE): 'uploader': 'Linkin Park feat. Page Hamilton', 'artists': ['Linkin Park', 'Page Hamilton'], 'duration': 213, - 'thumbnail': r're:https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'age_limit': 18, }, 'params': { @@ -937,7 +937,7 @@ class VKMusicTrackIE(VKMusicBaseIE): 'uploader': 'Jack Thomas feat. Nico & Vinz', 'artists': ['Jack Thomas', 'Nico & Vinz'], 'duration': 207, - 'thumbnail': r're:https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'age_limit': 0, }, 'params': { @@ -1034,7 +1034,7 @@ class VKMusicPlaylistIE(VKMusicBaseIE): 'album': 'One More Light', 'uploader': 'Linkin Park', 'artists': ['Linkin Park'], - 'thumbnails': [{'url': r're:https?://.*\.jpg'}], + 'thumbnails': [{'url': r're:^https?://.*\.jpg'}], 'genres': ['Alternative'], 'release_year': 2017, }, @@ -1050,7 +1050,7 @@ class VKMusicPlaylistIE(VKMusicBaseIE): 'title': 'VK Fest 2024: Белая сцена', 'description': 'md5:6d652551bb1faaddbcd46321a77fa8d0', 'uploader': 'VK Музыка', - 'thumbnails': [{'url': r're:https?://.*\.jpg'}], + 'thumbnails': [{'url': r're:^https?://.*\.jpg'}], }, 'playlist_count': 18, 'params': { @@ -1064,7 +1064,7 @@ class VKMusicPlaylistIE(VKMusicBaseIE): 'title': 'Электроника: новинки', 'description': 're:^Актуальные новинки электронной музыки', 'uploader': 'VK Музыка', - 'thumbnails': [{'url': r're:https?://.*\.jpg'}], + 'thumbnails': [{'url': r're:^https?://.*\.jpg'}], }, 'playlist_mincount': 100, 'params': { From bcfdd8e9876f6cf747b9b6bdd4218999303b7e76 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Mon, 7 Apr 2025 14:20:55 +0400 Subject: [PATCH 50/51] test: playlist with htmlescaped in title and author --- yt_dlp/extractor/vk.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 3a179d68d3..c75002efd9 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -1043,6 +1043,20 @@ class VKMusicPlaylistIE(VKMusicBaseIE): 'skip_download': True, }, }, + { + 'note': 'special symbols must be unescaped', + 'url': 'https://vk.com/music/playlist/-25611523_85178143', + 'info_dict': { + 'id': '-25611523_85178143', + 'title': 'Mondträume – Lovers, Sinners & Liars (2019, Alfa Matrix)', + 'uploader': 'E:\\music\\futurepop', + 'thumbnails': [{'url': r're:^https?://.*\.jpg'}], + }, + 'playlist_count': 22, + 'params': { + 'skip_download': True, + }, + }, { 'url': 'https://vk.com/audios877252112?block=playlists§ion=general&z=audio_playlist-147845620_2390', 'info_dict': { From 7df0391049dc598fcb69c8478d3be9fe45a91fd9 Mon Sep 17 00:00:00 2001 From: DarkCat09 Date: Mon, 7 Apr 2025 16:20:43 +0400 Subject: [PATCH 51/51] fix: better unavailability check (2) --- yt_dlp/extractor/vk.py | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index c75002efd9..5141fde93f 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -832,19 +832,20 @@ def _parse_track_meta(self, meta, track_id=None): } def _raise_if_blocked(self, meta, track_id): + if len(meta) < 12: + return None + reason = traverse_obj( - self._parse_json( - meta[12] if len(meta) >= 12 else None, - track_id, fatal=False), + self._parse_json(meta[12], track_id, fatal=False), ('claim', 'reason')) - if reason == 'geo': - self.raise_geo_restricted() - # can be an empty string - elif reason is not None: + if reason is not None: + if reason == 'geo': + self.raise_geo_restricted() + + # an empty string or an internal ID raise ExtractorError( - 'This track is unavailable. ' - f'Reason code: {reason:r}') + f'This track is unavailable. Reason code: {reason:r}') class VKMusicTrackIE(VKMusicBaseIE): @@ -986,16 +987,16 @@ def _real_extract(self, url): access_hash = meta[24] - meta = self._download_payload('al_audio', track_id, { - 'act': 'reload_audios', - 'audio_ids': f'{track_id}_{access_hash}', - })[0] + try: + meta = self._download_payload('al_audio', track_id, { + 'act': 'reload_audios', + 'audio_ids': f'{track_id}_{access_hash}', + })[0][0] + except (ExtractorError, IndexError): + if vk_id == 0: + self.raise_login_required() + raise ExtractorError('This track is unavailable') - # vk sends an empty list when auth required - if not meta: - self.raise_login_required() - - meta = meta[0] self._raise_if_blocked(meta, track_id) url = self._unmask_url(meta[2], vk_id)