From 05285b7e0d585a74c39e07b295b265c020603b9e Mon Sep 17 00:00:00 2001 From: harsh1504660 Date: Sat, 9 Aug 2025 23:29:50 +0530 Subject: [PATCH 1/6] Update _real_extract in BandcampWeeklyIE class to handle the KeyError --- yt_dlp/extractor/bandcamp.py | 88 +++++++++++++++++++++++++++--------- 1 file changed, 66 insertions(+), 22 deletions(-) diff --git a/yt_dlp/extractor/bandcamp.py b/yt_dlp/extractor/bandcamp.py index 0a8f88fa8..4c9c9c148 100644 --- a/yt_dlp/extractor/bandcamp.py +++ b/yt_dlp/extractor/bandcamp.py @@ -440,39 +440,83 @@ def _real_extract(self, url): blob = self._extract_data_attr(webpage, show_id, 'blob') - show = blob['bcw_data'][show_id] + # Updated to correctly navigate the new data structure + # The data is now in a list under appData['shows'] + shows_list = try_get(blob, lambda x: x['appData']['shows'], list) + show = None + if shows_list: + for s in shows_list: + if str(s.get('showId')) == show_id: + show = s + break + + if not show: + # Fallback to the original logic if the new path fails + show = try_get(blob, lambda x: x['bcw_data'][show_id], dict) + + if not show: + raise ExtractorError('Bandcamp Weekly data not found. This extractor is outdated. 
Please report this issue.') formats = [] - for format_id, format_url in show['audio_stream'].items(): - if not url_or_none(format_url): - continue - for known_ext in KNOWN_EXTENSIONS: - if known_ext in format_id: - ext = known_ext - break - else: - ext = None - formats.append({ - 'format_id': format_id, - 'url': format_url, - 'ext': ext, - 'vcodec': 'none', - }) + # The audio track ID is now in the 'audioTrackId' key + audio_track_id = str_or_none(show.get('audioTrackId')) - title = show.get('audio_title') or 'Bandcamp Weekly' - subtitle = show.get('subtitle') + # If audio track ID is found, download the audio page to get formats + if audio_track_id: + track_url = f'https://bandcamp.com/download?id={audio_track_id}' + audio_page = self._download_webpage( + track_url, show_id, 'Downloading audio download page') + + # The download links are on the new page, so we need a new way to parse + audio_blob = self._extract_data_attr(audio_page, show_id, 'blob', fatal=False) + if audio_blob: + # The formats are now in the 'downloads' list within the audio_blob + downloads = try_get(audio_blob, lambda x: x['digital_items'][0]['downloads'], dict) + if downloads: + for format_id, f in downloads.items(): + formats.append({ + 'url': f.get('url'), + 'format_id': format_id, + 'ext': f.get('encoding_name'), + 'vcodec': 'none', + }) + + # Fallback to the old logic if new parsing fails + if not formats and show.get('audio_stream'): + for format_id, format_url in show['audio_stream'].items(): + if not url_or_none(format_url): + continue + for known_ext in KNOWN_EXTENSIONS: + if known_ext in format_id: + ext = known_ext + break + else: + ext = None + formats.append({ + 'format_id': format_id, + 'url': format_url, + 'ext': ext, + 'vcodec': 'none', + }) + + # If no formats were found after all attempts, raise an error + if not formats: + raise ExtractorError('Could not find any audio formats for this episode.') + + title = show.get('audio_title') or show.get('title') or 'Bandcamp Weekly' + 
subtitle = show.get('shortDesc') if subtitle: title += f' - {subtitle}' - + return { 'id': show_id, 'title': title, - 'description': show.get('desc') or show.get('short_desc'), + 'description': show.get('desc') or show.get('shortDesc'), 'duration': float_or_none(show.get('audio_duration')), 'is_live': False, - 'release_date': unified_strdate(show.get('published_date')), + 'release_date': unified_strdate(show.get('date')), 'series': 'Bandcamp Weekly', - 'episode': show.get('subtitle'), + 'episode': show.get('shortDesc'), 'episode_id': show_id, 'formats': formats, } From 3eac0a8bfa3e3c544bc496ba4813a4a2dd9a4d7f Mon Sep 17 00:00:00 2001 From: harsh1504660 Date: Sat, 9 Aug 2025 23:40:21 +0530 Subject: [PATCH 2/6] Removed blankspaces --- yt_dlp/extractor/bandcamp.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/yt_dlp/extractor/bandcamp.py b/yt_dlp/extractor/bandcamp.py index 4c9c9c148..29538178b 100644 --- a/yt_dlp/extractor/bandcamp.py +++ b/yt_dlp/extractor/bandcamp.py @@ -279,8 +279,6 @@ def _real_extract(self, url): 'formats': formats, 'tags': traverse_obj(webpage, ({find_elements(cls='tag')}, ..., {clean_html})), } - - class BandcampAlbumIE(BandcampIE): # XXX: Do not subclass from concrete IE IE_NAME = 'Bandcamp:album' _VALID_URL = r'https?://(?:(?P[^.]+)\.)?bandcamp\.com/album/(?P[^/?#&]+)' @@ -407,8 +405,6 @@ def _real_extract(self, url): 'description': current.get('about'), 'entries': entries, } - - class BandcampWeeklyIE(BandcampIE): # XXX: Do not subclass from concrete IE IE_NAME = 'Bandcamp:weekly' _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P\d+)' @@ -520,8 +516,6 @@ def _real_extract(self, url): 'episode_id': show_id, 'formats': formats, } - - class BandcampUserIE(InfoExtractor): IE_NAME = 'Bandcamp:user' _VALID_URL = r'https?://(?!www\.)(?P[^.]+)\.bandcamp\.com(?:/music)?/?(?:[#?]|$)' From 296b10386f18bf5398bfc80b8484ae63af12d5aa Mon Sep 17 00:00:00 2001 From: harsh1504660 Date: Sat, 9 Aug 2025 23:53:50 +0530 Subject: 
[PATCH 3/6] Removed blankspaces --- yt_dlp/extractor/bandcamp.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/bandcamp.py b/yt_dlp/extractor/bandcamp.py index 29538178b..0e3cfaeba 100644 --- a/yt_dlp/extractor/bandcamp.py +++ b/yt_dlp/extractor/bandcamp.py @@ -145,7 +145,6 @@ class BandcampIE(InfoExtractor): 'uploader_url': 'https://stayinside.bandcamp.com', }, }] - def _extract_data_attr(self, webpage, video_id, attr='tralbum', fatal=True): return self._parse_json(self._html_search_regex( rf'data-{attr}=(["\'])({{.+?}})\1', webpage, @@ -463,7 +462,7 @@ def _real_extract(self, url): audio_page = self._download_webpage( track_url, show_id, 'Downloading audio download page') - # The download links are on the new page, so we need a new way to parse + audio_blob = self._extract_data_attr(audio_page, show_id, 'blob', fatal=False) if audio_blob: # The formats are now in the 'downloads' list within the audio_blob @@ -477,7 +476,7 @@ def _real_extract(self, url): 'vcodec': 'none', }) - # Fallback to the old logic if new parsing fails + if not formats and show.get('audio_stream'): for format_id, format_url in show['audio_stream'].items(): if not url_or_none(format_url): @@ -495,7 +494,6 @@ def _real_extract(self, url): 'vcodec': 'none', }) - # If no formats were found after all attempts, raise an error if not formats: raise ExtractorError('Could not find any audio formats for this episode.') @@ -503,7 +501,6 @@ def _real_extract(self, url): subtitle = show.get('shortDesc') if subtitle: title += f' - {subtitle}' - return { 'id': show_id, 'title': title, From 688b76d4052803d2e0073c6f86dc8b8071a9854a Mon Sep 17 00:00:00 2001 From: harsh1504660 Date: Sat, 9 Aug 2025 23:55:42 +0530 Subject: [PATCH 4/6] Removed blankspaces --- yt_dlp/extractor/bandcamp.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/yt_dlp/extractor/bandcamp.py b/yt_dlp/extractor/bandcamp.py index 0e3cfaeba..c2defe1bc 100644 --- a/yt_dlp/extractor/bandcamp.py 
+++ b/yt_dlp/extractor/bandcamp.py @@ -461,8 +461,6 @@ def _real_extract(self, url): track_url = f'https://bandcamp.com/download?id={audio_track_id}' audio_page = self._download_webpage( track_url, show_id, 'Downloading audio download page') - - audio_blob = self._extract_data_attr(audio_page, show_id, 'blob', fatal=False) if audio_blob: # The formats are now in the 'downloads' list within the audio_blob @@ -475,8 +473,6 @@ def _real_extract(self, url): 'ext': f.get('encoding_name'), 'vcodec': 'none', }) - - if not formats and show.get('audio_stream'): for format_id, format_url in show['audio_stream'].items(): if not url_or_none(format_url): @@ -493,7 +489,6 @@ def _real_extract(self, url): 'ext': ext, 'vcodec': 'none', }) - if not formats: raise ExtractorError('Could not find any audio formats for this episode.') From e26ff3c4827fb556ceef7914a973049402ddf6d2 Mon Sep 17 00:00:00 2001 From: harsh1504660 Date: Sun, 10 Aug 2025 00:01:40 +0530 Subject: [PATCH 5/6] Update 404 not found --- yt_dlp/extractor/bandcamp.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/yt_dlp/extractor/bandcamp.py b/yt_dlp/extractor/bandcamp.py index c2defe1bc..77dff9d67 100644 --- a/yt_dlp/extractor/bandcamp.py +++ b/yt_dlp/extractor/bandcamp.py @@ -278,6 +278,8 @@ def _real_extract(self, url): 'formats': formats, 'tags': traverse_obj(webpage, ({find_elements(cls='tag')}, ..., {clean_html})), } + + class BandcampAlbumIE(BandcampIE): # XXX: Do not subclass from concrete IE IE_NAME = 'Bandcamp:album' _VALID_URL = r'https?://(?:(?P[^.]+)\.)?bandcamp\.com/album/(?P[^/?#&]+)' @@ -404,6 +406,8 @@ def _real_extract(self, url): 'description': current.get('about'), 'entries': entries, } + + class BandcampWeeklyIE(BandcampIE): # XXX: Do not subclass from concrete IE IE_NAME = 'Bandcamp:weekly' _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P\d+)' @@ -508,6 +512,8 @@ def _real_extract(self, url): 'episode_id': show_id, 'formats': formats, } + + class 
BandcampUserIE(InfoExtractor): IE_NAME = 'Bandcamp:user' _VALID_URL = r'https?://(?!www\.)(?P[^.]+)\.bandcamp\.com(?:/music)?/?(?:[#?]|$)' From 364ef4b90bb65bf3f0b54dae2a3b0dd61845f7cd Mon Sep 17 00:00:00 2001 From: harsh1504660 Date: Sun, 10 Aug 2025 00:07:47 +0530 Subject: [PATCH 6/6] Update 404 not found --- yt_dlp/extractor/bandcamp.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/bandcamp.py b/yt_dlp/extractor/bandcamp.py index 77dff9d67..b5d725c48 100644 --- a/yt_dlp/extractor/bandcamp.py +++ b/yt_dlp/extractor/bandcamp.py @@ -71,7 +71,7 @@ class BandcampIE(InfoExtractor): 'album_artists': ['Ben Prunty'], }, }, { - # no free download, mp3 128 + # track from compilation album (artist/album_artist difference) 'url': 'https://relapsealumni.bandcamp.com/track/hail-to-fire', 'md5': 'fec12ff55e804bb7f7ebeb77a800c8b7', 'info_dict': { @@ -96,7 +96,7 @@ class BandcampIE(InfoExtractor): 'album_artists': ['Mastodon'], }, }, { - # track from compilation album (artist/album_artist difference) + # FIXME: Embed detection 'url': 'https://diskotopia.bandcamp.com/track/safehouse', 'md5': '19c5337bca1428afa54129f86a2f6a69', 'info_dict': { @@ -145,6 +145,7 @@ class BandcampIE(InfoExtractor): 'uploader_url': 'https://stayinside.bandcamp.com', }, }] + def _extract_data_attr(self, webpage, video_id, attr='tralbum', fatal=True): return self._parse_json(self._html_search_regex( rf'data-{attr}=(["\'])({{.+?}})\1', webpage, @@ -439,8 +440,6 @@ def _real_extract(self, url): blob = self._extract_data_attr(webpage, show_id, 'blob') - # Updated to correctly navigate the new data structure - # The data is now in a list under appData['shows'] shows_list = try_get(blob, lambda x: x['appData']['shows'], list) show = None if shows_list: @@ -450,14 +449,13 @@ def _real_extract(self, url): break if not show: - # Fallback to the original logic if the new path fails + show = try_get(blob, lambda x: x['bcw_data'][show_id], dict) if not show: raise 
ExtractorError('Bandcamp Weekly data not found. This extractor is outdated. Please report this issue.') formats = [] - # The audio track ID is now in the 'audioTrackId' key audio_track_id = str_or_none(show.get('audioTrackId')) # If audio track ID is found, download the audio page to get formats