From 1b4d0401e47e9cd9340a0dd071285a26ac8674a1 Mon Sep 17 00:00:00 2001 From: TheQWERTYCodr <93845040+TheQWERTYCodr@users.noreply.github.com> Date: Fri, 1 Aug 2025 02:55:53 -0400 Subject: [PATCH 1/6] fix youtube music metadata extraction Fixed the metadata extraction regex's catastrophic backtracking, made it faster on all inputs, and added proper support for artists using the middle dot character. And now, a rant about properly checking your work and learning how to do shit before you publish changes: Simulated atomic groups did not make the regex faster - you added a newline. Simulated atomic groups are always (guaranteed!) slower than normal groups, and removing them from the old regex makes that regex faster: https://regex101.com/r/8Ssf2h/3 This is fairly obvious to anyone who has actually learned how regexes are matched. The fix is to add a delimiter to the start of the expression: https://regex101.com/r/XqqucW/1 Without (?:\n|^), the regex attempts to find a match starting at every possible title character (which is virtually every location); it will then attempt to extend this until it can't do so. For the string "hello", it would have to check "hello", "ello", "llo", "lo", and "o". This is what backtracking is, and it causes quadratic performance in the number of input characters. Again, this is fairly obvious to anyone who has actually learned how regexes are matched. I really hope the next person to "improve" this actually takes the time to review their changes before pushing them. 
--- yt_dlp/extractor/youtube/_video.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index 171aa9b5c4..ada2f495ae 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -4177,20 +4177,15 @@ def process_language(container, base_url, lang_code, sub_name, client_name, quer # Youtube Music Auto-generated description if (video_description or '').strip().endswith('\nAuto-generated by YouTube.'): - # XXX: Causes catastrophic backtracking if description has "·" - # E.g. https://www.youtube.com/watch?v=DoPaAxMQoiI - # Simulating atomic groups: (?P[^xy]+)x => (?=(?P[^xy]+))(?P=a)x - # reduces it, but does not fully fix it. https://regex101.com/r/8Ssf2h/2 + # Before you change this, learn how regexes work. The last guy didn't. mobj = re.search( r'''(?xs) - (?=(?P[^\n·]+))(?P=track)· - (?=(?P[^\n]+))(?P=artist)\n+ - (?=(?P[^\n]+))(?P=album)\n - (?:.+?℗\s*(?P\d{4})(?!\d))? - (?:.+?Released\ on\s*:\s*(?P\d{4}-\d{2}-\d{2}))? - (.+?\nArtist\s*:\s* - (?=(?P[^\n]+))(?P=clean_artist)\n - )?.+\nAuto-generated\ by\ YouTube\.\s*$ + (?:\n|^)(?P[^\n·]+)\ ·\ (?P[^\n]+)\n+ + (?P[^\n]+)\n+ + (?:℗\s*(?P\d{4})[^\n]+\n+)? + (?:Released\ on\s*:\s*(?P\d{4}-\d{2}-\d{2}))?.+? + (\nArtist\s*:\s*(?P[^\n]+)\n)? 
+ .+Auto-generated\ by\ YouTube\.\s*$ ''', video_description) if mobj: release_year = mobj.group('release_year') From 08c7d5379ef58e44ed7b14d2a857e4e2104571da Mon Sep 17 00:00:00 2001 From: TheQWERTYCodr <93845040+TheQWERTYCodr@users.noreply.github.com> Date: Fri, 1 Aug 2025 03:29:14 -0400 Subject: [PATCH 2/6] slight revision _video.py .+ outside the artist group steamrolls any potential artist tag, so we move it inside the artist group --- yt_dlp/extractor/youtube/_video.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index ada2f495ae..75cfbb4903 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -4184,8 +4184,8 @@ def process_language(container, base_url, lang_code, sub_name, client_name, quer (?P[^\n]+)\n+ (?:℗\s*(?P\d{4})[^\n]+\n+)? (?:Released\ on\s*:\s*(?P\d{4}-\d{2}-\d{2}))?.+? - (\nArtist\s*:\s*(?P[^\n]+)\n)? - .+Auto-generated\ by\ YouTube\.\s*$ + (\nArtist\s*:\s*(?P[^\n]+)\n.+)? + Auto-generated\ by\ YouTube\.\s*$ ''', video_description) if mobj: release_year = mobj.group('release_year') From 9116f98f499e4852170aa9a6d178903d021b2156 Mon Sep 17 00:00:00 2001 From: TheQWERTYCodr <93845040+TheQWERTYCodr@users.noreply.github.com> Date: Fri, 1 Aug 2025 03:41:38 -0400 Subject: [PATCH 3/6] performance improvement on new regex performance improvement, no functional changes --- yt_dlp/extractor/youtube/_video.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index 75cfbb4903..860d546728 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -4183,9 +4183,9 @@ def process_language(container, base_url, lang_code, sub_name, client_name, quer (?:\n|^)(?P[^\n·]+)\ ·\ (?P[^\n]+)\n+ (?P[^\n]+)\n+ (?:℗\s*(?P\d{4})[^\n]+\n+)? - (?:Released\ on\s*:\s*(?P\d{4}-\d{2}-\d{2}))?.+? 
- (\nArtist\s*:\s*(?P[^\n]+)\n.+)? - Auto-generated\ by\ YouTube\.\s*$ + (?:Released\ on\s*:\s*(?P\d{4}-\d{2}-\d{2}))? + (.+?\nArtist\s*:\s*(?P[^\n]+)\n)? + .+Auto-generated\ by\ YouTube\.\s*$ ''', video_description) if mobj: release_year = mobj.group('release_year') From 2cc41d1f20213eb6e4a805f8cb401098e329e1b0 Mon Sep 17 00:00:00 2001 From: TheQWERTYCodr <93845040+TheQWERTYCodr@users.noreply.github.com> Date: Fri, 1 Aug 2025 04:09:11 -0400 Subject: [PATCH 4/6] create test for middle dot character MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit add half·alive's "Back Around" as a test to make sure middle dot works in artist names --- yt_dlp/extractor/youtube/_video.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index 860d546728..5ce65a60e0 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -1758,6 +1758,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'media_type': 'short', }, }, + { + # Youtube Music Auto-generated description with dot in artist name + 'url': 'https://music.youtube.com/watch?v=DbCvuSGfR3Y', + 'info_dict': { + 'id': 'DbCvuSGfR3Y', + 'title': 'Back Around', + 'artists': ['half·alive'], + 'track': 'Back Around', + 'album': 'Conditions Of A Punk', + 'release_date': '20221202', + 'release_year': 2021, + }, + 'params': { + 'skip_download': True, + }, + }, ] _WEBPAGE_TESTS = [ From fb26d5134e26803b9e3c0640ab0f328d549bb363 Mon Sep 17 00:00:00 2001 From: TheQWERTYCodr <93845040+TheQWERTYCodr@users.noreply.github.com> Date: Fri, 1 Aug 2025 07:40:35 -0400 Subject: [PATCH 5/6] update regex to follow coding guidelines make unreferenced group non-capturing --- yt_dlp/extractor/youtube/_video.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index 5ce65a60e0..e7a018b04c 100644 --- 
a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -4200,7 +4200,7 @@ def process_language(container, base_url, lang_code, sub_name, client_name, quer (?P[^\n]+)\n+ (?:℗\s*(?P\d{4})[^\n]+\n+)? (?:Released\ on\s*:\s*(?P\d{4}-\d{2}-\d{2}))? - (.+?\nArtist\s*:\s*(?P[^\n]+)\n)? + (?:.+?\nArtist\s*:\s*(?P[^\n]+)\n)? .+Auto-generated\ by\ YouTube\.\s*$ ''', video_description) if mobj: From 223b3b75efd6090677ccfd9456ffbd294e161f9a Mon Sep 17 00:00:00 2001 From: theqwertycoder Date: Sun, 3 Aug 2025 20:18:32 -0400 Subject: [PATCH 6/6] =?UTF-8?q?fixes=20to=20extractor=20regex,=20artist=20?= =?UTF-8?q?splitting,=20and=20half=C2=B7alive=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- yt_dlp/extractor/youtube/_video.py | 34 +++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index 69fb5043f8..2e75341505 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -1671,12 +1671,37 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'https://music.youtube.com/watch?v=DbCvuSGfR3Y', 'info_dict': { 'id': 'DbCvuSGfR3Y', + 'ext': 'mp4', 'title': 'Back Around', 'artists': ['half·alive'], 'track': 'Back Around', 'album': 'Conditions Of A Punk', 'release_date': '20221202', 'release_year': 2021, + 'alt_title': 'Back Around', + 'description': 'md5:bfc0e2b3cc903a608d8a85a13cb50f95', + 'media_type': 'video', + 'uploader': 'half•alive', + 'channel': 'half•alive', + 'channel_id': 'UCYQrYophdVI3nVDPOnXyIng', + 'channel_url': 'https://www.youtube.com/channel/UCYQrYophdVI3nVDPOnXyIng', + 'channel_is_verified': True, + 'channel_follower_count': int, + 'comment_count': int, + 'view_count': int, + 'like_count': int, + 'age_limit': 0, + 'duration': 223, + 'thumbnail': 'https://i.ytimg.com/vi_webp/DbCvuSGfR3Y/maxresdefault.webp', + 'heatmap': 'count:100', + 'categories': 
['Music'], + 'tags': ['half·alive', 'Conditions Of A Punk', 'Back Around'], + 'creators': ['half·alive'], + 'timestamp': 1669889281, + 'upload_date': '20221201', + 'playable_in_embed': True, + 'availability': 'public', + 'live_status': 'not_live', }, 'params': { 'skip_download': True, @@ -4207,15 +4232,14 @@ def process_language(container, base_url, lang_code, sub_name, client_name, quer # Youtube Music Auto-generated description if (video_description or '').strip().endswith('\nAuto-generated by YouTube.'): - # Before you change this, learn how regexes work. The last guy didn't. mobj = re.search( r'''(?xs) (?:\n|^)(?P[^\n·]+)\ ·\ (?P[^\n]+)\n+ (?P[^\n]+)\n+ - (?:℗\s*(?P\d{4})[^\n]+\n+)? - (?:Released\ on\s*:\s*(?P\d{4}-\d{2}-\d{2}))? + (?:℗\s*(?P\d{4}))? + (?:.+?\nReleased\ on\s*:\s*(?P\d{4}-\d{2}-\d{2}))? (?:.+?\nArtist\s*:\s*(?P[^\n]+)\n)? - .+Auto-generated\ by\ YouTube\.\s*$ + .+\nAuto-generated\ by\ YouTube\.\s*$ ''', video_description) if mobj: release_year = mobj.group('release_year') @@ -4227,7 +4251,7 @@ def process_language(container, base_url, lang_code, sub_name, client_name, quer info.update({ 'album': mobj.group('album'.strip()), 'artists': ([a] if (a := mobj.group('clean_artist')) - else [a.strip() for a in mobj.group('artist').split('·')]), + else [a.strip() for a in mobj.group('artist').split(' · ')]), 'track': mobj.group('track').strip(), 'release_date': release_date, 'release_year': int_or_none(release_year),