diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index 171aa9b5c..ada2f495a 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -4177,20 +4177,15 @@ def process_language(container, base_url, lang_code, sub_name, client_name, quer # Youtube Music Auto-generated description if (video_description or '').strip().endswith('\nAuto-generated by YouTube.'): - # XXX: Causes catastrophic backtracking if description has "·" - # E.g. https://www.youtube.com/watch?v=DoPaAxMQoiI - # Simulating atomic groups: (?P<a>[^xy]+)x => (?=(?P<a>[^xy]+))(?P=a)x - # reduces it, but does not fully fix it. https://regex101.com/r/8Ssf2h/2 + # Before you change this, learn how regexes work. The last guy didn't. mobj = re.search( r'''(?xs) - (?=(?P<track>[^\n·]+))(?P=track)· - (?=(?P<artist>[^\n]+))(?P=artist)\n+ - (?=(?P<album>[^\n]+))(?P=album)\n - (?:.+?℗\s*(?P<release_year>\d{4})(?!\d))? - (?:.+?Released\ on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))? - (.+?\nArtist\s*:\s* - (?=(?P<clean_artist>[^\n]+))(?P=clean_artist)\n - )?.+\nAuto-generated\ by\ YouTube\.\s*$ + (?:\n|^)(?P<track>[^\n·]+)\ ·\ (?P<artist>[^\n]+)\n+ + (?P<album>[^\n]+)\n+ + (?:℗\s*(?P<release_year>\d{4})[^\n]+\n+)? + (?:Released\ on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?.+? + (\nArtist\s*:\s*(?P<clean_artist>[^\n]+)\n)? + .+Auto-generated\ by\ YouTube\.\s*$ ''', video_description) if mobj: release_year = mobj.group('release_year')