mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-31 14:45:14 +00:00 
			
		
		
		
	[extractor/youtube] Improve description parsing performance (#7315)
* The parsing is skipped when not needed
* The regex is improved by simulating atomic groups with lookaheads

Authored by: pukkandan, berkanteber
This commit is contained in:
		| @@ -4346,15 +4346,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | ||||
|                         info[d_k] = parse_duration(query[k][0]) | ||||
| 
 | ||||
|         # Youtube Music Auto-generated description | ||||
|         if video_description: | ||||
|         if (video_description or '').strip().endswith('\nAuto-generated by YouTube.'): | ||||
|             # XXX: Causes catastrophic backtracking if description has "·" | ||||
|             # E.g. https://www.youtube.com/watch?v=DoPaAxMQoiI | ||||
|             # Simulating atomic groups:  (?P<a>[^xy]+)x  =>  (?=(?P<a>[^xy]+))(?P=a)x | ||||
|             # reduces it, but does not fully fix it. https://regex101.com/r/8Ssf2h/2 | ||||
|             mobj = re.search( | ||||
|                 r'''(?xs) | ||||
|                     (?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+ | ||||
|                     (?P<album>[^\n]+) | ||||
|                     (?=(?P<track>[^\n·]+))(?P=track)· | ||||
|                     (?=(?P<artist>[^\n]+))(?P=artist)\n+ | ||||
|                     (?=(?P<album>[^\n]+))(?P=album)\n | ||||
|                     (?:.+?℗\s*(?P<release_year>\d{4})(?!\d))? | ||||
|                     (?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))? | ||||
|                     (.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))? | ||||
|                     .+\nAuto-generated\ by\ YouTube\.\s*$ | ||||
|                     (.+?\nArtist\s*:\s* | ||||
|                         (?=(?P<clean_artist>[^\n]+))(?P=clean_artist)\n | ||||
|                     )?.+\nAuto-generated\ by\ YouTube\.\s*$ | ||||
|                 ''', video_description) | ||||
|             if mobj: | ||||
|                 release_year = mobj.group('release_year') | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Berkan Teber
					Berkan Teber