From 968d8e361323181384bb58313933bde463b31133 Mon Sep 17 00:00:00 2001 From: Alan Xiao Date: Wed, 23 Apr 2025 19:35:51 -0400 Subject: [PATCH 1/9] Update _utils.py Add sanitization for Windows legacy device names --- yt_dlp/utils/_utils.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 99d7250876..13e8d149e9 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -167,6 +167,18 @@ def IDENTITY(x): NUMBER_RE = r'\d+(?:\.\d+)?' +WINDOWS_RESERVED_NAMES_RE = fr'({'|'.join( + ("CON", "PRN", "AUX", "CLOCK$", "NUL") + + tuple(f"{name:s}{num:d}" for name, num in itertools.product(("COM", "LPT"), range(0, 10))) + + tuple( + f"{name:s}{ssd:s}" + for name, ssd in itertools.product( + ("COM", "LPT"), + ("\N{SUPERSCRIPT ONE}", "\N{SUPERSCRIPT TWO}", "\N{SUPERSCRIPT THREE}"), + ) + ) +)})' + @functools.cache def preferredencoding(): @@ -679,6 +691,19 @@ def replace_insane(char): return result +def _sanitize_windows_reserved_names(s): + # Append _res to invalid path names + # in order to maintain easy recognizability + # when a user accidentally writes to device files + # - CON.opus => CON_res.opus + def suffix_sanitize(match): + other = match.group(3) if match.group(3) else '' + if not match.group(2) and other: + return match.group(1) + other + return match.group(1) + '_res' + match.group(2) + other # suffix the reserved portion only + return re.sub(fr'{WINDOWS_RESERVED_NAMES_RE}(\.*)(.*$)', suffix_sanitize, s) + + def _sanitize_path_parts(parts): sanitized_parts = [] for part in parts: @@ -694,6 +719,7 @@ def _sanitize_path_parts(parts): # - trailing dots and spaces (`asdf...` => `asdf..#`) # - invalid chars (`<>` => `##`) sanitized_part = re.sub(r'[/<>:"\|\\?\*]|[\s.]$', '#', part) + sanitized_part = _sanitize_windows_reserved_names(sanitized_part) sanitized_parts.append(sanitized_part) return sanitized_parts @@ -713,6 +739,11 @@ def sanitize_path(s, force=False): if 
normed.startswith('\\\\'): # UNC path (`\\SERVER\SHARE`) or device path (`\\.`, `\\?`) parts = normed.split('\\') + # allow user to write to explicitly declared legacy devices + if len(parts) == 4 and re.fullmatch(WINDOWS_RESERVED_NAMES_RE, parts[3]): + return '\\'.join(parts[:4]) + # sanitize legacy name device otherwise + parts[3] = _sanitize_windows_reserved_names(parts[3]) root = '\\'.join(parts[:4]) + '\\' parts = parts[4:] elif normed[1:2] == ':': From 49311c2db6ec7df0951bfcd9eabbef9b9866329e Mon Sep 17 00:00:00 2001 From: Alan Xiao Date: Wed, 23 Apr 2025 19:36:28 -0400 Subject: [PATCH 2/9] Update test_utils.py Add test cases for legacy device sanitization. --- test/test_utils.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index aedb565ec1..2efac25d8f 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -249,6 +249,14 @@ def test_sanitize_path(self): self.assertEqual(sanitize_path('abc.../def...'), 'abc..#\\def..#') self.assertEqual(sanitize_path('C:\\abc:%(title)s.%(ext)s'), 'C:\\abc#%(title)s.%(ext)s') + self.assertEqual(sanitize_path('CON.opus'), 'CON_res.opus') + self.assertEqual(sanitize_path('abc\\CON\\def'), 'abc\\CON_res\\def') + self.assertEqual(sanitize_path('CON\\abc'), 'CON_res\\abc') + self.assertEqual(sanitize_path('CON.'), 'CON#') + self.assertEqual(sanitize_path('CON..'), 'CON_res.#') + self.assertEqual(sanitize_path('\\\\.\\CON'), '\\\\.\\CON') + self.assertEqual(sanitize_path('\\\\.\\CON\\abc'), '\\\\.\\CON_res\\abc') + # Check with nt._path_normpath if available try: from nt import _path_normpath as nt_path_normpath From ffe93ff484027c7683b4471b15116bf324a11212 Mon Sep 17 00:00:00 2001 From: Alan Xiao Date: Wed, 23 Apr 2025 19:46:06 -0400 Subject: [PATCH 3/9] Update _utils.py Add more reserved names. 
--- yt_dlp/utils/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 13e8d149e9..1de901b1b7 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -168,7 +168,7 @@ def IDENTITY(x): NUMBER_RE = r'\d+(?:\.\d+)?' WINDOWS_RESERVED_NAMES_RE = fr'({'|'.join( - ("CON", "PRN", "AUX", "CLOCK$", "NUL") + ("CON", "CONOUT$", "CONIN$", "PRN", "AUX", "CLOCK$", "NUL") + tuple(f"{name:s}{num:d}" for name, num in itertools.product(("COM", "LPT"), range(0, 10))) + tuple( f"{name:s}{ssd:s}" From 126292524e03431dd4ceda0f53f4ea7af7ac0738 Mon Sep 17 00:00:00 2001 From: Alan Xiao Date: Wed, 23 Apr 2025 20:02:15 -0400 Subject: [PATCH 4/9] Update _utils.py Resolve string termination issue. --- yt_dlp/utils/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 1de901b1b7..d9d0e9040c 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -167,7 +167,7 @@ def IDENTITY(x): NUMBER_RE = r'\d+(?:\.\d+)?' -WINDOWS_RESERVED_NAMES_RE = fr'({'|'.join( +WINDOWS_RESERVED_NAMES_RE = fr'({"|".join( ("CON", "CONOUT$", "CONIN$", "PRN", "AUX", "CLOCK$", "NUL") + tuple(f"{name:s}{num:d}" for name, num in itertools.product(("COM", "LPT"), range(0, 10))) + tuple( From 6880a506715b7d7d1cf45bb1f8ecc55d1a700b32 Mon Sep 17 00:00:00 2001 From: Alan Xiao Date: Wed, 23 Apr 2025 20:06:09 -0400 Subject: [PATCH 5/9] Update _utils.py Resolve unterminated string literal --- yt_dlp/utils/_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index d9d0e9040c..1d8c7e1cff 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -167,7 +167,7 @@ def IDENTITY(x): NUMBER_RE = r'\d+(?:\.\d+)?' 
-WINDOWS_RESERVED_NAMES_RE = fr'({"|".join( +WINDOWS_RESERVED_NAMES = ( ("CON", "CONOUT$", "CONIN$", "PRN", "AUX", "CLOCK$", "NUL") + tuple(f"{name:s}{num:d}" for name, num in itertools.product(("COM", "LPT"), range(0, 10))) + tuple( @@ -177,7 +177,8 @@ def IDENTITY(x): ("\N{SUPERSCRIPT ONE}", "\N{SUPERSCRIPT TWO}", "\N{SUPERSCRIPT THREE}"), ) ) -)})' +) +WINDOWS_RESERVED_NAMES_RE = fr'({"|".join(WINDOWS_RESERVED_NAMES)})' @functools.cache From 2a9b19d8db5b226447ccaee8c6a5621258d0abb5 Mon Sep 17 00:00:00 2001 From: Alan Xiao Date: Wed, 23 Apr 2025 20:23:13 -0400 Subject: [PATCH 6/9] Update _utils.py Resolve additional ruff errors. --- yt_dlp/utils/_utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 1d8c7e1cff..579edccf82 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -168,13 +168,13 @@ def IDENTITY(x): NUMBER_RE = r'\d+(?:\.\d+)?' WINDOWS_RESERVED_NAMES = ( - ("CON", "CONOUT$", "CONIN$", "PRN", "AUX", "CLOCK$", "NUL") - + tuple(f"{name:s}{num:d}" for name, num in itertools.product(("COM", "LPT"), range(0, 10))) - + tuple( - f"{name:s}{ssd:s}" + 'CON', 'CONOUT$', 'CONIN$', 'PRN', 'AUX', 'CLOCK$', 'NUL', + *tuple(f'{name:s}{num:d}' for name, num in itertools.product(('COM', 'LPT'), range(10))), + *tuple( + f'{name:s}{ssd:s}' for name, ssd in itertools.product( - ("COM", "LPT"), - ("\N{SUPERSCRIPT ONE}", "\N{SUPERSCRIPT TWO}", "\N{SUPERSCRIPT THREE}"), + ('COM', 'LPT'), + ('\N{SUPERSCRIPT ONE}', '\N{SUPERSCRIPT TWO}', '\N{SUPERSCRIPT THREE}'), ) ) ) From 6b48a0e4b4adacc8466dd2f664dff5503af1f37b Mon Sep 17 00:00:00 2001 From: Alan Xiao Date: Wed, 23 Apr 2025 20:24:27 -0400 Subject: [PATCH 7/9] Update _utils.py Add trailing comma. 
--- yt_dlp/utils/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 579edccf82..e7f0b318b4 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -176,7 +176,7 @@ def IDENTITY(x): ('COM', 'LPT'), ('\N{SUPERSCRIPT ONE}', '\N{SUPERSCRIPT TWO}', '\N{SUPERSCRIPT THREE}'), ) - ) + ), ) WINDOWS_RESERVED_NAMES_RE = fr'({"|".join(WINDOWS_RESERVED_NAMES)})' From 8399e09e7fbd65cbfa3abed917fcab91ceda4b12 Mon Sep 17 00:00:00 2001 From: Alan Xiao Date: Wed, 23 Apr 2025 20:31:39 -0400 Subject: [PATCH 8/9] Update _utils.py More fixes. Apologies for the excessive CI pipeline usage. --- yt_dlp/utils/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index e7f0b318b4..4a598bfb45 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -701,7 +701,7 @@ def suffix_sanitize(match): other = match.group(3) if match.group(3) else '' if not match.group(2) and other: return match.group(1) + other - return match.group(1) + '_res' + match.group(2) + other # suffix the reserved portion only + return match.group(1) + '_res' + match.group(2) + other # suffix the reserved portion only return re.sub(fr'{WINDOWS_RESERVED_NAMES_RE}(\.*)(.*$)', suffix_sanitize, s) From 1d0f19614b695aba36ad0fe0f8d3d87f68733b85 Mon Sep 17 00:00:00 2001 From: Alan Xiao Date: Fri, 25 Apr 2025 10:03:02 -0400 Subject: [PATCH 9/9] Update _utils.py Remove CLOCK$ as it is a valid filename in Windows 10. It was reserved only in Windows NT and older systems, as referenced here: https://answers.microsoft.com/en-us/windows/forum/all/folder-names/2f9bb53a-da99-45e6-90f7-50d1399842aa. Testing on a Windows 10 VM also shows that it is possible to create files and directories with that name. 
--- yt_dlp/utils/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 4a598bfb45..99a67ba61f 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -168,7 +168,7 @@ def IDENTITY(x): NUMBER_RE = r'\d+(?:\.\d+)?' WINDOWS_RESERVED_NAMES = ( - 'CON', 'CONOUT$', 'CONIN$', 'PRN', 'AUX', 'CLOCK$', 'NUL', + 'CON', 'CONOUT$', 'CONIN$', 'PRN', 'AUX', 'NUL', *tuple(f'{name:s}{num:d}' for name, num in itertools.product(('COM', 'LPT'), range(10))), *tuple( f'{name:s}{ssd:s}'