From d3d29be050e6eca0a549a1d73c4344d947109efa Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Sat, 12 Jul 2025 23:05:28 +0200 Subject: [PATCH] Smaller rearrangements --- changed.diff | 53 ++++++++++++++++++++++++++++++++++++++ test/test_InfoExtractor.py | 2 -- yt_dlp/extractor/common.py | 30 +++++++-------------- 3 files changed, 62 insertions(+), 23 deletions(-) create mode 100644 changed.diff diff --git a/changed.diff b/changed.diff new file mode 100644 index 000000000..0919017db --- /dev/null +++ b/changed.diff @@ -0,0 +1,53 @@ +diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py +index b2af2a67b..e2b94c89a 100644 +--- a/yt_dlp/extractor/common.py ++++ b/yt_dlp/extractor/common.py +@@ -1807,9 +1807,8 @@ def flatten(flight_data): + flatten(f) + + flight_text = '' +- # The flight segments regex pattern can afford to be (and should be) strict +- # Ref: https://github.com/vercel/next.js/commit/5a4a08fdce91a038f2ed3a70568d3ed040403150 +- # /packages/next/src/server/app-render/use-flight-response.tsx ++ # The non JSON part is written as string in the next.js source and should be matched strictly ++ # Ref: https://github.com/vercel/next.js/blob/5a4a08fdce91a038f2ed3a70568d3ed040403150/packages/next/src/server/app-render/use-flight-response.tsx#L189 + for flight_segment in re.findall(r']*>self\.__next_f\.push\((\[.+?\])\)', webpage): + segment = self._parse_json(flight_segment, video_id, fatal=fatal, errnote=None if fatal else False) + # Some earlier versions of next.js "optimized" away this array structure; this is unsupported +@@ -1818,27 +1817,26 @@ def flatten(flight_data): + self.write_debug( + f'{video_id}: Unsupported next.js flight data structure detected', only_once=True) + continue ++ # Ignore useless payload types (1: data, 2: base64) ++ # Ref: https://github.com/vercel/next.js/blob/5a4a08fdce91a038f2ed3a70568d3ed040403150/packages/next/src/server/app-render/use-flight-response.tsx#L11-#L14 + payload_type, chunk = segment +- if payload_type == 3: ++ if payload_type == 1: ++ flight_text += chunk ++ elif payload_type == 3: + try: +- chunk = base64.b64decode(chunk).decode() ++ flight_text += base64.b64decode(chunk).decode() + except (ValueError, binascii.Error): + msg = 'Unable to parse next.js data: unable to decode flight data' + if not fatal: + self.report_warning(msg, video_id=video_id, only_once=True) + continue + raise ExtractorError(msg) +- elif payload_type != 1: +- # Ignore useless payload types (0: bootstrap, 2: form state) +- continue +- flight_text += chunk + + for f in flight_text.splitlines(): + prefix, _, body = f.partition(':') +- if not (body.startswith('[') and body.endswith(']') and re.fullmatch(r'[0-9a-f]{1,3}', prefix)): +- continue +- # The body isn't necessarily valid JSON; this should always be non-fatal +- flatten(self._parse_json(body, video_id, fatal=False, errnote=False)) ++ if body.startswith('[') and body.endswith(']') and re.fullmatch(r'[0-9a-f]{1,3}', prefix): ++ # The body isn't necessarily valid JSON; this should always be non-fatal ++ flatten(self._parse_json(body, video_id, fatal=False, errnote=False)) + + return nextjs_data + diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 06d08927b..e0b1df912 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -1980,8 +1980,6 @@ def test_search_nextjs_v13_data(self): 'duplicated_field_name': {'x': 1}, }, { 'duplicated_field_name': {'y': 2}, - }, { - 'decoded': 'success', }] self.assertEqual(self.ie._search_nextjs_v13_data(HTML, None), EXPECTED) self.assertEqual(self.ie._search_nextjs_v13_data('', None, fatal=False), []) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index b2af2a67b..179ac503d 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1,5 +1,4 @@ import base64 -import binascii import collections import contextlib import functools @@ -1807,9 +1806,8 @@ def flatten(flight_data): flatten(f) flight_text = '' - # The flight segments regex pattern can afford to be (and should be) strict - # Ref: https://github.com/vercel/next.js/commit/5a4a08fdce91a038f2ed3a70568d3ed040403150 - # /packages/next/src/server/app-render/use-flight-response.tsx + # The script part is written as a string in the next.js source and should be matched strictly + # Ref: https://github.com/vercel/next.js/blob/5a4a08fdce91a038f2ed3a70568d3ed040403150/packages/next/src/server/app-render/use-flight-response.tsx#L189 for flight_segment in re.findall(r']*>self\.__next_f\.push\((\[.+?\])\)', webpage): segment = self._parse_json(flight_segment, video_id, fatal=fatal, errnote=None if fatal else False) # Some earlier versions of next.js "optimized" away this array structure; this is unsupported @@ -1818,27 +1816,17 @@ def flatten(flight_data): self.write_debug( f'{video_id}: Unsupported next.js flight data structure detected', only_once=True) continue + # Use only relevant payload type (1 == data) + # Ref: https://github.com/vercel/next.js/blob/5a4a08fdce91a038f2ed3a70568d3ed040403150/packages/next/src/server/app-render/use-flight-response.tsx#L11-#L14 payload_type, chunk = segment - if payload_type == 3: - try: - chunk = base64.b64decode(chunk).decode() - except (ValueError, binascii.Error): - msg = 'Unable to parse next.js data: unable to decode flight data' - if not fatal: - self.report_warning(msg, video_id=video_id, only_once=True) - continue - raise ExtractorError(msg) - elif payload_type != 1: - # Ignore useless payload types (0: bootstrap, 2: form state) - continue - flight_text += chunk + if payload_type == 1: + flight_text += chunk for f in flight_text.splitlines(): prefix, _, body = f.partition(':') - if not (body.startswith('[') and body.endswith(']') and re.fullmatch(r'[0-9a-f]{1,3}', prefix)): - continue - # The body isn't necessarily valid JSON; this should always be non-fatal - flatten(self._parse_json(body, video_id, fatal=False, errnote=False)) + if body.startswith('[') and body.endswith(']') and re.fullmatch(r'[0-9a-f]{1,3}', prefix): + # The body isn't necessarily valid JSON; this should always be non-fatal + flatten(self._parse_json(body, video_id, fatal=False, errnote=False)) return nextjs_data