From b497cf7123a25dd638d1b31cbdaf4d17f39d630f Mon Sep 17 00:00:00 2001
From: doe1080 <98906116+doe1080@users.noreply.github.com>
Date: Fri, 9 May 2025 11:56:05 +0900
Subject: [PATCH 1/2] [utils] get_element_text_and_html_by_tag: Support void elements
---
yt_dlp/utils/_utils.py | 21 +++++++++++++++------
1 file changed, 15 insertions(+), 6 deletions(-)
diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py
index 20aa341ca3..0419dad23b 100644
--- a/yt_dlp/utils/_utils.py
+++ b/yt_dlp/utils/_utils.py
@@ -167,6 +167,12 @@ def IDENTITY(x):
NUMBER_RE = r'\d+(?:\.\d+)?'
+VOID_ELEMENTS = [  # HTML void elements: matched as the opening tag only, with empty text
+ 'area', 'base', 'br', 'col', 'embed',
+ 'hr', 'img', 'input', 'link', 'meta',
+ 'param', 'source', 'track', 'wbr',
+]
+
@functools.cache
def preferredencoding():
@@ -364,15 +370,13 @@ def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w
if not value:
return
- quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
-
value = re.escape(value) if escape_value else value
partial_element_re = rf'''(?x)
<(?P['"])?(?-x:{value})(?(q)(?P=q)|(?=[\s/>]))
+ '''
for m in re.finditer(partial_element_re, html):
content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
@@ -436,12 +440,17 @@ def find_or_raise(haystack, needle, exc):
return haystack.index(needle)
except ValueError:
raise exc
- closing_tag = f'{tag}>'
+
whole_start = find_or_raise(
html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
content_start = find_or_raise(
html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
content_start += whole_start + 1
+
+ if tag in VOID_ELEMENTS:
+ return '', html[whole_start:content_start]
+
+ closing_tag = f'{tag}>'
with HTMLBreakOnClosingTagParser() as parser:
parser.feed(html[whole_start:content_start])
if not parser.tagstack or parser.tagstack[0] != tag:
From 8057c858ba971925e8d54a823480d22834c6368c Mon Sep 17 00:00:00 2001
From: doe1080 <98906116+doe1080@users.noreply.github.com>
Date: Sat, 10 May 2025 12:43:44 +0900
Subject: [PATCH 2/2] [test] utils: Add void element tests for get_element(s)_by_attribute helpers
---
test/test_utils.py | 29 +++++++++++++++++++++++++++++
1 file changed, 29 insertions(+)
diff --git a/test/test_utils.py b/test/test_utils.py
index aedb565ec1..87aba41350 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -1786,6 +1786,9 @@ def test_get_element_html_by_class(self):
GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING = '''
+ '''
def test_get_element_by_attribute(self):
html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
@@ -1798,6 +1801,10 @@ def test_get_element_by_attribute(self):
self.assertEqual(get_element_by_attribute('itemprop', 'author', html), 'foo')
+ html = self.VOID_ELEMENT_TEST_STRING
+
+ self.assertEqual(get_element_by_attribute('alt', 'foo', html), '')
+
def test_get_element_html_by_attribute(self):
html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
@@ -1809,6 +1816,10 @@ def test_get_element_html_by_attribute(self):
self.assertEqual(get_element_html_by_attribute('itemprop', 'author', html), html.strip())
+ html = self.VOID_ELEMENT_TEST_STRING
+
+ self.assertEqual(get_element_html_by_attribute('alt', 'foo', html), '
')
+
GET_ELEMENTS_BY_CLASS_TEST_STRING = '''
'''
@@ -1833,6 +1844,10 @@ def test_get_elements_by_attribute(self):
self.assertEqual(get_elements_by_attribute('class', 'foo', html), [])
self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), [])
+ html = self.VOID_ELEMENT_TEST_STRING
+
+ self.assertEqual(get_elements_by_attribute('alt', 'foo', html), ['', ''])
+
def test_get_elements_html_by_attribute(self):
html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
@@ -1840,6 +1855,11 @@ def test_get_elements_html_by_attribute(self):
self.assertEqual(get_elements_html_by_attribute('class', 'foo', html), [])
self.assertEqual(get_elements_html_by_attribute('class', 'no-such-foo', html), [])
+ html = self.VOID_ELEMENT_TEST_STRING
+
+ self.assertEqual(get_elements_html_by_attribute(
+ 'alt', 'foo', html), ['
', '
'])
+
def test_get_elements_text_and_html_by_attribute(self):
html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
@@ -1852,6 +1872,11 @@ def test_get_elements_text_and_html_by_attribute(self):
self.assertEqual(list(get_elements_text_and_html_by_attribute(
'class', 'foo', 'nicenice', tag='a')), [('nice', 'nice')])
+ html = self.VOID_ELEMENT_TEST_STRING
+
+ self.assertEqual(list(get_elements_text_and_html_by_attribute(
+ 'alt', 'foo', html, tag='img')), [('', '
'), ('', '
')])
+
GET_ELEMENT_BY_TAG_TEST_STRING = '''
random text lorem ipsum