diff --git a/test/test_utils.py b/test/test_utils.py
index aedb565ec..87aba4135 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -1786,6 +1786,9 @@ def test_get_element_html_by_class(self):
GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING = '''
@@ -1880,6 +1905,10 @@ def test_get_element_text_and_html_by_tag(self):
(self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT, self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML))
self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html)
+ html = self.VOID_ELEMENT_TEST_STRING
+
+ self.assertEqual(get_element_text_and_html_by_tag('img', html), ('', '

'))
+
def test_iri_to_uri(self):
self.assertEqual(
iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'),
diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py
index 20aa341ca..0419dad23 100644
--- a/yt_dlp/utils/_utils.py
+++ b/yt_dlp/utils/_utils.py
@@ -167,6 +167,12 @@ def IDENTITY(x):
NUMBER_RE = r'\d+(?:\.\d+)?'
+VOID_ELEMENTS = [  # tags handled as void elements: empty content, only the opening tag is returned
+ 'area', 'base', 'br', 'col', 'embed',
+ 'hr', 'img', 'input', 'link', 'meta',
+ 'param', 'source', 'track', 'wbr',
+]
+
@functools.cache
def preferredencoding():
@@ -364,15 +370,13 @@ def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w
if not value:
return
- quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
-
value = re.escape(value) if escape_value else value
partial_element_re = rf'''(?x)
<(?P
{tag})
- (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
- \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
- '''
+ (?:\s[^>"']*|"[^"]*"|'[^']*')*?
+ \s{re.escape(attribute)}\s*=\s*(?P['"])?(?-x:{value})(?(q)(?P=q)|(?=[\s/>]))
+ '''
for m in re.finditer(partial_element_re, html):
content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
@@ -436,12 +440,17 @@ def find_or_raise(haystack, needle, exc):
return haystack.index(needle)
except ValueError:
raise exc
- closing_tag = f'{tag}>'
+
whole_start = find_or_raise(
html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
content_start = find_or_raise(
html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
content_start += whole_start + 1
+
+ if tag in VOID_ELEMENTS:
+ return '', html[whole_start:content_start]
+
+ closing_tag = f'{tag}>'
with HTMLBreakOnClosingTagParser() as parser:
parser.feed(html[whole_start:content_start])
if not parser.tagstack or parser.tagstack[0] != tag: