diff --git a/test/test_utils.py b/test/test_utils.py index aedb565ec..87aba4135 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1786,6 +1786,9 @@ def test_get_element_html_by_class(self): GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING = ''' ''' + VOID_ELEMENT_TEST_STRING = ''' + foofoobarfoo + ''' def test_get_element_by_attribute(self): html = self.GET_ELEMENT_BY_CLASS_TEST_STRING @@ -1798,6 +1801,10 @@ def test_get_element_by_attribute(self): self.assertEqual(get_element_by_attribute('itemprop', 'author', html), 'foo') + html = self.VOID_ELEMENT_TEST_STRING + + self.assertEqual(get_element_by_attribute('alt', 'foo', html), '') + def test_get_element_html_by_attribute(self): html = self.GET_ELEMENT_BY_CLASS_TEST_STRING @@ -1809,6 +1816,10 @@ def test_get_element_html_by_attribute(self): self.assertEqual(get_element_html_by_attribute('itemprop', 'author', html), html.strip()) + html = self.VOID_ELEMENT_TEST_STRING + + self.assertEqual(get_element_html_by_attribute('alt', 'foo', html), 'foo') + GET_ELEMENTS_BY_CLASS_TEST_STRING = ''' nicealso nice ''' @@ -1833,6 +1844,10 @@ def test_get_elements_by_attribute(self): self.assertEqual(get_elements_by_attribute('class', 'foo', html), []) self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), []) + html = self.VOID_ELEMENT_TEST_STRING + + self.assertEqual(get_elements_by_attribute('alt', 'foo', html), ['', '']) + def test_get_elements_html_by_attribute(self): html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING @@ -1840,6 +1855,11 @@ def test_get_elements_html_by_attribute(self): self.assertEqual(get_elements_html_by_attribute('class', 'foo', html), []) self.assertEqual(get_elements_html_by_attribute('class', 'no-such-foo', html), []) + html = self.VOID_ELEMENT_TEST_STRING + + self.assertEqual(get_elements_html_by_attribute( + 'alt', 'foo', html), ['foo', 'foo']) + def test_get_elements_text_and_html_by_attribute(self): html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING @@ -1852,6 +1872,11 @@ def test_get_elements_text_and_html_by_attribute(self): self.assertEqual(list(get_elements_text_and_html_by_attribute( 'class', 'foo', 'nicenice', tag='a')), [('nice', 'nice')]) + html = self.VOID_ELEMENT_TEST_STRING + + self.assertEqual(list(get_elements_text_and_html_by_attribute( + 'alt', 'foo', html, tag='img')), [('', 'foo'), ('', 'foo')]) + GET_ELEMENT_BY_TAG_TEST_STRING = ''' random text lorem ipsum

@@ -1880,6 +1905,10 @@ def test_get_element_text_and_html_by_tag(self): (self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT, self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML)) self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html) + html = self.VOID_ELEMENT_TEST_STRING + + self.assertEqual(get_element_text_and_html_by_tag('img', html), ('', 'foo')) + def test_iri_to_uri(self): self.assertEqual( iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'), diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 20aa341ca..0419dad23 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -167,6 +167,12 @@ def IDENTITY(x): NUMBER_RE = r'\d+(?:\.\d+)?' +VOID_ELEMENTS = [ + 'area', 'base', 'br', 'col', 'embed', + 'hr', 'img', 'input', 'link', 'meta', + 'param', 'source', 'track', 'wbr', +] + @functools.cache def preferredencoding(): @@ -364,15 +370,13 @@ def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w if not value: return - quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?' - value = re.escape(value) if escape_value else value partial_element_re = rf'''(?x) <(?P{tag}) - (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)? - \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q) - ''' + (?:\s[^>"']*|"[^"]*"|'[^']*')*? + \s{re.escape(attribute)}\s*=\s*(?P['"])?(?-x:{value})(?(q)(?P=q)|(?=[\s/>])) + ''' for m in re.finditer(partial_element_re, html): content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():]) @@ -436,12 +440,17 @@ def find_or_raise(haystack, needle, exc): return haystack.index(needle) except ValueError: raise exc - closing_tag = f'' + whole_start = find_or_raise( html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found')) content_start = find_or_raise( html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag')) content_start += whole_start + 1 + + if tag in VOID_ELEMENTS: + return '', html[whole_start:content_start] + + closing_tag = f'' with HTMLBreakOnClosingTagParser() as parser: parser.feed(html[whole_start:content_start]) if not parser.tagstack or parser.tagstack[0] != tag: