1
0
mirror of https://github.com/yt-dlp/yt-dlp.git synced 2025-07-03 03:48:31 +00:00

[utils] get_element_text_and_html_by_tag: Support void elements

This commit is contained in:
doe1080 2025-05-09 11:56:05 +09:00
parent b26bc32579
commit b497cf7123

View File

@@ -167,6 +167,12 @@ def IDENTITY(x):
NUMBER_RE = r'\d+(?:\.\d+)?' NUMBER_RE = r'\d+(?:\.\d+)?'
VOID_ELEMENTS = [
'area', 'base', 'br', 'col', 'embed',
'hr', 'img', 'input', 'link', 'meta',
'param', 'source', 'track', 'wbr',
]
@functools.cache @functools.cache
def preferredencoding(): def preferredencoding():
@@ -364,15 +370,13 @@ def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w
if not value: if not value:
return return
quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
value = re.escape(value) if escape_value else value value = re.escape(value) if escape_value else value
partial_element_re = rf'''(?x) partial_element_re = rf'''(?x)
<(?P<tag>{tag}) <(?P<tag>{tag})
(?:\s(?:[^>"']|"[^"]*"|'[^']*')*)? (?:\s[^>"']*|"[^"]*"|'[^']*')*?
\s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q) \s{re.escape(attribute)}\s*=\s*(?P<q>['"])?(?-x:{value})(?(q)(?P=q)|(?=[\s/>]))
''' '''
for m in re.finditer(partial_element_re, html): for m in re.finditer(partial_element_re, html):
content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():]) content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
@@ -436,12 +440,17 @@ def find_or_raise(haystack, needle, exc):
return haystack.index(needle) return haystack.index(needle)
except ValueError: except ValueError:
raise exc raise exc
closing_tag = f'</{tag}>'
whole_start = find_or_raise( whole_start = find_or_raise(
html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found')) html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
content_start = find_or_raise( content_start = find_or_raise(
html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag')) html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
content_start += whole_start + 1 content_start += whole_start + 1
if tag in VOID_ELEMENTS:
return '', html[whole_start:content_start]
closing_tag = f'</{tag}>'
with HTMLBreakOnClosingTagParser() as parser: with HTMLBreakOnClosingTagParser() as parser:
parser.feed(html[whole_start:content_start]) parser.feed(html[whole_start:content_start])
if not parser.tagstack or parser.tagstack[0] != tag: if not parser.tagstack or parser.tagstack[0] != tag: