mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2025-07-03 03:48:31 +00:00
[utils] get_element_text_and_html_by_tag: Support void elements
This commit is contained in:
parent
b26bc32579
commit
b497cf7123
@ -167,6 +167,12 @@ def IDENTITY(x):
|
|||||||
|
|
||||||
NUMBER_RE = r'\d+(?:\.\d+)?'
|
NUMBER_RE = r'\d+(?:\.\d+)?'
|
||||||
|
|
||||||
|
VOID_ELEMENTS = [
|
||||||
|
'area', 'base', 'br', 'col', 'embed',
|
||||||
|
'hr', 'img', 'input', 'link', 'meta',
|
||||||
|
'param', 'source', 'track', 'wbr',
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
@functools.cache
|
@functools.cache
|
||||||
def preferredencoding():
|
def preferredencoding():
|
||||||
@ -364,15 +370,13 @@ def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w
|
|||||||
if not value:
|
if not value:
|
||||||
return
|
return
|
||||||
|
|
||||||
quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
|
|
||||||
|
|
||||||
value = re.escape(value) if escape_value else value
|
value = re.escape(value) if escape_value else value
|
||||||
|
|
||||||
partial_element_re = rf'''(?x)
|
partial_element_re = rf'''(?x)
|
||||||
<(?P<tag>{tag})
|
<(?P<tag>{tag})
|
||||||
(?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
|
(?:\s[^>"']*|"[^"]*"|'[^']*')*?
|
||||||
\s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
|
\s{re.escape(attribute)}\s*=\s*(?P<q>['"])?(?-x:{value})(?(q)(?P=q)|(?=[\s/>]))
|
||||||
'''
|
'''
|
||||||
|
|
||||||
for m in re.finditer(partial_element_re, html):
|
for m in re.finditer(partial_element_re, html):
|
||||||
content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
|
content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
|
||||||
@ -436,12 +440,17 @@ def find_or_raise(haystack, needle, exc):
|
|||||||
return haystack.index(needle)
|
return haystack.index(needle)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
raise exc
|
raise exc
|
||||||
closing_tag = f'</{tag}>'
|
|
||||||
whole_start = find_or_raise(
|
whole_start = find_or_raise(
|
||||||
html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
|
html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
|
||||||
content_start = find_or_raise(
|
content_start = find_or_raise(
|
||||||
html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
|
html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
|
||||||
content_start += whole_start + 1
|
content_start += whole_start + 1
|
||||||
|
|
||||||
|
if tag in VOID_ELEMENTS:
|
||||||
|
return '', html[whole_start:content_start]
|
||||||
|
|
||||||
|
closing_tag = f'</{tag}>'
|
||||||
with HTMLBreakOnClosingTagParser() as parser:
|
with HTMLBreakOnClosingTagParser() as parser:
|
||||||
parser.feed(html[whole_start:content_start])
|
parser.feed(html[whole_start:content_start])
|
||||||
if not parser.tagstack or parser.tagstack[0] != tag:
|
if not parser.tagstack or parser.tagstack[0] != tag:
|
||||||
|
Loading…
Reference in New Issue
Block a user