mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2025-06-27 17:08:32 +00:00
Merge 8057c858ba
into 73bf102116
This commit is contained in:
commit
de2b995e69
@ -1786,6 +1786,9 @@ def test_get_element_html_by_class(self):
|
||||
GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING = '''
|
||||
<div itemprop="author" itemscope>foo</div>
|
||||
'''
|
||||
VOID_ELEMENT_TEST_STRING = '''
|
||||
<img alt="foo" src="bar.png"><img alt="foobar" src="baz.jpg"><img alt="foo"/>
|
||||
'''
|
||||
|
||||
def test_get_element_by_attribute(self):
|
||||
html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
|
||||
@ -1798,6 +1801,10 @@ def test_get_element_by_attribute(self):
|
||||
|
||||
self.assertEqual(get_element_by_attribute('itemprop', 'author', html), 'foo')
|
||||
|
||||
html = self.VOID_ELEMENT_TEST_STRING
|
||||
|
||||
self.assertEqual(get_element_by_attribute('alt', 'foo', html), '')
|
||||
|
||||
def test_get_element_html_by_attribute(self):
|
||||
html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
|
||||
|
||||
@ -1809,6 +1816,10 @@ def test_get_element_html_by_attribute(self):
|
||||
|
||||
self.assertEqual(get_element_html_by_attribute('itemprop', 'author', html), html.strip())
|
||||
|
||||
html = self.VOID_ELEMENT_TEST_STRING
|
||||
|
||||
self.assertEqual(get_element_html_by_attribute('alt', 'foo', html), '<img alt="foo" src="bar.png">')
|
||||
|
||||
GET_ELEMENTS_BY_CLASS_TEST_STRING = '''
|
||||
<span class="foo bar">nice</span><span class="foo bar">also nice</span>
|
||||
'''
|
||||
@ -1833,6 +1844,10 @@ def test_get_elements_by_attribute(self):
|
||||
self.assertEqual(get_elements_by_attribute('class', 'foo', html), [])
|
||||
self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), [])
|
||||
|
||||
html = self.VOID_ELEMENT_TEST_STRING
|
||||
|
||||
self.assertEqual(get_elements_by_attribute('alt', 'foo', html), ['', ''])
|
||||
|
||||
def test_get_elements_html_by_attribute(self):
|
||||
html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
|
||||
|
||||
@ -1840,6 +1855,11 @@ def test_get_elements_html_by_attribute(self):
|
||||
self.assertEqual(get_elements_html_by_attribute('class', 'foo', html), [])
|
||||
self.assertEqual(get_elements_html_by_attribute('class', 'no-such-foo', html), [])
|
||||
|
||||
html = self.VOID_ELEMENT_TEST_STRING
|
||||
|
||||
self.assertEqual(get_elements_html_by_attribute(
|
||||
'alt', 'foo', html), ['<img alt="foo" src="bar.png">', '<img alt="foo"/>'])
|
||||
|
||||
def test_get_elements_text_and_html_by_attribute(self):
|
||||
html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
|
||||
|
||||
@ -1852,6 +1872,11 @@ def test_get_elements_text_and_html_by_attribute(self):
|
||||
self.assertEqual(list(get_elements_text_and_html_by_attribute(
|
||||
'class', 'foo', '<a class="foo">nice</a><span class="foo">nice</span>', tag='a')), [('nice', '<a class="foo">nice</a>')])
|
||||
|
||||
html = self.VOID_ELEMENT_TEST_STRING
|
||||
|
||||
self.assertEqual(list(get_elements_text_and_html_by_attribute(
|
||||
'alt', 'foo', html, tag='img')), [('', '<img alt="foo" src="bar.png">'), ('', '<img alt="foo"/>')])
|
||||
|
||||
GET_ELEMENT_BY_TAG_TEST_STRING = '''
|
||||
random text lorem ipsum</p>
|
||||
<div>
|
||||
@ -1880,6 +1905,10 @@ def test_get_element_text_and_html_by_tag(self):
|
||||
(self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT, self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML))
|
||||
self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html)
|
||||
|
||||
html = self.VOID_ELEMENT_TEST_STRING
|
||||
|
||||
self.assertEqual(get_element_text_and_html_by_tag('img', html), ('', '<img alt="foo" src="bar.png">'))
|
||||
|
||||
def test_iri_to_uri(self):
|
||||
self.assertEqual(
|
||||
iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'),
|
||||
|
@ -167,6 +167,12 @@ def IDENTITY(x):
|
||||
|
||||
NUMBER_RE = r'\d+(?:\.\d+)?'
|
||||
|
||||
# HTML void elements: tags that have no content and no closing tag.
# get_element_text_and_html_by_tag() checks membership here so that it can
# return an empty text and just the opening tag instead of searching for a
# closing tag that will never appear.
VOID_ELEMENTS = [
    'area', 'base', 'br', 'col', 'embed',
    'hr', 'img', 'input', 'link', 'meta',
    'param', 'source', 'track', 'wbr',
]
|
||||
|
||||
|
||||
@functools.cache
|
||||
def preferredencoding():
|
||||
@ -364,15 +370,13 @@ def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w
|
||||
if not value:
|
||||
return
|
||||
|
||||
quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
|
||||
|
||||
value = re.escape(value) if escape_value else value
|
||||
|
||||
partial_element_re = rf'''(?x)
|
||||
<(?P<tag>{tag})
|
||||
(?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
|
||||
\s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
|
||||
'''
|
||||
(?:\s[^>"']*|"[^"]*"|'[^']*')*?
|
||||
\s{re.escape(attribute)}\s*=\s*(?P<q>['"])?(?-x:{value})(?(q)(?P=q)|(?=[\s/>]))
|
||||
'''
|
||||
|
||||
for m in re.finditer(partial_element_re, html):
|
||||
content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
|
||||
@ -436,12 +440,17 @@ def find_or_raise(haystack, needle, exc):
|
||||
return haystack.index(needle)
|
||||
except ValueError:
|
||||
raise exc
|
||||
closing_tag = f'</{tag}>'
|
||||
|
||||
whole_start = find_or_raise(
|
||||
html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
|
||||
content_start = find_or_raise(
|
||||
html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
|
||||
content_start += whole_start + 1
|
||||
|
||||
if tag in VOID_ELEMENTS:
|
||||
return '', html[whole_start:content_start]
|
||||
|
||||
closing_tag = f'</{tag}>'
|
||||
with HTMLBreakOnClosingTagParser() as parser:
|
||||
parser.feed(html[whole_start:content_start])
|
||||
if not parser.tagstack or parser.tagstack[0] != tag:
|
||||
|
Loading…
Reference in New Issue
Block a user