', reset=True)
+ with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'img'"):
+ parser.taglist('
must be empty', reset=True)
+
+ def test_relaxed_html_parsing(self):
+ Tag = HTMLTagParser.Tag
+ parser = HTMLTagParser()
+
+ self.assertEqual(parser.taglist('', reset=True), [])
+ self.assertEqual(parser.taglist('
', reset=True), [])
+
+ tags = parser.taglist('
', reset=True)
+ self.assertEqual(tags, [Tag('div'), Tag('p')])
+
+ tags = parser.taglist('
', reset=True)
+ self.assertEqual(tags, [Tag('div')])
+
+ tags = parser.taglist('
', reset=True)
+ self.assertEqual(tags, [Tag('p'), Tag('div')])
+ self.assertEqual(tags[0].text_and_html(), ('paragraph', '
paragraph
must be empty', reset=True)
+ self.assertEqual(tags, [Tag('img')])
+ self.assertEqual(tags[0].text_and_html(), ('', '
![]()
'))
+
+ def test_compliant_html_parsing(self):
+ # certain elements don't need to be closed (see HTMLTagParser.VOID_TAGS)
+ Tag = HTMLTagParser.Tag
+ html = '''
+ no error without closing tag:
![]()
+ self closing is ok:
![]()
+ '''
+ parser = HTMLTagParser()
+ tags = parser.taglist(html, reset=True)
+ self.assertEqual(tags, [Tag('img'), Tag('img')])
+
+ # don't get fooled by '>' in attributes
+ html = '''
![]()
'''
+ tags = parser.taglist(html, reset=True)
+ self.assertEqual(tags[0].text_and_html(), ('', html))
diff --git a/test/test_utils.py b/test/test_utils.py
index d9a62258c..3045b6d7e 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -4,7 +4,6 @@
import os
import re
import sys
-import textwrap
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@@ -21,14 +20,6 @@
compat_HTMLParseError,
compat_os_name,
)
-from yt_dlp.parsing import (
- HTMLTagParser,
- FirstMatchingElementParser,
-)
-
-# some testcases don't work with current functions
-get_element_text_and_html_by_tag = FirstMatchingElementParser.get_element_text_and_html_by_tag
-
from yt_dlp.utils import (
Config,
DateRange,
@@ -68,6 +59,7 @@
get_element_by_class,
get_element_html_by_attribute,
get_element_html_by_class,
+ get_element_text_and_html_by_tag,
get_elements_by_attribute,
get_elements_by_class,
get_elements_html_by_attribute,
@@ -1776,110 +1768,34 @@ def test_get_elements_text_and_html_by_attribute(self):
self.assertEqual(list(get_elements_text_and_html_by_attribute(
'class', 'foo', '
nicenice', tag='a')), [('nice', '
nice')])
- def test_get_element_text_and_html_by_tag(self):
- get_element_by_tag_test_string = '''
- random text lorem ipsum
+ GET_ELEMENT_BY_TAG_TEST_STRING = '''
+ random text lorem ipsum
+
+ this should be returned
+
this should also be returned
- this should be returned
-
this should also be returned
-
- this should also be returned
-
- closing tag above should not trick, so this should also be returned
+ this should also be returned
- but this text should not be returned
- '''
- html = textwrap.indent(textwrap.dedent(get_element_by_tag_test_string), ' ' * 4)
- get_element_by_tag_res_outerdiv_html = html.strip()[32:276]
- get_element_by_tag_res_outerdiv_text = get_element_by_tag_res_outerdiv_html[5:-6]
- get_element_by_tag_res_innerspan_html = html.strip()[78:119]
- get_element_by_tag_res_innerspan_text = get_element_by_tag_res_innerspan_html[6:-7]
+ closing tag above should not trick, so this should also be returned
+
+ but this text should not be returned
+ '''
+ GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML = GET_ELEMENT_BY_TAG_TEST_STRING.strip()[32:276]
+ GET_ELEMENT_BY_TAG_RES_OUTERDIV_TEXT = GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML[5:-6]
+ GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML = GET_ELEMENT_BY_TAG_TEST_STRING.strip()[78:119]
+ GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT = GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML[6:-7]
+
+ def test_get_element_text_and_html_by_tag(self):
+ html = self.GET_ELEMENT_BY_TAG_TEST_STRING
self.assertEqual(
get_element_text_and_html_by_tag('div', html),
- (get_element_by_tag_res_outerdiv_text, get_element_by_tag_res_outerdiv_html))
+ (self.GET_ELEMENT_BY_TAG_RES_OUTERDIV_TEXT, self.GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML))
self.assertEqual(
get_element_text_and_html_by_tag('span', html),
- (get_element_by_tag_res_innerspan_text, get_element_by_tag_res_innerspan_html))
+ (self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT, self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML))
self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html)
- def test_get_element_text_and_html_by_tag_malformed(self):
- inner_text = 'inner text'
- malnested_elements = f'
{inner_text}'
- commented_html = ''
- outerdiv_html = f'
{malnested_elements}
'
- html = f'{commented_html}{outerdiv_html}'
-
- self.assertEqual(
- get_element_text_and_html_by_tag('div', html), (malnested_elements, outerdiv_html))
- self.assertEqual(
- get_element_text_and_html_by_tag('malnested_a', html),
- (f'
{inner_text}',
- f'{inner_text}'))
- self.assertEqual(
- get_element_text_and_html_by_tag('malnested_b', html),
- (f'{inner_text}',
- f'{inner_text}'))
- self.assertRaises(
- compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}')
- self.assertRaises(
- compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}')
-
- def test_strict_html_parsing(self):
- class StrictTagParser(HTMLTagParser):
- STRICT = True
-
- parser = StrictTagParser()
- with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'p'"):
- parser.taglist('', reset=True)
- with self.assertRaisesRegex(compat_HTMLParseError, "unclosed tag 'p', 'div'"):
- parser.taglist('', reset=True)
- with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '
'"):
- parser.taglist('
', reset=True)
- with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after ''"):
- parser.taglist('
', reset=True)
- with self.assertRaisesRegex(compat_HTMLParseError, "malformed closing tag 'p<<'"):
- parser.taglist('
', reset=True)
- with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'img'"):
- parser.taglist('
![]()
must be empty', reset=True)
-
- def test_relaxed_html_parsing(self):
- Tag = HTMLTagParser.Tag
- parser = HTMLTagParser()
-
- self.assertEqual(parser.taglist('', reset=True), [])
- self.assertEqual(parser.taglist('
', reset=True), [])
-
- tags = parser.taglist('
', reset=True)
- self.assertEqual(tags, [Tag('div'), Tag('p')])
-
- tags = parser.taglist('
', reset=True)
- self.assertEqual(tags, [Tag('div')])
-
- tags = parser.taglist('
', reset=True)
- self.assertEqual(tags, [Tag('p'), Tag('div')])
- self.assertEqual(tags[0].text_and_html(), ('paragraph', '
paragraph
must be empty', reset=True)
- self.assertEqual(tags, [Tag('img')])
- self.assertEqual(tags[0].text_and_html(), ('', '
![]()
'))
-
- def test_compliant_html_parsing(self):
- # certain elements don't need to be closed (see HTMLTagParser.VOID_TAGS)
- Tag = HTMLTagParser.Tag
- html = '''
- no error without closing tag:
![]()
- self closing is ok:
![]()
- '''
- parser = HTMLTagParser()
- tags = parser.taglist(html, reset=True)
- self.assertEqual(tags, [Tag('img'), Tag('img')])
-
- # don't get fooled by '>' in attributes
- html = '''
![]()
'''
- tags = parser.taglist(html, reset=True)
- self.assertEqual(tags[0].text_and_html(), ('', html))
-
def test_iri_to_uri(self):
self.assertEqual(
iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'),