diff --git a/test/test_utils.py b/test/test_utils.py index 022e821a6..d9a62258c 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -21,6 +21,14 @@ compat_HTMLParseError, compat_os_name, ) +from yt_dlp.parsing import ( + HTMLTagParser, + FirstMatchingElementParser, +) + +# some testcases don't work with current functions +get_element_text_and_html_by_tag = FirstMatchingElementParser.get_element_text_and_html_by_tag + from yt_dlp.utils import ( Config, DateRange, @@ -60,7 +68,6 @@ get_element_by_class, get_element_html_by_attribute, get_element_html_by_class, - get_element_text_and_html_by_tag, get_elements_by_attribute, get_elements_by_class, get_elements_html_by_attribute, @@ -1797,11 +1804,14 @@ def test_get_element_text_and_html_by_tag(self): self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html) def test_get_element_text_and_html_by_tag_malformed(self): - inner_text = 'inner_text' + inner_text = 'inner text' malnested_elements = f'{inner_text}' - html = f'
{malnested_elements}
' + commented_html = '' + outerdiv_html = f'
{malnested_elements}
' + html = f'{commented_html}{outerdiv_html}' - self.assertEqual(get_element_text_and_html_by_tag('div', html), (malnested_elements, html)) + self.assertEqual( + get_element_text_and_html_by_tag('div', html), (malnested_elements, outerdiv_html)) self.assertEqual( get_element_text_and_html_by_tag('malnested_a', html), (f'{inner_text}', @@ -1815,6 +1825,61 @@ def test_get_element_text_and_html_by_tag_malformed(self): self.assertRaises( compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}') + def test_strict_html_parsing(self): + class StrictTagParser(HTMLTagParser): + STRICT = True + + parser = StrictTagParser() + with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'p'"): + parser.taglist('

', reset=True) + with self.assertRaisesRegex(compat_HTMLParseError, "unclosed tag 'p', 'div'"): + parser.taglist('

', reset=True) + with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '

'"): + parser.taglist('

', reset=True) + with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '

'"): + parser.taglist('

/p>

', reset=True) + with self.assertRaisesRegex(compat_HTMLParseError, "malformed closing tag 'p<<'"): + parser.taglist('

', reset=True) + with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'img'"): + parser.taglist('must be empty', reset=True) + + def test_relaxed_html_parsing(self): + Tag = HTMLTagParser.Tag + parser = HTMLTagParser() + + self.assertEqual(parser.taglist('

', reset=True), []) + self.assertEqual(parser.taglist('

', reset=True), []) + + tags = parser.taglist('

', reset=True) + self.assertEqual(tags, [Tag('div'), Tag('p')]) + + tags = parser.taglist('

/p>

', reset=True) + self.assertEqual(tags, [Tag('div')]) + + tags = parser.taglist('

paragraph

', reset=True) + self.assertEqual(tags, [Tag('p'), Tag('div')]) + self.assertEqual(tags[0].text_and_html(), ('paragraph', '

paragraphmust be empty', reset=True) + self.assertEqual(tags, [Tag('img')]) + self.assertEqual(tags[0].text_and_html(), ('', '')) + + def test_compliant_html_parsing(self): + # certain elements don't need to be closed (see HTMLTagParser.VOID_TAGS) + Tag = HTMLTagParser.Tag + html = ''' + no error without closing tag: + self closing is ok: + ''' + parser = HTMLTagParser() + tags = parser.taglist(html, reset=True) + self.assertEqual(tags, [Tag('img'), Tag('img')]) + + # don't get fooled by '>' in attributes + html = '''''' + tags = parser.taglist(html, reset=True) + self.assertEqual(tags[0].text_and_html(), ('', html)) + def test_iri_to_uri(self): self.assertEqual( iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'), diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py new file mode 100644 index 000000000..d0dcf450a --- /dev/null +++ b/yt_dlp/parsing.py @@ -0,0 +1,219 @@ +import collections +import contextlib +import itertools +import re +from html.parser import HTMLParser + +from .utils import orderedSet + +from .compat import compat_HTMLParseError + + +class HTMLTagParser(HTMLParser): + """HTML parser which acts as iterator + returns found elements as instances of Tag + nested elements will be returned before its parents + + strict=True raises compat_HTMLParseError on malformed html + + two modes of usage: + # as an lazy iterator: + for tag_obj in HTMLTagParser(html): + tag_obj.text_and_html() + + # or return a list with all found tag objects + # this is faster by factor 2-5 compared to iteration + for tag_obj in HTMLTagParser(html).taglist(): + tag_obj.text_and_html() + """ + + STRICT = False + ANY_TAG_REGEX = re.compile(r'''<(?:"[^"]*"|'[^']*'|[^"'>])*?>''') + CLOSING_TAG_REGEX = re.compile(r']+(?:\s*>)?') + VOID_TAGS = { + 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', + 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr', + } + + class Tag: + __slots__ = 'name', 'string', 'start', 'start_len', 'stop', 'attrs' + + def __init__(self, name, *, string='', start=None, stop=None, attrs=()): + self.name = name + self.string = string + self.start = start + self.start_len = 0 + self.stop = stop + self.attrs = tuple(attrs) + + def __str__(self): + return self.name + + def __repr__(self): + return f'{self.__class__.__name__}({str(self)!r})' + + def __eq__(self, other): + return self.name == other + + def html(self): + return self.string[self.start:self.stop] + + def text_and_html(self): + assert isinstance(self.start, int) + if not self.start_len: + match = HTMLTagParser.ANY_TAG_REGEX.match(self.string[self.start:]) + assert match + self.start_len = len(match.group()) + if self.stop is None: + return '', self.string[self.start: self.start + self.start_len] + html = self.html() + cidx = html.rindex('') or tag in self.VOID_TAGS: + if self.callback(obj) is not False: + self.found_tags.append(obj) + return + else: + obj = None + + self.tagstack.appendleft(obj or tag) + + handle_startendtag = handle_starttag + + def handle_endtag(self, tag): + if '<' in tag: + if self.STRICT: + raise compat_HTMLParseError(f'malformed closing tag {tag!r}') + tag = tag[:tag.index('<')] + + try: + idx = self.tagstack.index(tag) + if self.STRICT and idx: + open_tags = ''.join(f'' for tag in itertools.islice(self.tagstack, idx)) + raise compat_HTMLParseError( + f'malnested closing tag {tag!r}, expected after {open_tags!r}') + tag_obj = self.tagstack[idx] + self.tagstack.remove(tag) + if not isinstance(tag_obj, str): + # since we landed here we'll always find a closing tag + match = self.CLOSING_TAG_REGEX.match(self.rawdata[self._offset:]) + tag_obj.stop = self._offset + match.end() + if self.callback(tag_obj) is not False: + self.found_tags.append(tag_obj) + except ValueError as exc: + if isinstance(exc, compat_HTMLParseError): + raise + elif self.STRICT: + raise compat_HTMLParseError(f'stray closing tag {tag!r}') + + +class ClassParser(HTMLTagParser): + def __init__(self, attribute, matchfunc, stop): + super().__init__() + self.search_attr = attribute + self.matchfunc = matchfunc + self.stop = stop + self.processing = 0 + + def predicate(self, tag, attrs): + if self.processing <= 0 and self.stop is not None and self._offset > self.stop: + self.abort() + string = dict(attrs).get(self.search_attr, '') + if self.matchfunc(string): + self.processing += 1 + return True + return False + + def callback(self, tag_obj): + if self.stop is None: + self.abort(tag_obj) + self.processing -= 1 + + @classmethod + def get_elements_html_by_class(cls, class_name, html): + regex = re.compile(rf'[\w\- ]*\b{re.escape(class_name)}\b') + it = re.finditer(rf'<.+ class=[\'"]{regex.pattern}', html) + start = stop = None + for match in it: + if start is None: + start = match.start() + else: + stop = match.end() + if start is None: + return [] + parser = cls('class', lambda x: regex.match(x), stop) + return [tag.html() for tag in parser.taglist(html[start:])] + + +class FirstMatchingElementParser(HTMLTagParser): + def __init__(self, matchfunc): + super().__init__() + self.matchfunc = matchfunc + self.found = False + + def predicate(self, tag, attrs): + if not self.found and self.matchfunc(tag, attrs): + self.found = True + return True + return False + + def callback(self, obj): + self.abort(obj) + + @classmethod + def get_element_text_and_html_by_tag(cls, tag, html): + """ + For the first element with the specified tag in the given HTML document + return its content (text) and the whole element (html) + """ + parser = cls(lambda _tag, _: _tag == tag) + for tag_obj in parser.taglist(html): + return tag_obj.text_and_html() + raise compat_HTMLParseError(f'tag {tag} not found')