', reset=True)
+ with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'img'"):
+ parser.taglist('
must be empty', reset=True)
+
+ def test_relaxed_html_parsing(self):
+ Tag = HTMLTagParser.Tag
+ parser = HTMLTagParser()
+
+ self.assertEqual(parser.taglist('', reset=True), [])
+ self.assertEqual(parser.taglist('
', reset=True), [])
+
+ tags = parser.taglist('
', reset=True)
+ self.assertEqual(tags, [Tag('div'), Tag('p')])
+
+ tags = parser.taglist('
', reset=True)
+ self.assertEqual(tags, [Tag('div')])
+
+ tags = parser.taglist('
', reset=True)
+ self.assertEqual(tags, [Tag('p'), Tag('div')])
+ self.assertEqual(tags[0].text_and_html(), ('paragraph', '
paragraph
must be empty', reset=True)
+ self.assertEqual(tags, [Tag('img')])
+ self.assertEqual(tags[0].text_and_html(), ('', '
![]()
'))
+
+ def test_compliant_html_parsing(self):
+ # certain elements don't need to be closed (see HTMLTagParser.VOID_TAGS)
+ Tag = HTMLTagParser.Tag
+ html = '''
+ no error without closing tag:
![]()
+ self closing is ok:
![]()
+ '''
+ parser = HTMLTagParser()
+ tags = parser.taglist(html, reset=True)
+ self.assertEqual(tags, [Tag('img'), Tag('img')])
+
+ # don't get fooled by '>' in attributes
+ html = '''
![]()
'''
+ tags = parser.taglist(html, reset=True)
+ self.assertEqual(tags[0].text_and_html(), ('', html))
+
def test_iri_to_uri(self):
self.assertEqual(
iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'),
diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py
new file mode 100644
index 000000000..d0dcf450a
--- /dev/null
+++ b/yt_dlp/parsing.py
@@ -0,0 +1,219 @@
+import collections
+import contextlib
+import itertools
+import re
+from html.parser import HTMLParser
+
+from .utils import orderedSet
+
+from .compat import compat_HTMLParseError
+
+
+class HTMLTagParser(HTMLParser):
+    """HTML parser that collects the elements it finds as Tag instances.
+
+    Elements with a closing tag are recorded when that closing tag is seen,
+    so nested elements are returned before their parents.  Void and
+    self-closing elements are recorded as soon as their start tag is parsed.
+
+    When the class attribute STRICT is true, compat_HTMLParseError is raised
+    on malformed, malnested, stray or unclosed tags; otherwise such input is
+    tolerated.
+
+    Usage:
+        for tag_obj in HTMLTagParser().taglist(html):
+            tag_obj.text_and_html()
+
+    Subclasses narrow the result by overriding predicate() and callback().
+    """
+
+    STRICT = False
+    ANY_TAG_REGEX = re.compile(r'''<(?:"[^"]*"|'[^']*'|[^"'>])*?>''')
+    # matches a closing tag starting at its '<'; the trailing '>' is optional
+    CLOSING_TAG_REGEX = re.compile(r'</\s*[^\s<>]+(?:\s*>)?')
+    VOID_TAGS = {
+        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
+        'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr',
+    }
+
+    class Tag:
+        """An element located inside `string`, spanning string[start:stop]."""
+
+        __slots__ = 'name', 'string', 'start', 'start_len', 'stop', 'attrs'
+
+        def __init__(self, name, *, string='', start=None, stop=None, attrs=()):
+            self.name = name
+            self.string = string
+            self.start = start
+            # length of the opening tag; 0 until it has been measured
+            self.start_len = 0
+            # None until a closing tag has been matched for this element
+            self.stop = stop
+            self.attrs = tuple(attrs)
+
+        def __str__(self):
+            return self.name
+
+        def __repr__(self):
+            return f'{self.__class__.__name__}({str(self)!r})'
+
+        def __eq__(self, other):
+            # compares by name only, so a Tag equals the plain tag name;
+            # the tag stack lookups in handle_endtag() depend on this
+            return self.name == other
+
+        def html(self):
+            """Return the markup of the element, i.e. string[start:stop]."""
+            return self.string[self.start:self.stop]
+
+        def text_and_html(self):
+            """Return a tuple (inner content, complete markup) of the element.
+
+            For an element without a closing tag the inner content is ''
+            and the markup is the opening tag alone.
+            """
+            assert isinstance(self.start, int)
+            if not self.start_len:
+                # match at an offset instead of copying the rest of the document
+                match = HTMLTagParser.ANY_TAG_REGEX.match(self.string, self.start)
+                assert match
+                self.start_len = len(match.group())
+            if self.stop is None:
+                return '', self.string[self.start: self.start + self.start_len]
+            html = self.html()
+            # the inner content ends where the last closing tag begins
+            cidx = html.rindex('</')
+            return html[self.start_len:cidx], html
+
+    class EarlyExitException(Exception):
+        """Raised by abort() to stop feeding; suppressed by taglist()."""
+
+    def __init__(self):
+        super().__init__()
+        # currently open elements, innermost first; selected elements are
+        # stored as Tag objects, all others as plain tag names
+        self.tagstack = collections.deque()
+        self._offset = self.offset
+        self.found_tags = []
+
+    def predicate(self, tag, attrs):
+        """Decide whether a Tag object is created for this start tag."""
+        return True
+
+    def callback(self, tag_obj):
+        """Called for each completed Tag; returning False drops it from the result."""
+        pass
+
+    def abort(self, last_tag=None):
+        """Stop parsing, optionally adding `last_tag` to the result first."""
+        if last_tag:
+            self.found_tags.append(last_tag)
+        raise HTMLTagParser.EarlyExitException()
+
+    def taglist(self, data, reset=True):
+        """Parse `data` and return the list of collected Tag objects.
+
+        The returned list is the parser's own result list, which is emptied
+        again by the next call.  With `reset` true the parser state and the
+        stack of open tags are cleared before parsing.
+
+        Raises compat_HTMLParseError if STRICT is set and tags remain unclosed.
+        """
+        self.found_tags.clear()
+        if reset:
+            self.reset()
+            self.tagstack.clear()
+        with contextlib.suppress(HTMLTagParser.EarlyExitException):
+            self.feed(data)
+        if self.STRICT and self.tagstack:
+            orphans = ', '.join(map(repr, map(str, orderedSet(self.tagstack, lazy=True))))
+            raise compat_HTMLParseError(f'unclosed tag {orphans}')
+        return self.found_tags
+
+    def updatepos(self, i, j):
+        # remember the position reported by the base parser; the tag handlers
+        # use it as the index of the tag they are given
+        offset = self._offset = super().updatepos(i, j)
+        return offset
+
+    def handle_starttag(self, tag, attrs):
+        try:
+            # we use internal variable for performance reason
+            tag_text = getattr(self, '_HTMLParser__starttag_text')
+        except AttributeError:
+            tag_text = HTMLTagParser.ANY_TAG_REGEX.match(self.rawdata, self._offset).group()
+        childless = tag_text.endswith('/>') or tag in self.VOID_TAGS
+
+        if not self.predicate(tag, attrs):
+            # keep unselected open elements by name so that closing tags pair
+            # up correctly; a childless element has no closing tag to wait
+            # for and would otherwise stay on the stack forever
+            if not childless:
+                self.tagstack.appendleft(tag)
+            return
+
+        obj = self.Tag(
+            tag, string=self.rawdata, start=self._offset, attrs=attrs)
+        obj.start_len = len(tag_text)
+        if childless:
+            if self.callback(obj) is not False:
+                self.found_tags.append(obj)
+            return
+        self.tagstack.appendleft(obj)
+
+    handle_startendtag = handle_starttag
+
+    def handle_endtag(self, tag):
+        if '<' in tag:
+            if self.STRICT:
+                raise compat_HTMLParseError(f'malformed closing tag {tag!r}')
+            tag = tag[:tag.index('<')]
+
+        # only the stack lookup may signal a stray tag through ValueError
+        try:
+            idx = self.tagstack.index(tag)
+        except ValueError:
+            if self.STRICT:
+                raise compat_HTMLParseError(f'stray closing tag {tag!r}')
+            return
+
+        if self.STRICT and idx:
+            open_tags = ''.join(f'</{name}>' for name in itertools.islice(self.tagstack, idx))
+            raise compat_HTMLParseError(
+                f'malnested closing tag {tag!r}, expected after {open_tags!r}')
+        tag_obj = self.tagstack[idx]
+        del self.tagstack[idx]
+        if not isinstance(tag_obj, str):
+            # since we landed here we'll always find a closing tag
+            match = self.CLOSING_TAG_REGEX.match(self.rawdata, self._offset)
+            tag_obj.stop = match.end()
+            if self.callback(tag_obj) is not False:
+                self.found_tags.append(tag_obj)
+
+
+class ClassParser(HTMLTagParser):
+    """Collect the elements whose `attribute` value satisfies `matchfunc`.
+
+    `stop` is an offset into the parsed data: parsing is aborted at the
+    first start tag beyond it that is met while no selected element is
+    still open.  With stop=None parsing is aborted as soon as the first
+    selected element is complete.
+    """
+
+    def __init__(self, attribute, matchfunc, stop):
+        super().__init__()
+        self.search_attr = attribute
+        self.matchfunc = matchfunc
+        self.stop = stop
+        # number of selected elements that were opened but not yet reported
+        self.processing = 0
+
+    def predicate(self, tag, attrs):
+        if self.processing <= 0 and self.stop is not None and self._offset > self.stop:
+            self.abort()
+        # an attribute given without a value is reported as None
+        value = dict(attrs).get(self.search_attr) or ''
+        if self.matchfunc(value):
+            self.processing += 1
+            return True
+        return False
+
+    def callback(self, tag_obj):
+        if self.stop is None:
+            self.abort(tag_obj)
+        self.processing -= 1
+
+    @classmethod
+    def get_elements_html_by_class(cls, class_name, html):
+        """Return the markup of every element in `html` having class `class_name`.
+
+        A regex scan finds the first and the last candidate so that only the
+        part of the document in between has to be parsed.
+        """
+        regex = re.compile(rf'[\w\- ]*\b{re.escape(class_name)}\b')
+        start = stop = None
+        for match in re.finditer(rf'<.+ class=[\'"]{regex.pattern}', html):
+            if start is None:
+                start = match.start()
+            else:
+                # the parser is fed html[start:], so the offset has to be
+                # relative to that substring
+                stop = match.end() - start
+        if start is None:
+            return []
+        parser = cls('class', regex.match, stop)
+        return [tag.html() for tag in parser.taglist(html[start:])]
+
+
+class FirstMatchingElementParser(HTMLTagParser):
+    """Select only the first element accepted by matchfunc(tag, attrs)."""
+
+    def __init__(self, matchfunc):
+        super().__init__()
+        self.matchfunc = matchfunc
+        # becomes True once an element has been selected
+        self.found = False
+
+    def predicate(self, tag, attrs):
+        # after the first hit matchfunc is not consulted any more
+        if self.found or not self.matchfunc(tag, attrs):
+            return False
+        self.found = True
+        return True
+
+    def callback(self, obj):
+        # the selected element is complete: record it and stop parsing
+        self.abort(obj)
+
+    @classmethod
+    def get_element_text_and_html_by_tag(cls, tag, html):
+        """
+        Return the content (text) and the whole element (html) of the first
+        element with the specified tag in the given HTML document
+
+        Raises compat_HTMLParseError if there is no such element
+        """
+        def has_wanted_name(name, _attrs):
+            return name == tag
+
+        matches = cls(has_wanted_name).taglist(html)
+        if not matches:
+            raise compat_HTMLParseError(f'tag {tag} not found')
+        return matches[0].text_and_html()