@staticmethod
def _sanitize_urls(urls):
    """Return a new list with every embed URL normalized.

    Each URL is processed in two steps:
      1. a protocol-relative URL (leading ``//``) gets an explicit
         ``https:`` scheme;
      2. the URL is rebuilt as ``urljoin(base_url(u), url_basename(u))``
         to clean up iframe URLs.

    ``urls`` may be any iterable of strings (list, tuple, generator).
    The input is never mutated; the previous in-place ``urls[i] = ...``
    assignment raised ``TypeError`` whenever a tuple was passed in, which
    is exactly what ``_extract_embed_urls`` did.
    """
    sanitized = []
    for embed_url in urls:
        # add protocol if missing
        if embed_url.startswith('//'):
            embed_url = f'https:{embed_url}'
        # clean iframes urls
        sanitized.append(urljoin(base_url(embed_url), url_basename(embed_url)))
    return sanitized


@classmethod
def _extract_embed_urls(cls, url, webpage):
    """Return the parent class's embed URLs for ``webpage``, sanitized.

    The parent result is handed straight to ``_sanitize_urls``, which
    accepts any iterable, so no intermediate tuple is materialized.
    """
    return cls._sanitize_urls(super()._extract_embed_urls(url, webpage))