diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index 32b4680b7..98e690365 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -30,7 +30,7 @@
 from ..downloader.f4m import get_base_url, remove_encrypted_media
 from ..downloader.hls import HlsFD
 from ..globals import plugin_ies_overrides
-from ..networking import HEADRequest, Request
+from ..networking import HEADRequest, Request, Response
 from ..networking.exceptions import (
     HTTPError,
     IncompleteRead,
@@ -102,6 +102,7 @@
 )
 from ..utils._utils import _request_dump_filename
 from ..utils.jslib import devalue
+from ..utils.networking import HTTPHeaderDict
 
 
 class InfoExtractor:
@@ -571,6 +572,7 @@ class InfoExtractor:
     _ready = False
     _downloader = None
     _x_forwarded_for_ip = None
+    _latest_firefox_user_agent = None
     _GEO_BYPASS = True
     _GEO_COUNTRIES = None
     _GEO_IP_BLOCKS = None
@@ -1202,6 +1204,52 @@ def _download_webpage(
                     raise e
                 self._sleep(timeout, video_id)
 
+    def _get_latest_firefox_user_agent(self, fatal=False):
+        if InfoExtractor._latest_firefox_user_agent:
+            return InfoExtractor._latest_firefox_user_agent
+
+        USER_AGENT_TMPL = 'Mozilla/5.0 (Windows NT 10.0; rv:{0}.0) Gecko/20100101 Firefox/{0}.0'
+        DEFAULT_VERSION = '140'  # If not fatal, default to latest major version as of 2025.06.24
+        ff_version = None
+
+        # Ref: https://ftp.mozilla.org/pub/firefox/releases/latest/README.txt
+        urlh = self._request_webpage(
+            HEADRequest('https://download.mozilla.org/'), None,
+            'Fetching latest Firefox version number', 'Unable to fetch latest Firefox version number',
+            fatal=fatal, query={'product': 'firefox-latest', 'os': 'linux64', 'lang': 'en-US'})
+        if isinstance(urlh, Response):
+            ff_version = self._search_regex(
+                r'/releases/(\d{3})', urlh.url, 'latest Firefox version number', fatal=fatal)
+
+        if ff_version:
+            InfoExtractor._latest_firefox_user_agent = USER_AGENT_TMPL.format(ff_version)
+            return InfoExtractor._latest_firefox_user_agent
+
+        self.write_debug(f'Using default Firefox {DEFAULT_VERSION} user-agent instead of latest')
+        return USER_AGENT_TMPL.format(DEFAULT_VERSION)
+
+    def _download_firefox_webpage(self, url, video_id, *args, **kwargs):
+        impersonate = kwargs.pop('impersonate', None)
+        require_impersonation = kwargs.pop('require_impersonation', False)
+        headers = HTTPHeaderDict(kwargs.pop('headers', None))
+        headers.update({'User-Agent': self._get_latest_firefox_user_agent()})
+        kwargs.update({'headers': headers})
+
+        try:
+            return self._download_webpage(url, video_id, *args, **kwargs)
+        except ExtractorError as e:
+            if not isinstance(e.cause, HTTPError) or e.cause.status != 403:
+                raise
+            self.write_debug(f'{video_id}: Got HTTP Error 403, retrying with impersonation')
+
+        # Retry with impersonation if user-agent alone is insufficient
+        headers.pop('User-Agent')
+        kwargs.update({
+            'impersonate': impersonate if impersonate is not None else 'firefox',
+            'require_impersonation': require_impersonation,
+        })
+        return self._download_webpage(url, video_id, *args, **kwargs)
+
     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
         idstr = format_field(video_id, None, '%s: ')
         msg = f'[{self.IE_NAME}] {idstr}{msg}'
diff --git a/yt_dlp/extractor/francaisfacile.py b/yt_dlp/extractor/francaisfacile.py
index d3208c282..4e5167f39 100644
--- a/yt_dlp/extractor/francaisfacile.py
+++ b/yt_dlp/extractor/francaisfacile.py
@@ -1,9 +1,7 @@
 import urllib.parse
 
 from .common import InfoExtractor
-from ..networking.exceptions import HTTPError
 from ..utils import (
-    ExtractorError,
     float_or_none,
     url_or_none,
 )
@@ -58,16 +56,8 @@ class FrancaisFacileIE(InfoExtractor):
 
     def _real_extract(self, url):
         display_id = urllib.parse.unquote(self._match_id(url))
-
-        try:  # yt-dlp's default user-agents are too old and blocked by the site
-            webpage = self._download_webpage(url, display_id, headers={
-                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:136.0) Gecko/20100101 Firefox/136.0',
-            })
-        except ExtractorError as e:
-            if not isinstance(e.cause, HTTPError) or e.cause.status != 403:
-                raise
-            # Retry with impersonation if hardcoded UA is insufficient
-            webpage = self._download_webpage(url, display_id, impersonate=True)
+        # yt-dlp's default Chrome user-agents are too old and blocked by the website
+        webpage = self._download_firefox_webpage(url, display_id, impersonate=True)
 
         data = self._search_json(
             r'<script[^>]+\bdata-media-id=[^>]+\btype="application/json"[^>]*>',
diff --git a/yt_dlp/extractor/mitele.py b/yt_dlp/extractor/mitele.py
index 0dded38c6..a26e18a4a 100644
--- a/yt_dlp/extractor/mitele.py
+++ b/yt_dlp/extractor/mitele.py
@@ -79,7 +79,8 @@ class MiTeleIE(TelecincoBaseIE):
 
     def _real_extract(self, url):
         display_id = self._match_id(url)
-        webpage = self._download_akamai_webpage(url, display_id)
+        # yt-dlp's default Chrome user-agents are too old and blocked by akamai
+        webpage = self._download_firefox_webpage(url, display_id, impersonate=True)
         pre_player = self._search_json(
             r'window\.\$REACTBASE_STATE\.prePlayer_mtweb\s*=',
             webpage, 'Pre Player', display_id)['prePlayer']
diff --git a/yt_dlp/extractor/sproutvideo.py b/yt_dlp/extractor/sproutvideo.py
index 764c78f1e..106a09aac 100644
--- a/yt_dlp/extractor/sproutvideo.py
+++ b/yt_dlp/extractor/sproutvideo.py
@@ -98,11 +98,8 @@ def _extract_embed_urls(cls, url, webpage):
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
         video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id, headers={
-            **traverse_obj(smuggled_data, {'Referer': 'referer'}),
-            # yt-dlp's default Chrome user-agents are too old
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:140.0) Gecko/20100101 Firefox/140.0',
-        })
+        webpage = self._download_firefox_webpage(
+            url, video_id, headers=traverse_obj(smuggled_data, {'Referer': 'referer'}))
         data = self._search_json(
             r'var\s+(?:dat|playerInfo)\s*=\s*["\']', webpage, 'player info', video_id,
             contains_pattern=r'[A-Za-z0-9+/=]+', end_pattern=r'["\'];',
diff --git a/yt_dlp/extractor/telecinco.py b/yt_dlp/extractor/telecinco.py
index 2dbe2a776..45d368f71 100644
--- a/yt_dlp/extractor/telecinco.py
+++ b/yt_dlp/extractor/telecinco.py
@@ -63,17 +63,6 @@ def _parse_content(self, content, url):
             'http_headers': headers,
         }
 
-    def _download_akamai_webpage(self, url, display_id):
-        try:  # yt-dlp's default user-agents are too old and blocked by akamai
-            return self._download_webpage(url, display_id, headers={
-                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:136.0) Gecko/20100101 Firefox/136.0',
-            })
-        except ExtractorError as e:
-            if not isinstance(e.cause, HTTPError) or e.cause.status != 403:
-                raise
-            # Retry with impersonation if hardcoded UA is insufficient to bypass akamai
-            return self._download_webpage(url, display_id, impersonate=True)
-
 
 class TelecincoIE(TelecincoBaseIE):
     IE_DESC = 'telecinco.es, cuatro.com and mediaset.es'
@@ -151,7 +140,8 @@ class TelecincoIE(TelecincoBaseIE):
 
     def _real_extract(self, url):
         display_id = self._match_id(url)
-        webpage = self._download_akamai_webpage(url, display_id)
+        # yt-dlp's default Chrome user-agents are too old and blocked by akamai
+        webpage = self._download_firefox_webpage(url, display_id, impersonate=True)
         article = self._search_json(
             r'window\.\$REACTBASE_STATE\.article(?:_multisite)?\s*=',
             webpage, 'article', display_id)['article']