import re

from .common import InfoExtractor
from ..utils import ExtractorError


class SmotrimIE(InfoExtractor):
    """Extract videos and live streams from smotrim.ru.

    Handles ``brand``, ``video``, ``article``, ``live`` and ``channel`` pages.
    The page is downloaded to locate the embedded player iframe, the player id
    (numeric for videos, a UUID for live streams) is parsed from the iframe
    URL, and the player's JSON config is then queried for the title, the
    thumbnail and the HLS playlist from which all formats are extracted.
    """

    _VALID_URL = r'https?://smotrim\.ru/(?P<type>brand|video|article|live|channel)/(?P<id>[0-9]+)'
    _TESTS = [
        {
            'url': 'https://smotrim.ru/video/3003613',
            'info_dict': {
                'id': '3003613',
                'ext': 'mp4',
                'title': 'Погода. на 2 августа 2025 года',
                'thumbnail': 'https://cdn-st2.smotrim.ru/vh/pictures/xw/635/545/7.jpg',
            },
        },
        {
            'url': 'https://smotrim.ru/article/4609632',
            'info_dict': {
                'id': '3000761',
                'ext': 'mp4',
                'title': 'Новости культуры. Ольга Любимова провела рабочую встречу в Еврейской автономной области',
                'thumbnail': 'https://cdn-st2.smotrim.ru/vh/pictures/xw/638/386/6.jpg',
            },
        },
        {
            'url': 'https://smotrim.ru/brand/64356',
            'info_dict': {
                'id': '2885093',
                'ext': 'mp4',
                'title': 'Большие и маленькие. 6-й сезон 8-й выпуск',
                'thumbnail': 'https://cdn-st2.smotrim.ru/vh/pictures/xw/527/187/3.jpg',
            },
        },
        {  # GEO RESTRICTED
            'url': 'https://smotrim.ru/live/19201',
            'info_dict': {
                'id': '381308c7-a066-4c4f-9656-83e2e792a7b4',
                'ext': 'mp4',
                'title': 'Россия К',
                'thumbnail': 'https://cdn-st2.smotrim.ru/vh/pictures/xw/441/085/7.png',
            },
        },
        {  # GEO RESTRICTED, REDIRECT FROM live, CANONICAL FOR live urls
            'url': 'https://smotrim.ru/channel/4',
            'info_dict': {
                # The extractor reports the player UUID parsed from the
                # iframe, never the channel number from the page URL.
                # NOTE(review): presumably the same stream (and UUID) as
                # /live/19201 above - confirm against the live site.
                'id': '381308c7-a066-4c4f-9656-83e2e792a7b4',
                'ext': 'mp4',
                'title': 'Россия К',
                'thumbnail': 'https://cdn-st2.smotrim.ru/vh/pictures/xw/441/085/7.png',
            },
        },
    ]

    def _real_extract(self, url):
        """Return the info dict for the player embedded in a smotrim.ru page.

        Raises:
            ExtractorError: if the page embeds no player iframe, the iframe
                URL carries no recognisable player id, the player config
                cannot be downloaded, or the config lacks a title or an HLS
                playlist.
        """
        page_id, url_type = self._match_valid_url(url).group('id', 'type')
        webpage = self._download_webpage(url, page_id, f'Resolving {url_type} link')

        # A page without a player is a normal situation (e.g. a brand with no
        # published episodes), so report it as an expected error instead of
        # letting a fatal regex lookup ask the user to file a bug.
        iframe_url = self._search_regex(
            r'<iframe[^>]+\bsrc=["\'](https?://player\.smotrim\.ru/iframe/[^"\']+)',
            webpage, 'iframe URL', default=None)
        if not iframe_url:
            raise ExtractorError('There are no video in this page.', expected=True)

        if url_type in ('live', 'channel'):
            # e.g. https://player.smotrim.ru/iframe/live/uid/381308c7-a066-4c4f-9656-83e2e792a7b4/showZoomBtn/false/isPlay/true/mute/true/sid/smotrim_rk/
            mobj = re.search(
                r'(?P<video_id>[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})',
                iframe_url)
            if not mobj:
                raise ExtractorError('There are no player uuid in this page.', expected=True)
            video_id = mobj.group('video_id')
            jsondata_url = f'https://player.smotrim.ru/iframe/datalive/uid/{video_id}/sid/smotrim'
        else:
            # e.g. https://player.smotrim.ru/iframe/video/id/3000761/sid/smotrim/isPlay/true/mute/true/?acc_video_id=3204061
            mobj = re.search(
                r'^https?://player\.smotrim\.ru/iframe/video/id/(?P<video_id>\d+)/sid/',
                iframe_url)
            if not mobj:
                raise ExtractorError('There are no player id in this page.', expected=True)
            video_id = mobj.group('video_id')
            jsondata_url = f'https://player.smotrim.ru/iframe/datavideo/id/{video_id}/sid/smotrim'

        # Only download failures are re-raised as expected; anything else is
        # a programming error and must surface with its own traceback.
        try:
            json_info = self._download_json(
                jsondata_url, video_id, 'Downloading player config JSON metadata')
        except ExtractorError as e:
            raise ExtractorError(str(e), expected=True) from e

        # The config is third-party data: turn a missing or reshaped field
        # into an ExtractorError rather than a bare KeyError/IndexError.
        try:
            media = json_info['data']['playlist']['medialist'][0]
            title = media['title']
            m3u8_url = media['sources']['m3u8']['auto']
        except (KeyError, IndexError, TypeError) as e:
            raise ExtractorError(
                'Unable to find the title or HLS playlist in the player config') from e

        # The thumbnail is optional metadata; its absence must not abort extraction.
        pictures = media.get('pictures')
        thumbnail = pictures.get('16:9') if isinstance(pictures, dict) else None

        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
            m3u8_url, video_id, 'mp4', m3u8_id='hls')

        return {
            'id': video_id,
            'title': title,
            'thumbnail': thumbnail,
            'formats': formats,
            'subtitles': subtitles,
        }