From 4e4325e343ccad4ffc605d4821886b7beab8e4a1 Mon Sep 17 00:00:00 2001
From: Anton Larionov <11796525+anlar@users.noreply.github.com>
Date: Sun, 15 Jun 2025 15:39:32 +0200
Subject: [PATCH] [ie/mave] Add extractor for Mave Digital (code review fixes from #13380)

---
Notes (not part of the commit message):

* Line breaks and indentation of this mail were restored from a copy in which
  all newlines had been collapsed. The hunk below has 72 removed, 89 added and
  12 context lines, matching the "-1,84 +1,101" header and the diffstat.
* Regex group names that had been stripped as if they were HTML tags are
  restored: the new _VALID_URL uses <channel>/<id> (the names passed to
  .group() in _real_extract) and the old one uses <channel_id>/<short_id>.
  The removed _html_search_regex pattern is presumably
  '<title>(.+?)</title>' -- confirm against blob 42f77408c before applying.
* What the patch does: metadata is no longer scraped from the page title,
  OpenGraph tags and the first '.mp3' string inside __NUXT_DATA__. It is read
  from the result of _search_nuxt_json() via traverse_obj(): the first entry
  under 'data' with a truthy 'activeEpisodeData' is required ('podcast data').
  'id' becomes the episode's own 'id' field, the former '<channel>-<n>' style
  identifier moves to 'display_id' as '<channel>-ep-<n>', and 'vcodec' is set
  to 'none'.

 yt_dlp/extractor/mave.py | 161 ++++++++++++++++++++++-----------------
 1 file changed, 89 insertions(+), 72 deletions(-)

diff --git a/yt_dlp/extractor/mave.py b/yt_dlp/extractor/mave.py
index 42f77408c..1c01f5bd0 100644
--- a/yt_dlp/extractor/mave.py
+++ b/yt_dlp/extractor/mave.py
@@ -1,84 +1,101 @@
-import json
-import re
-
 from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-    get_element_by_id,
-)
+from ..utils import clean_html, int_or_none, parse_iso8601, urljoin
+from ..utils.traversal import require, traverse_obj
 
 
 class MaveIE(InfoExtractor):
-    _VALID_URL = r'https?://(?P<channel_id>[a-z]+)\.mave\.digital/ep-(?P<short_id>[0-9]+)'
-    _TESTS = [
-        {
-            'url': 'https://ochenlichnoe.mave.digital/ep-25',
-            'md5': 'aa3e513ef588b4366df1520657cbc10c',
-            'info_dict': {
-                'id': 'ochenlichnoe-25',
-                'title': 'Между мной и миром: психология самооценки',
-                'description': 'md5:83183d7002dc32fbebc3ccecd4a1ac03',
-                'thumbnail': r're:https?://.*\.jpg$',
-                'ext': 'mp3',
-                'channel': 'Очень личное',
-                'channel_id': 'ochenlichnoe',
-                'channel_url': 'https://ochenlichnoe.mave.digital/',
-            },
+    _VALID_URL = r'https?://(?P<channel>[\w-]+)\.mave\.digital/(?P<id>ep-\d+)'
+    _TESTS = [{
+        'url': 'https://ochenlichnoe.mave.digital/ep-25',
+        'md5': 'aa3e513ef588b4366df1520657cbc10c',
+        'info_dict': {
+            'id': '4035f587-914b-44b6-aa5a-d76685ad9bc2',
+            'ext': 'mp3',
+            'display_id': 'ochenlichnoe-ep-25',
+            'title': 'Между мной и миром: психология самооценки',
+            'description': 'md5:4b7463baaccb6982f326bce5c700382a',
+            'uploader': 'Самарский университет',
+            'channel': 'Очень личное',
+            'channel_id': 'ochenlichnoe',
+            'channel_url': 'https://ochenlichnoe.mave.digital/',
+            'view_count': int,
+            'like_count': int,
+            'dislike_count': int,
+            'duration': 3744,
+            'thumbnail': 'https://api.mave.digital/storage/podcasts/2e0c3749-6df2-4946-82f4-50691419c065/images/f37be842-b1d8-425c-818c-21ebddf16032.jpg',
+            'series': 'Очень личное',
+            'series_id': '2e0c3749-6df2-4946-82f4-50691419c065',
+            'season': 'Season 3',
+            'season_number': 3,
+            'episode': 'Episode 3',
+            'episode_number': 3,
+            'timestamp': 1747817300,
+            'upload_date': '20250521',
         },
-        {
-            'url': 'https://budem.mave.digital/ep-12',
-            'md5': 'e1ce2780fcdb6f17821aa3ca3e8c919f',
-            'info_dict': {
-                'id': 'budem-12',
-                'title': 'Екатерина Михайлова: "Горе от ума" не про женщин написана',
-                'description': 'md5:d9ce1fc1fb5fc7b7a4e7a0b84a7861c3',
-                'thumbnail': r're:https?://.*\.jpg$',
-                'ext': 'mp3',
-                'channel': 'Все там будем',
-                'channel_id': 'budem',
-                'channel_url': 'https://budem.mave.digital/',
-            },
+    }, {
+        'url': 'https://budem.mave.digital/ep-12',
+        'md5': 'e1ce2780fcdb6f17821aa3ca3e8c919f',
+        'info_dict': {
+            'id': '41898bb5-ff57-4797-9236-37a8e537aa21',
+            'ext': 'mp3',
+            'display_id': 'budem-ep-12',
+            'title': 'Екатерина Михайлова: "Горе от ума" не про женщин написана',
+            'description': 'md5:fa3bdd59ee829dfaf16e3efcb13f1d19',
+            'uploader': 'Полина Цветкова+Евгения Акопова',
+            'channel': 'Все там будем',
+            'channel_id': 'budem',
+            'channel_url': 'https://budem.mave.digital/',
+            'view_count': int,
+            'like_count': int,
+            'dislike_count': int,
+            'age_limit': 18,
+            'duration': 3664,
+            'thumbnail': 'https://api.mave.digital/storage/podcasts/fe9347bf-c009-4ebd-87e8-b06f2f324746/images/985679d7-ccd7-4232-8fe4-5eafca1be190.jpg',
+            'series': 'Все там будем',
+            'series_id': 'fe9347bf-c009-4ebd-87e8-b06f2f324746',
+            'season': 'Season 2',
+            'season_number': 2,
+            'episode': 'Episode 5',
+            'episode_number': 5,
+            'timestamp': 1735538400,
+            'upload_date': '20241230',
         },
-    ]
+    }]
+
+    _API_BASE_URL = 'https://api.mave.digital/'
 
     def _real_extract(self, url):
-        channel_id, short_id = self._match_valid_url(url).group('channel_id', 'short_id')
-
-        channel_url = f'https://{channel_id}.mave.digital/'
-
-        video_id = f'{channel_id}-{short_id}'
-
-        webpage = self._download_webpage(url, video_id)
-
-        # Format: "TITLE — Подкаст «CHANNEL»"
-        page_title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title')
-        match = re.search(r'^(.+?)\s*—\s*(.+?)«(.+?)»', page_title)
-        title = match.group(1).strip()
-        channel = match.group(3).strip()
+        channel_id, slug = self._match_valid_url(url).group('channel', 'id')
+        display_id = f'{channel_id}-{slug}'
+        webpage = self._download_webpage(url, display_id)
+        data = traverse_obj(
+            self._search_nuxt_json(webpage, display_id),
+            ('data', lambda _, v: v['activeEpisodeData'], any, {require('podcast data')}))
 
         return {
-            'id': video_id,
-            'title': title,
-            'description': self._og_search_description(webpage),
-            'channel': channel,
+            'display_id': display_id,
             'channel_id': channel_id,
-            'channel_url': channel_url,
-            'url': self._mave_link(webpage, video_id),
-            'thumbnail': self._og_search_thumbnail(webpage),
+            'channel_url': f'https://{channel_id}.mave.digital/',
+            'vcodec': 'none',
+            **traverse_obj(data, ('activeEpisodeData', {
+                'url': ('audio', {urljoin(self._API_BASE_URL)}),
+                'id': ('id', {str}),
+                'title': ('title', {str}),
+                'description': ('description', {clean_html}),
+                'thumbnail': ('image', {urljoin(self._API_BASE_URL)}),
+                'duration': ('duration', {int_or_none}),
+                'season_number': ('season', {int_or_none}),
+                'episode_number': ('number', {int_or_none}),
+                'view_count': ('listenings', {int_or_none}),
+                'like_count': ('reactions', lambda _, v: v['type'] == 'like', 'count', {int_or_none}, any),
+                'dislike_count': ('reactions', lambda _, v: v['type'] == 'dislike', 'count', {int_or_none}, any),
+                'age_limit': ('is_explicit', {bool}, {lambda x: 18 if x else None}),
+                'timestamp': ('publish_date', {parse_iso8601}),
+            })),
+            **traverse_obj(data, ('podcast', 'podcast', {
+                'series_id': ('id', {str}),
+                'series': ('title', {str}),
+                'channel': ('title', {str}),
+                'uploader': ('author', {str}),
+            })),
         }
-
-    def _mave_link(self, webpage, video_id):
-        data = get_element_by_id('__NUXT_DATA__', webpage)
-
-        jdata = json.loads(data)
-
-        for value in jdata:
-            if isinstance(value, str):
-                if value.endswith('.mp3'):
-                    link_id = value
-                    break
-
-        if link_id is None:
-            raise ExtractorError('Unable to find mp3 file link', video_id=video_id)
-
-        return 'https://api.mave.digital/' + link_id