1
0
mirror of https://github.com/yt-dlp/yt-dlp.git synced 2025-07-01 19:08:32 +00:00
yt-dlp/yt_dlp/extractor/mave.py

113 lines
5.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
from .common import InfoExtractor
from ..utils import clean_html, int_or_none, parse_iso8601, urljoin
from ..utils.traversal import require, traverse_obj
class MaveIE(InfoExtractor):
_VALID_URL = r'https?://(?P<channel>[\w-]+)\.mave\.digital/(?P<id>ep-\d+)'
_TESTS = [{
'url': 'https://ochenlichnoe.mave.digital/ep-25',
'md5': 'aa3e513ef588b4366df1520657cbc10c',
'info_dict': {
'id': '4035f587-914b-44b6-aa5a-d76685ad9bc2',
'ext': 'mp3',
'display_id': 'ochenlichnoe-ep-25',
'title': 'Между мной и миром: психология самооценки',
'description': 'md5:4b7463baaccb6982f326bce5c700382a',
'uploader': 'Самарский университет',
'channel': 'Очень личное',
'channel_id': 'ochenlichnoe',
'channel_url': 'https://ochenlichnoe.mave.digital/',
'view_count': int,
'like_count': int,
'dislike_count': int,
'duration': 3744,
'thumbnail': r're:https://.*/storage/podcasts/2e0c3749-6df2-4946-82f4-50691419c065/images/f37be842-b1d8-425c-818c-21ebddf16032.jpg',
'series': 'Очень личное',
'series_id': '2e0c3749-6df2-4946-82f4-50691419c065',
'season': 'Season 3',
'season_number': 3,
'episode': 'Episode 3',
'episode_number': 3,
'timestamp': 1747817300,
'upload_date': '20250521',
},
}, {
'url': 'https://budem.mave.digital/ep-12',
'md5': 'e1ce2780fcdb6f17821aa3ca3e8c919f',
'info_dict': {
'id': '41898bb5-ff57-4797-9236-37a8e537aa21',
'ext': 'mp3',
'display_id': 'budem-ep-12',
'title': 'Екатерина Михайлова: "Горе от ума" не про женщин написана',
'description': 'md5:fa3bdd59ee829dfaf16e3efcb13f1d19',
'uploader': 'Полина Цветкова+Евгения Акопова',
'channel': 'Все там будем',
'channel_id': 'budem',
'channel_url': 'https://budem.mave.digital/',
'view_count': int,
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'duration': 3664,
'thumbnail': r're:https://.*/storage/podcasts/fe9347bf-c009-4ebd-87e8-b06f2f324746/images/985679d7-ccd7-4232-8fe4-5eafca1be190.jpg',
'series': 'Все там будем',
'series_id': 'fe9347bf-c009-4ebd-87e8-b06f2f324746',
'season': 'Season 2',
'season_number': 2,
'episode': 'Episode 5',
'episode_number': 5,
'timestamp': 1735538400,
'upload_date': '20241230',
},
}]
_API_BASE_URL = 'https://api.mave.digital/'
def _real_extract(self, url):
channel_id, slug = self._match_valid_url(url).group('channel', 'id')
display_id = f'{channel_id}-{slug}'
webpage = self._download_webpage(url, display_id)
data = traverse_obj(
self._search_nuxt_json(webpage, display_id),
('data', lambda _, v: v['activeEpisodeData'], any, {require('podcast data')}))
return {
'display_id': display_id,
'channel_id': channel_id,
'channel_url': f'https://{channel_id}.mave.digital/',
'vcodec': 'none',
'thumbnail': self._extract_thumbnail(webpage),
**traverse_obj(data, ('activeEpisodeData', {
'url': ('audio', {urljoin(self._API_BASE_URL)}),
'id': ('id', {str}),
'title': ('title', {str}),
'description': ('description', {clean_html}),
'duration': ('duration', {int_or_none}),
'season_number': ('season', {int_or_none}),
'episode_number': ('number', {int_or_none}),
'view_count': ('listenings', {int_or_none}),
'like_count': ('reactions', lambda _, v: v['type'] == 'like', 'count', {int_or_none}, any),
'dislike_count': ('reactions', lambda _, v: v['type'] == 'dislike', 'count', {int_or_none}, any),
'age_limit': ('is_explicit', {bool}, {lambda x: 18 if x else None}),
'timestamp': ('publish_date', {parse_iso8601}),
})),
**traverse_obj(data, ('podcast', 'podcast', {
'series_id': ('id', {str}),
'series': ('title', {str}),
'channel': ('title', {str}),
'uploader': ('author', {str}),
})),
}
def _extract_thumbnail(self, webpage):
# _API_BASE_URL could be used only to download mp3 file itself, not thumbnail.
# To get link to thumbnail we should use URL resolved to specific server
# E.g. https://ru-msk-dr3-1.store.cloud.mts.ru/, which could be loaded from og tags.
trimmed_thumbnail = self._og_search_thumbnail(webpage)
# To get link to base image, not trimmed one, remove postfix with resolution.
# E.g 123456789_600.jpg -> 123456789.jpg
return re.sub(r'_\d+(?=\.)', '', trimmed_thumbnail)