diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index b060db51a3..b35aab11c0 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2360,7 +2360,11 @@ from .voicy import ( VoicyChannelIE, VoicyIE, ) -from .volejtv import VolejTVIE +from .volejtv import ( + VolejTVCategoryPlaylistIE, + VolejTVClubPlaylistIE, + VolejTVIE, +) from .voxmedia import ( VoxMediaIE, VoxMediaVolumeIE, diff --git a/yt_dlp/extractor/volejtv.py b/yt_dlp/extractor/volejtv.py index 42ef9b1286..a71d33212c 100644 --- a/yt_dlp/extractor/volejtv.py +++ b/yt_dlp/extractor/volejtv.py @@ -1,40 +1,167 @@ +import functools + from .common import InfoExtractor +from ..utils import ( + InAdvancePagedList, + int_or_none, + join_nonempty, + orderedSet, + str_or_none, + strftime_or_none, + unified_timestamp, + url_or_none, +) +from ..utils.traversal import ( + require, + traverse_obj, +) -class VolejTVIE(InfoExtractor): - _VALID_URL = r'https?://volej\.tv/video/(?P\d+)' +class VolejTVBaseIE(InfoExtractor): + TBR_HEIGHT_MAPPING = { + '6000': 1080, + '2400': 720, + '1500': 480, + '800': 360, + } + + def _call_api(self, endpoint, display_id, query=None): + return self._download_json( + f'https://api-volejtv-prod.apps.okd4.devopsie.cloud/api/{endpoint}', + display_id, query=query) + + +class VolejTVIE(VolejTVBaseIE): + IE_NAME = 'volejtv:match' + _VALID_URL = r'https?://volej\.tv/match/(?P\d+)' _TESTS = [{ - 'url': 'https://volej.tv/video/725742/', + 'url': 'https://volej.tv/match/270579', 'info_dict': { - 'id': '725742', + 'id': '270579', 'ext': 'mp4', - 'description': 'Zápas VK Královo Pole vs VK Prostějov 10.12.2022 v 19:00 na Volej.TV', - 'thumbnail': 'https://volej.tv/images/og/16/17186/og.png', - 'title': 'VK Královo Pole vs VK Prostějov', + 'title': 'SWE-CZE (2024-06-16)', + 'categories': ['ženy'], + 'series': 'ZLATÁ EVROPSKÁ VOLEJBALOVÁ LIGA', + 'season': '2023-2024', + 'timestamp': 1718553600, + 'upload_date': '20240616', }, }, { - 'url': 'https://volej.tv/video/725605/', + 'url': 'https://volej.tv/match/487520', 'info_dict': { - 'id': '725605', + 'id': '487520', 'ext': 'mp4', - 'thumbnail': 'https://volej.tv/images/og/15/17185/og.png', - 'title': 'VK Lvi Praha vs VK Euro Sitex Příbram', - 'description': 'Zápas VK Lvi Praha vs VK Euro Sitex Příbram 11.12.2022 v 19:00 na Volej.TV', + 'thumbnail': r're:https://.+\.(png|jpeg)', + 'title': 'FRA-CZE (2024-09-06)', + 'categories': ['mládež'], + 'series': 'Mistrovství Evropy do 20 let', + 'season': '2024-2025', + 'timestamp': 1725627600, + 'upload_date': '20240906', + }, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - json_data = self._search_json( - r'<\s*!\[CDATA[^=]+=', webpage, 'CDATA', video_id) - formats, subtitle = self._extract_m3u8_formats_and_subtitles( - json_data['urls']['hls'], video_id) - return { + json_data = self._call_api(f'match/{video_id}', video_id) + + formats = [] + for video in traverse_obj(json_data, ('videos', 0, 'qualities', lambda _, v: url_or_none(v['cloud_front_path']))): + formats.append(traverse_obj(video, { + 'url': 'cloud_front_path', + 'tbr': ('quality', {int_or_none}), + 'format_id': ('id', {str_or_none}), + 'height': ('quality', {self.TBR_HEIGHT_MAPPING.get}), + })) + + data = { 'id': video_id, - 'title': self._html_search_meta(['og:title', 'twitter:title'], webpage), - 'thumbnail': self._html_search_meta(['og:image', 'twitter:image'], webpage), - 'description': self._html_search_meta(['description', 'og:description', 'twitter:description'], webpage), + **traverse_obj(json_data, { + 'series': ('competition_name', {str}), + 'season': ('season', {str}), + 'timestamp': ('match_time', {unified_timestamp}), + 'categories': ('category', ('title'), {str}, filter, all, filter), + 'thumbnail': ('poster', {url_or_none}), + }), 'formats': formats, - 'subtitles': subtitle, } + + teams = orderedSet(traverse_obj(json_data, ('teams', ..., 'shortcut', {str}))) + if len(teams) > 2 and 'FIN' in teams: + teams.remove('FIN') + + data['title'] = join_nonempty( + join_nonempty(*teams, delim='-'), + strftime_or_none(data.get('timestamp'), '(%Y-%m-%d)'), + delim=' ') + + return data + + +class VolejTVPlaylistBaseIE(VolejTVBaseIE): + """Subclasses must set _API_FILTER, _PAGE_SIZE""" + + def _get_page(self, playlist_id, page): + return self._call_api( + f'match/{self._API_FILTER}/{playlist_id}', playlist_id, + query={'page': page + 1, 'take': self._PAGE_SIZE, 'order': 'DESC'}) + + def _entries(self, playlist_id, first_page_data, page): + entries = first_page_data if page == 0 else self._get_page(playlist_id, page) + for match_id in traverse_obj(entries, ('data', ..., 'id')): + yield self.url_result(f'https://volej.tv/match/{match_id}', VolejTVIE) + + +class VolejTVClubPlaylistIE(VolejTVPlaylistBaseIE): + IE_NAME = 'volejtv:club' + _VALID_URL = r'https?://volej\.tv/klub/(?P\d+)' + _TESTS = [{ + 'url': 'https://volej.tv/klub/1173', + 'info_dict': { + 'id': '1173', + 'title': 'VK Jihostroj České Budějovice', + }, + 'playlist_mincount': 30, + }] + _API_FILTER = 'by-team-id-paginated' + _PAGE_SIZE = 6 + + def _real_extract(self, url): + playlist_id = self._match_id(url) + title = self._call_api(f'team/show/{playlist_id}', playlist_id)['title'] + first_page_data = self._get_page(playlist_id, 0) + total_pages = traverse_obj(first_page_data, ('meta', 'pageCount', {int}, {require('page count')})) + return self.playlist_result(InAdvancePagedList( + functools.partial(self._entries, playlist_id, first_page_data), + total_pages, self._PAGE_SIZE), playlist_id, title) + + +class VolejTVCategoryPlaylistIE(VolejTVPlaylistBaseIE): + IE_NAME = 'volejtv:category' + _VALID_URL = r'https?://volej\.tv/kategorie/(?P[^/$?]+)' + _TESTS = [{ + 'url': 'https://volej.tv/kategorie/chance-cesky-pohar', + 'info_dict': { + 'id': 'chance-cesky-pohar', + 'title': 'Chance Český pohár', + }, + 'playlist_mincount': 30, + }] + _API_FILTER = 'by-category-id-paginated' + _PAGE_SIZE = 10 + + def _get_category(self, playlist_id): + categories = self._call_api('category', playlist_id) + for category in traverse_obj(categories, (lambda _, v: v['slug'] and v['id'] and v['title'])): + if category['slug'] == playlist_id: + return category['id'], category['title'] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + category_id, title = self._get_category(playlist_id) + first_page_data = self._get_page(category_id, 0) + total_pages = traverse_obj(first_page_data, ('meta', 'pageCount', {int}, {require('page count')})) + return self.playlist_result(InAdvancePagedList( + functools.partial(self._entries, category_id, first_page_data), + total_pages, self._PAGE_SIZE), playlist_id, title)