mirror of https://github.com/yt-dlp/yt-dlp.git synced 2025-06-27 17:08:32 +00:00

Merge remote-tracking branch 'upstream/master' into wait-retries

Paul Storkman 2025-05-14 04:11:29 +02:00
commit 03624e625b
13 changed files with 433 additions and 344 deletions

yt_dlp/extractor/_extractors.py

@@ -338,7 +338,6 @@
 from .canalplus import CanalplusIE
 from .canalsurmas import CanalsurmasIE
 from .caracoltv import CaracolTvPlayIE
-from .cartoonnetwork import CartoonNetworkIE
 from .cbc import (
     CBCIE,
     CBCGemIE,
@@ -929,7 +928,10 @@
 )
 from .jiosaavn import (
     JioSaavnAlbumIE,
+    JioSaavnArtistIE,
     JioSaavnPlaylistIE,
+    JioSaavnShowIE,
+    JioSaavnShowPlaylistIE,
     JioSaavnSongIE,
 )
 from .joj import JojIE
@@ -1964,7 +1966,6 @@
     SpreakerShowIE,
 )
 from .springboardplatform import SpringboardPlatformIE
-from .sprout import SproutIE
 from .sproutvideo import (
     SproutVideoIE,
     VidsIoIE,

yt_dlp/extractor/amcnetworks.py

@@ -1,32 +1,24 @@
-import re
-
-from .theplatform import ThePlatformIE
-from ..utils import (
-    int_or_none,
-    parse_age_limit,
-    try_get,
-    update_url_query,
-)
+from .brightcove import BrightcoveNewIE
+from .common import InfoExtractor
+from ..utils.traversal import traverse_obj


-class AMCNetworksIE(ThePlatformIE):  # XXX: Do not subclass from concrete IE
-    _VALID_URL = r'https?://(?:www\.)?(?P<site>amc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?P<id>(?:movies|shows(?:/[^/]+)+)/[^/?#&]+)'
+class AMCNetworksIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?(?:amc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?P<id>(?:movies|shows(?:/[^/?#]+)+)/[^/?#&]+)'
     _TESTS = [{
-        'url': 'https://www.bbcamerica.com/shows/the-graham-norton-show/videos/tina-feys-adorable-airline-themed-family-dinner--51631',
+        'url': 'https://www.amc.com/shows/dark-winds/videos/dark-winds-a-look-at-season-3--1072027',
         'info_dict': {
-            'id': '4Lq1dzOnZGt0',
+            'id': '6369261343112',
             'ext': 'mp4',
-            'title': "The Graham Norton Show - Season 28 - Tina Fey's Adorable Airline-Themed Family Dinner",
-            'description': "It turns out child stewardesses are very generous with the wine! All-new episodes of 'The Graham Norton Show' premiere Fridays at 11/10c on BBC America.",
-            'upload_date': '20201120',
-            'timestamp': 1605904350,
-            'uploader': 'AMCN',
+            'title': 'Dark Winds: A Look at Season 3',
+            'uploader_id': '6240731308001',
+            'duration': 176.427,
+            'thumbnail': r're:https://[^/]+\.boltdns\.net/.+/image\.jpg',
+            'tags': [],
+            'timestamp': 1740414792,
+            'upload_date': '20250224',
         },
-        'params': {
-            # m3u8 download
-            'skip_download': True,
-        },
-        'skip': '404 Not Found',
+        'params': {'skip_download': 'm3u8'},
     }, {
         'url': 'http://www.bbcamerica.com/shows/the-hunt/full-episodes/season-1/episode-01-the-hardest-challenge',
         'only_matching': True,
@@ -52,96 +44,18 @@ class AMCNetworksIE(ThePlatformIE):  # XXX: Do not subclass from concrete IE
        'url': 'https://www.sundancetv.com/shows/riviera/full-episodes/season-1/episode-01-episode-1',
        'only_matching': True,
    }]
-    _REQUESTOR_ID_MAP = {
-        'amc': 'AMC',
-        'bbcamerica': 'BBCA',
-        'ifc': 'IFC',
-        'sundancetv': 'SUNDANCE',
-        'wetv': 'WETV',
-    }

    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        initial_data = self._search_json(
+            r'window\.initialData\s*=\s*JSON\.parse\(String\.raw`', webpage, 'initial data', display_id)
+        video_id = traverse_obj(initial_data, ('initialData', 'properties', 'videoId', {str}))
+        if not video_id:  # All locked videos are now DRM-protected
+            self.report_drm(display_id)
+        account_id = initial_data['config']['brightcove']['accountId']
+        player_id = initial_data['config']['brightcove']['playerId']
+
+        return self.url_result(
+            f'https://players.brightcove.net/{account_id}/{player_id}_default/index.html?videoId={video_id}',
+            BrightcoveNewIE, video_id)
-        site, display_id = self._match_valid_url(url).groups()
-        requestor_id = self._REQUESTOR_ID_MAP[site]
-        page_data = self._download_json(
-            f'https://content-delivery-gw.svc.ds.amcn.com/api/v2/content/amcn/{requestor_id.lower()}/url/{display_id}',
-            display_id)['data']
-        properties = page_data.get('properties') or {}
-        query = {
-            'mbr': 'true',
-            'manifest': 'm3u',
-        }
-        video_player_count = 0
-        try:
-            for v in page_data['children']:
if v.get('type') == 'video-player':
release_pid = v['properties']['currentVideo']['meta']['releasePid']
tp_path = 'M_UwQC/' + release_pid
media_url = 'https://link.theplatform.com/s/' + tp_path
video_player_count += 1
except KeyError:
pass
if video_player_count > 1:
self.report_warning(
f'The JSON data has {video_player_count} video players. Only one will be extracted')
# Fall back to videoPid if releasePid not found.
# TODO: Fall back to videoPid if releasePid manifest uses DRM.
if not video_player_count:
tp_path = 'M_UwQC/media/' + properties['videoPid']
media_url = 'https://link.theplatform.com/s/' + tp_path
theplatform_metadata = self._download_theplatform_metadata(tp_path, display_id)
info = self._parse_theplatform_metadata(theplatform_metadata)
video_id = theplatform_metadata['pid']
title = theplatform_metadata['title']
rating = try_get(
theplatform_metadata, lambda x: x['ratings'][0]['rating'])
video_category = properties.get('videoCategory')
if video_category and video_category.endswith('-Auth'):
resource = self._get_mvpd_resource(
requestor_id, title, video_id, rating)
query['auth'] = self._extract_mvpd_auth(
url, video_id, requestor_id, resource)
media_url = update_url_query(media_url, query)
formats, subtitles = self._extract_theplatform_smil(
media_url, video_id)
thumbnails = []
thumbnail_urls = [properties.get('imageDesktop')]
if 'thumbnail' in info:
thumbnail_urls.append(info.pop('thumbnail'))
for thumbnail_url in thumbnail_urls:
if not thumbnail_url:
continue
mobj = re.search(r'(\d+)x(\d+)', thumbnail_url)
thumbnails.append({
'url': thumbnail_url,
'width': int(mobj.group(1)) if mobj else None,
'height': int(mobj.group(2)) if mobj else None,
})
info.update({
'age_limit': parse_age_limit(rating),
'formats': formats,
'id': video_id,
'subtitles': subtitles,
'thumbnails': thumbnails,
})
ns_keys = theplatform_metadata.get('$xmlns', {}).keys()
if ns_keys:
ns = next(iter(ns_keys))
episode = theplatform_metadata.get(ns + '$episodeTitle') or None
episode_number = int_or_none(
theplatform_metadata.get(ns + '$episode'))
season_number = int_or_none(
theplatform_metadata.get(ns + '$season'))
series = theplatform_metadata.get(ns + '$show') or None
info.update({
'episode': episode,
'episode_number': episode_number,
'season_number': season_number,
'series': series,
})
return info
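The rewrite above drops the ThePlatform/Adobe Pass flow entirely: the Brightcove account, player and video IDs are read out of the page's window.initialData blob and everything else is delegated to BrightcoveNewIE. A minimal standalone sketch of that lookup, run against a made-up page snippet (the markup and ID values here are assumptions, not taken from a real AMC page):

import json
import re

PAGE = r'''window.initialData = JSON.parse(String.raw`{"initialData":
{"properties": {"videoId": "6369261343112"}},
"config": {"brightcove": {"accountId": "6240731308001", "playerId": "AbCdEfGhI"}}}`);'''

def brightcove_embed_url(page):
    # Pull the raw JSON handed to JSON.parse(String.raw`...`) and decode it
    raw = re.search(r'window\.initialData\s*=\s*JSON\.parse\(String\.raw`(.*?)`\)', page, re.DOTALL).group(1)
    data = json.loads(raw)
    video_id = data['initialData']['properties']['videoId']  # absent for locked (DRM) videos
    bc = data['config']['brightcove']
    return f'https://players.brightcove.net/{bc["accountId"]}/{bc["playerId"]}_default/index.html?videoId={video_id}'

print(brightcove_embed_url(PAGE))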

yt_dlp/extractor/cartoonnetwork.py

@@ -1,59 +0,0 @@
from .turner import TurnerBaseIE
from ..utils import int_or_none
class CartoonNetworkIE(TurnerBaseIE):
_VALID_URL = r'https?://(?:www\.)?cartoonnetwork\.com/video/(?:[^/]+/)+(?P<id>[^/?#]+)-(?:clip|episode)\.html'
_TEST = {
'url': 'https://www.cartoonnetwork.com/video/ben-10/how-to-draw-upgrade-episode.html',
'info_dict': {
'id': '6e3375097f63874ebccec7ef677c1c3845fa850e',
'ext': 'mp4',
'title': 'How to Draw Upgrade',
'description': 'md5:2061d83776db7e8be4879684eefe8c0f',
},
'params': {
# m3u8 download
'skip_download': True,
},
}
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
def find_field(global_re, name, content_re=None, value_re='[^"]+', fatal=False):
metadata_re = ''
if content_re:
metadata_re = r'|video_metadata\.content_' + content_re
return self._search_regex(
rf'(?:_cnglobal\.currentVideo\.{global_re}{metadata_re})\s*=\s*"({value_re})";',
webpage, name, fatal=fatal)
media_id = find_field('mediaId', 'media id', 'id', '[0-9a-f]{40}', True)
title = find_field('episodeTitle', 'title', '(?:episodeName|name)', fatal=True)
info = self._extract_ngtv_info(
media_id, {'networkId': 'cartoonnetwork'}, {
'url': url,
'site_name': 'CartoonNetwork',
'auth_required': find_field('authType', 'auth type') != 'unauth',
})
series = find_field(
'propertyName', 'series', 'showName') or self._html_search_meta('partOfSeries', webpage)
info.update({
'id': media_id,
'display_id': display_id,
'title': title,
'description': self._html_search_meta('description', webpage),
'series': series,
'episode': title,
})
for field in ('season', 'episode'):
field_name = field + 'Number'
info[field + '_number'] = int_or_none(find_field(
field_name, field + ' number', value_re=r'\d+') or self._html_search_meta(field_name, webpage))
return info

yt_dlp/extractor/jiosaavn.py

@@ -1,23 +1,33 @@
 import functools
+import itertools
 import math
 import re

 from .common import InfoExtractor
 from ..utils import (
     InAdvancePagedList,
+    ISO639Utils,
+    OnDemandPagedList,
     clean_html,
     int_or_none,
+    js_to_json,
     make_archive_id,
+    orderedSet,
     smuggle_url,
+    unified_strdate,
+    unified_timestamp,
     unsmuggle_url,
     url_basename,
     url_or_none,
     urlencode_postdata,
+    urljoin,
+    variadic,
 )
 from ..utils.traversal import traverse_obj


 class JioSaavnBaseIE(InfoExtractor):
+    _URL_BASE_RE = r'https?://(?:www\.)?(?:jio)?saavn\.com'
     _API_URL = 'https://www.jiosaavn.com/api.php'
     _VALID_BITRATES = {'16', '32', '64', '128', '320'}
@@ -30,16 +40,20 @@ def requested_bitrates(self):
                f'Valid bitrates are: {", ".join(sorted(self._VALID_BITRATES, key=int))}')
        return requested_bitrates

-    def _extract_formats(self, song_data):
+    def _extract_formats(self, item_data):
+        # Show/episode JSON data has a slightly different structure than song JSON data
+        if media_url := traverse_obj(item_data, ('more_info', 'encrypted_media_url', {str})):
+            item_data.setdefault('encrypted_media_url', media_url)
+
        for bitrate in self.requested_bitrates:
            media_data = self._download_json(
-                self._API_URL, song_data['id'],
+                self._API_URL, item_data['id'],
                f'Downloading format info for {bitrate}',
                fatal=False, data=urlencode_postdata({
                    '__call': 'song.generateAuthToken',
                    '_format': 'json',
                    'bitrate': bitrate,
-                    'url': song_data['encrypted_media_url'],
+                    'url': item_data['encrypted_media_url'],
                }))
            if not traverse_obj(media_data, ('auth_url', {url_or_none})):
                self.report_warning(f'Unable to extract format info for {bitrate}')
@@ -53,24 +67,6 @@ def _extract_formats(self, song_data):
                'vcodec': 'none',
            }

-    def _extract_song(self, song_data, url=None):
-        info = traverse_obj(song_data, {
-            'id': ('id', {str}),
-            'title': ('song', {clean_html}),
-            'album': ('album', {clean_html}),
-            'thumbnail': ('image', {url_or_none}, {lambda x: re.sub(r'-\d+x\d+\.', '-500x500.', x)}),
-            'duration': ('duration', {int_or_none}),
-            'view_count': ('play_count', {int_or_none}),
-            'release_year': ('year', {int_or_none}),
-            'artists': ('primary_artists', {lambda x: x.split(', ') if x else None}),
-            'webpage_url': ('perma_url', {url_or_none}),
-        })
-
-        if webpage_url := info.get('webpage_url') or url:
-            info['display_id'] = url_basename(webpage_url)
-            info['_old_archive_ids'] = [make_archive_id(JioSaavnSongIE, info['display_id'])]
-
-        return info
-
    def _call_api(self, type_, token, note='API', params={}):
        return self._download_json(
            self._API_URL, token, f'Downloading {note} JSON', f'Unable to download {note} JSON',
@@ -84,19 +80,89 @@ def _call_api(self, type_, token, note='API', params={}):
            **params,
        })
-    def _yield_songs(self, playlist_data):
-        for song_data in traverse_obj(playlist_data, ('songs', lambda _, v: v['id'] and v['perma_url'])):
-            song_info = self._extract_song(song_data)
-            url = smuggle_url(song_info['webpage_url'], {
-                'id': song_data['id'],
-                'encrypted_media_url': song_data['encrypted_media_url'],
-            })
-            yield self.url_result(url, JioSaavnSongIE, url_transparent=True, **song_info)
+    @staticmethod
+    def _extract_song(song_data, url=None):
+        info = traverse_obj(song_data, {
+            'id': ('id', {str}),
+            'title': (('song', 'title'), {clean_html}, any),
+            'album': ((None, 'more_info'), 'album', {clean_html}, any),
+            'duration': ((None, 'more_info'), 'duration', {int_or_none}, any),
+            'channel': ((None, 'more_info'), 'label', {str}, any),
'channel_id': ((None, 'more_info'), 'label_id', {str}, any),
'channel_url': ((None, 'more_info'), 'label_url', {urljoin('https://www.jiosaavn.com/')}, any),
'release_date': ((None, 'more_info'), 'release_date', {unified_strdate}, any),
'release_year': ('year', {int_or_none}),
'thumbnail': ('image', {url_or_none}, {lambda x: re.sub(r'-\d+x\d+\.', '-500x500.', x)}),
'view_count': ('play_count', {int_or_none}),
'language': ('language', {lambda x: ISO639Utils.short2long(x.casefold()) or 'und'}),
'webpage_url': ('perma_url', {url_or_none}),
'artists': ('more_info', 'artistMap', 'primary_artists', ..., 'name', {str}, filter, all),
})
if webpage_url := info.get('webpage_url') or url:
info['display_id'] = url_basename(webpage_url)
info['_old_archive_ids'] = [make_archive_id(JioSaavnSongIE, info['display_id'])]
if primary_artists := traverse_obj(song_data, ('primary_artists', {lambda x: x.split(', ') if x else None})):
info['artists'].extend(primary_artists)
if featured_artists := traverse_obj(song_data, ('featured_artists', {str}, filter)):
info['artists'].extend(featured_artists.split(', '))
info['artists'] = orderedSet(info['artists']) or None
return info
@staticmethod
def _extract_episode(episode_data, url=None):
info = JioSaavnBaseIE._extract_song(episode_data, url)
info.pop('_old_archive_ids', None)
info.update(traverse_obj(episode_data, {
'description': ('more_info', 'description', {str}),
'timestamp': ('more_info', 'release_time', {unified_timestamp}),
'series': ('more_info', 'show_title', {str}),
'series_id': ('more_info', 'show_id', {str}),
'season': ('more_info', 'season_title', {str}),
'season_number': ('more_info', 'season_no', {int_or_none}),
'season_id': ('more_info', 'season_id', {str}),
'episode_number': ('more_info', 'episode_number', {int_or_none}),
'cast': ('starring', {lambda x: x.split(', ') if x else None}),
}))
return info
def _extract_jiosaavn_result(self, url, endpoint, response_key, parse_func):
url, smuggled_data = unsmuggle_url(url)
data = traverse_obj(smuggled_data, ({
'id': ('id', {str}),
'encrypted_media_url': ('encrypted_media_url', {str}),
}))
if 'id' in data and 'encrypted_media_url' in data:
result = {'id': data['id']}
else:
# only extract metadata if this is not a url_transparent result
data = self._call_api(endpoint, self._match_id(url))[response_key][0]
result = parse_func(data, url)
result['formats'] = list(self._extract_formats(data))
return result
def _yield_items(self, playlist_data, keys=None, parse_func=None):
"""Subclasses using this method must set _ENTRY_IE"""
if parse_func is None:
parse_func = self._extract_song
for item_data in traverse_obj(playlist_data, (
*variadic(keys, (str, bytes, dict, set)), lambda _, v: v['id'] and v['perma_url'],
)):
info = parse_func(item_data)
url = smuggle_url(info['webpage_url'], traverse_obj(item_data, {
'id': ('id', {str}),
'encrypted_media_url': ((None, 'more_info'), 'encrypted_media_url', {str}, any),
}))
yield self.url_result(url, self._ENTRY_IE, url_transparent=True, **info)
class JioSaavnSongIE(JioSaavnBaseIE):
    IE_NAME = 'jiosaavn:song'
-    _VALID_URL = r'https?://(?:www\.)?(?:jiosaavn\.com/song/[^/?#]+/|saavn\.com/s/song/(?:[^/?#]+/){3})(?P<id>[^/?#]+)'
+    _VALID_URL = JioSaavnBaseIE._URL_BASE_RE + r'(?:/song/[^/?#]+/|/s/song/(?:[^/?#]+/){3})(?P<id>[^/?#]+)'
    _TESTS = [{
        'url': 'https://www.jiosaavn.com/song/leja-re/OQsEfQFVUXk',
        'md5': '3b84396d15ed9e083c3106f1fa589c04',
@@ -106,12 +172,38 @@ class JioSaavnSongIE(JioSaavnBaseIE):
            'ext': 'm4a',
            'title': 'Leja Re',
            'album': 'Leja Re',
-            'thumbnail': r're:https?://c.saavncdn.com/258/Leja-Re-Hindi-2018-20181124024539-500x500.jpg',
+            'thumbnail': r're:https?://.+/.+\.jpg',
            'duration': 205,
            'view_count': int,
            'release_year': 2018,
            'artists': ['Sandesh Shandilya', 'Dhvani Bhanushali', 'Tanishk Bagchi'],
            '_old_archive_ids': ['jiosaavnsong OQsEfQFVUXk'],
'channel': 'T-Series',
'language': 'hin',
'channel_id': '34297',
'channel_url': 'https://www.jiosaavn.com/label/t-series-albums/6DLuXO3VoTo_',
'release_date': '20181124',
},
}, {
'url': 'https://www.jiosaavn.com/song/chuttamalle/P1FfWjZkQ0Q',
'md5': '96296c58d6ce488a417ef0728fd2d680',
'info_dict': {
'id': 'O94kBTtw',
'display_id': 'P1FfWjZkQ0Q',
'ext': 'm4a',
'title': 'Chuttamalle',
'album': 'Devara Part 1 - Telugu',
'thumbnail': r're:https?://.+/.+\.jpg',
'duration': 222,
'view_count': int,
'release_year': 2024,
'artists': 'count:3',
'_old_archive_ids': ['jiosaavnsong P1FfWjZkQ0Q'],
'channel': 'T-Series',
'language': 'tel',
'channel_id': '34297',
'channel_url': 'https://www.jiosaavn.com/label/t-series-albums/6DLuXO3VoTo_',
'release_date': '20240926',
        },
    }, {
        'url': 'https://www.saavn.com/s/song/hindi/Saathiya/O-Humdum-Suniyo-Re/KAMiazoCblU',
@@ -119,26 +211,51 @@ class JioSaavnSongIE(JioSaavnBaseIE):
    }]

    def _real_extract(self, url):
-        url, smuggled_data = unsmuggle_url(url)
-        song_data = traverse_obj(smuggled_data, ({
-            'id': ('id', {str}),
-            'encrypted_media_url': ('encrypted_media_url', {str}),
-        }))
-
-        if 'id' in song_data and 'encrypted_media_url' in song_data:
-            result = {'id': song_data['id']}
-        else:
-            # only extract metadata if this is not a url_transparent result
-            song_data = self._call_api('song', self._match_id(url))['songs'][0]
-            result = self._extract_song(song_data, url)
-
-        result['formats'] = list(self._extract_formats(song_data))
-        return result
+        return self._extract_jiosaavn_result(url, 'song', 'songs', self._extract_song)


+class JioSaavnShowIE(JioSaavnBaseIE):
+    IE_NAME = 'jiosaavn:show'
_VALID_URL = JioSaavnBaseIE._URL_BASE_RE + r'/shows/[^/?#]+/(?P<id>[^/?#]{11,})/?(?:$|[?#])'
_TESTS = [{
'url': 'https://www.jiosaavn.com/shows/non-food-ways-to-boost-your-energy/XFMcKICOCgc_',
'md5': '0733cd254cfe74ef88bea1eaedcf1f4f',
'info_dict': {
'id': 'qqzh3RKZ',
'display_id': 'XFMcKICOCgc_',
'ext': 'mp3',
'title': 'Non-Food Ways To Boost Your Energy',
'description': 'md5:26e7129644b5c6aada32b8851c3997c8',
'episode': 'Episode 1',
'timestamp': 1640563200,
'series': 'Holistic Lifestyle With Neha Ranglani',
'series_id': '52397',
'season': 'Holistic Lifestyle With Neha Ranglani',
'season_number': 1,
'season_id': '61273',
'thumbnail': r're:https?://.+/.+\.jpg',
'duration': 311,
'view_count': int,
'release_year': 2021,
'language': 'eng',
'channel': 'Saavn OG',
'channel_id': '1953876',
'episode_number': 1,
'upload_date': '20211227',
'release_date': '20211227',
},
}, {
'url': 'https://www.jiosaavn.com/shows/himesh-reshammiya/Kr8fmfSN4vo_',
'only_matching': True,
}]
def _real_extract(self, url):
return self._extract_jiosaavn_result(url, 'episode', 'episodes', self._extract_episode)
class JioSaavnAlbumIE(JioSaavnBaseIE):
    IE_NAME = 'jiosaavn:album'
-    _VALID_URL = r'https?://(?:www\.)?(?:jio)?saavn\.com/album/[^/?#]+/(?P<id>[^/?#]+)'
+    _VALID_URL = JioSaavnBaseIE._URL_BASE_RE + r'/album/[^/?#]+/(?P<id>[^/?#]+)'
    _TESTS = [{
        'url': 'https://www.jiosaavn.com/album/96/buIOjYZDrNA_',
        'info_dict': {
@@ -147,18 +264,19 @@ class JioSaavnAlbumIE(JioSaavnBaseIE):
        },
        'playlist_count': 10,
    }]
+    _ENTRY_IE = JioSaavnSongIE

    def _real_extract(self, url):
        display_id = self._match_id(url)
        album_data = self._call_api('album', display_id)
        return self.playlist_result(
-            self._yield_songs(album_data), display_id, traverse_obj(album_data, ('title', {str})))
+            self._yield_items(album_data, 'songs'), display_id, traverse_obj(album_data, ('title', {str})))


class JioSaavnPlaylistIE(JioSaavnBaseIE):
    IE_NAME = 'jiosaavn:playlist'
-    _VALID_URL = r'https?://(?:www\.)?(?:jio)?saavn\.com/(?:s/playlist/(?:[^/?#]+/){2}|featured/[^/?#]+/)(?P<id>[^/?#]+)'
+    _VALID_URL = JioSaavnBaseIE._URL_BASE_RE + r'/(?:s/playlist/(?:[^/?#]+/){2}|featured/[^/?#]+/)(?P<id>[^/?#]+)'
    _TESTS = [{
        'url': 'https://www.jiosaavn.com/s/playlist/2279fbe391defa793ad7076929a2f5c9/mood-english/LlJ8ZWT1ibN5084vKHRj2Q__',
        'info_dict': {
@@ -172,15 +290,16 @@ class JioSaavnPlaylistIE(JioSaavnBaseIE):
            'id': 'DVR,pFUOwyXqIp77B1JF,A__',
            'title': 'Mood Hindi',
        },
-        'playlist_mincount': 801,
+        'playlist_mincount': 750,
    }, {
        'url': 'https://www.jiosaavn.com/featured/taaza-tunes/Me5RridRfDk_',
        'info_dict': {
            'id': 'Me5RridRfDk_',
            'title': 'Taaza Tunes',
        },
-        'playlist_mincount': 301,
+        'playlist_mincount': 50,
    }]
+    _ENTRY_IE = JioSaavnSongIE
    _PAGE_SIZE = 50

    def _fetch_page(self, token, page):
@@ -189,7 +308,7 @@ def _fetch_page(self, token, page):

    def _entries(self, token, first_page_data, page):
        page_data = first_page_data if not page else self._fetch_page(token, page + 1)
-        yield from self._yield_songs(page_data)
+        yield from self._yield_items(page_data, 'songs')

    def _real_extract(self, url):
        display_id = self._match_id(url)
@@ -199,3 +318,95 @@ def _real_extract(self, url):
        return self.playlist_result(InAdvancePagedList(
            functools.partial(self._entries, display_id, playlist_data),
            total_pages, self._PAGE_SIZE), display_id, traverse_obj(playlist_data, ('listname', {str})))
class JioSaavnShowPlaylistIE(JioSaavnBaseIE):
IE_NAME = 'jiosaavn:show:playlist'
_VALID_URL = JioSaavnBaseIE._URL_BASE_RE + r'/shows/(?P<show>[^#/?]+)/(?P<season>\d+)/[^/?#]+'
_TESTS = [{
'url': 'https://www.jiosaavn.com/shows/talking-music/1/PjReFP-Sguk_',
'info_dict': {
'id': 'talking-music-1',
'title': 'Talking Music',
},
'playlist_mincount': 11,
}]
_ENTRY_IE = JioSaavnShowIE
_PAGE_SIZE = 10
def _fetch_page(self, show_id, season_id, page):
return self._call_api('show', show_id, f'show page {page}', {
'p': page,
'__call': 'show.getAllEpisodes',
'show_id': show_id,
'season_number': season_id,
'api_version': '4',
'sort_order': 'desc',
})
def _entries(self, show_id, season_id, page):
page_data = self._fetch_page(show_id, season_id, page + 1)
yield from self._yield_items(page_data, keys=None, parse_func=self._extract_episode)
def _real_extract(self, url):
show_slug, season_id = self._match_valid_url(url).group('show', 'season')
playlist_id = f'{show_slug}-{season_id}'
webpage = self._download_webpage(url, playlist_id)
show_info = self._search_json(
r'window\.__INITIAL_DATA__\s*=', webpage, 'initial data',
playlist_id, transform_source=js_to_json)['showView']
show_id = show_info['current_id']
entries = OnDemandPagedList(functools.partial(self._entries, show_id, season_id), self._PAGE_SIZE)
return self.playlist_result(
entries, playlist_id, traverse_obj(show_info, ('show', 'title', 'text', {str})))
class JioSaavnArtistIE(JioSaavnBaseIE):
IE_NAME = 'jiosaavn:artist'
_VALID_URL = JioSaavnBaseIE._URL_BASE_RE + r'/artist/[^/?#]+/(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'https://www.jiosaavn.com/artist/krsna-songs/rYLBEve2z3U_',
'info_dict': {
'id': 'rYLBEve2z3U_',
'title': 'KR$NA',
},
'playlist_mincount': 38,
}, {
'url': 'https://www.jiosaavn.com/artist/sanam-puri-songs/SkNEv3qRhDE_',
'info_dict': {
'id': 'SkNEv3qRhDE_',
'title': 'Sanam Puri',
},
'playlist_mincount': 51,
}]
_ENTRY_IE = JioSaavnSongIE
_PAGE_SIZE = 50
def _fetch_page(self, artist_id, page):
return self._call_api('artist', artist_id, f'artist page {page + 1}', {
'p': page,
'n_song': self._PAGE_SIZE,
'n_album': self._PAGE_SIZE,
'sub_type': '',
'includeMetaTags': '',
'api_version': '4',
'category': 'alphabetical',
'sort_order': 'asc',
})
def _entries(self, artist_id, first_page):
for page in itertools.count():
playlist_data = first_page if not page else self._fetch_page(artist_id, page)
if not traverse_obj(playlist_data, ('topSongs', ..., {dict})):
break
yield from self._yield_items(playlist_data, 'topSongs')
def _real_extract(self, url):
artist_id = self._match_id(url)
first_page = self._fetch_page(artist_id, 0)
return self.playlist_result(
self._entries(artist_id, first_page), artist_id,
traverse_obj(first_page, ('name', {str})))
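The base-class comment ("Show/episode JSON data has a slightly different structure than song JSON data") is the whole reason for the more_info handling added at the top of _extract_formats(). An illustrative sketch of the two shapes being normalized — the values are invented, only the key layout comes from the diff:

song_data = {
    'id': 'OQsEfQFVUXk',
    'encrypted_media_url': 'OPAQUE-TOKEN',  # songs: top-level key
}
episode_data = {
    'id': 'qqzh3RKZ',
    'more_info': {'encrypted_media_url': 'OPAQUE-TOKEN'},  # shows/episodes: nested under more_info
}

def normalize(item_data):
    # Same idea as the diff: copy the nested URL up so the bitrate loop can
    # always read item_data['encrypted_media_url'], whatever the item type.
    nested = item_data.get('more_info', {}).get('encrypted_media_url')
    if nested:
        item_data.setdefault('encrypted_media_url', nested)
    return item_data

assert normalize(episode_data)['encrypted_media_url'] == 'OPAQUE-TOKEN'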

yt_dlp/extractor/niconico.py

@@ -16,6 +16,7 @@
     determine_ext,
     float_or_none,
     int_or_none,
+    parse_bitrate,
     parse_duration,
     parse_iso8601,
     parse_qs,
@@ -23,7 +24,6 @@
     qualities,
     remove_start,
     str_or_none,
-    try_get,
     unescapeHTML,
     unified_timestamp,
     update_url_query,
@@ -785,8 +785,6 @@ class NiconicoLiveIE(NiconicoBaseIE):
        'only_matching': True,
    }]

-    _KNOWN_LATENCY = ('high', 'low')
-
    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage, urlh = self._download_webpage_handle(f'https://live.nicovideo.jp/watch/{video_id}', video_id)
@@ -802,22 +800,19 @@ def _real_extract(self, url):
        })

        hostname = remove_start(urllib.parse.urlparse(urlh.url).hostname, 'sp.')
-        latency = try_get(self._configuration_arg('latency'), lambda x: x[0])
-        if latency not in self._KNOWN_LATENCY:
-            latency = 'high'

        ws = self._request_webpage(
            Request(ws_url, headers={'Origin': f'https://{hostname}'}),
            video_id=video_id, note='Connecting to WebSocket server')

-        self.write_debug('[debug] Sending HLS server request')
+        self.write_debug('Sending HLS server request')
        ws.send(json.dumps({
            'type': 'startWatching',
            'data': {
                'stream': {
                    'quality': 'abr',
-                    'protocol': 'hls+fmp4',
-                    'latency': latency,
+                    'protocol': 'hls',
+                    'latency': 'high',
                    'accessRightMethod': 'single_cookie',
                    'chasePlay': False,
                },
@@ -881,18 +876,29 @@ def _real_extract(self, url):
        for cookie in cookies:
            self._set_cookie(
                cookie['domain'], cookie['name'], cookie['value'],
-                expire_time=unified_timestamp(cookie['expires']), path=cookie['path'], secure=cookie['secure'])
+                expire_time=unified_timestamp(cookie.get('expires')), path=cookie['path'], secure=cookie['secure'])
+
+        fmt_common = {
+            'live_latency': 'high',
+            'origin': hostname,
+            'protocol': 'niconico_live',
+            'video_id': video_id,
+            'ws': ws,
+        }
+        q_iter = (q for q in qualities[1:] if not q.startswith('audio_'))  # ignore initial 'abr'
+        a_map = {96: 'audio_low', 192: 'audio_high'}

        formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True)
-        for fmt, q in zip(formats, reversed(qualities[1:])):
-            fmt.update({
-                'format_id': q,
-                'protocol': 'niconico_live',
-                'ws': ws,
-                'video_id': video_id,
-                'live_latency': latency,
-                'origin': hostname,
-            })
+        for fmt in formats:
+            if fmt.get('acodec') == 'none':
+                fmt['format_id'] = next(q_iter, fmt['format_id'])
+            elif fmt.get('vcodec') == 'none':
+                abr = parse_bitrate(fmt['url'].lower())
+                fmt.update({
+                    'abr': abr,
+                    'format_id': a_map.get(abr, fmt['format_id']),
+                })
+            fmt.update(fmt_common)

        return {
            'id': video_id,

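The new NiconicoLive format loop stops relying on a positional zip() against the quality list: video-only HLS variants consume the next non-audio quality name, while audio-only variants are renamed from the bitrate embedded in their URL. A rough standalone rendering with made-up format dicts, URLs and quality names (parse_bitrate is the real helper from yt_dlp.utils):

from yt_dlp.utils import parse_bitrate

qualities = ['abr', 'super_high', 'high', 'normal']  # invented quality names
formats = [
    {'format_id': 'hls-0', 'acodec': 'none', 'vcodec': 'avc1', 'url': 'https://example.invalid/video/1.m3u8'},
    {'format_id': 'hls-1', 'acodec': 'none', 'vcodec': 'avc1', 'url': 'https://example.invalid/video/2.m3u8'},
    {'format_id': 'hls-2', 'vcodec': 'none', 'acodec': 'aac', 'url': 'https://example.invalid/audio/192kbps/a.m3u8'},
]

q_iter = (q for q in qualities[1:] if not q.startswith('audio_'))  # ignore initial 'abr'
a_map = {96: 'audio_low', 192: 'audio_high'}
for fmt in formats:
    if fmt.get('acodec') == 'none':        # video-only: take the next quality name
        fmt['format_id'] = next(q_iter, fmt['format_id'])
    elif fmt.get('vcodec') == 'none':      # audio-only: name it after its parsed bitrate
        abr = parse_bitrate(fmt['url'].lower())
        fmt.update({'abr': abr, 'format_id': a_map.get(abr, fmt['format_id'])})

print([f['format_id'] for f in formats])   # ['super_high', 'high', 'audio_high']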
yt_dlp/extractor/nytimes.py

@@ -181,6 +181,7 @@ class NYTimesArticleIE(NYTimesBaseIE):
'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg', 'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
'duration': 119.0, 'duration': 119.0,
}, },
'skip': 'HTTP Error 500: Internal Server Error',
}, { }, {
# article with audio and no video # article with audio and no video
'url': 'https://www.nytimes.com/2023/09/29/health/mosquitoes-genetic-engineering.html', 'url': 'https://www.nytimes.com/2023/09/29/health/mosquitoes-genetic-engineering.html',
@@ -190,13 +191,14 @@ class NYTimesArticleIE(NYTimesBaseIE):
'ext': 'mp3', 'ext': 'mp3',
'title': 'The Gamble: Can Genetically Modified Mosquitoes End Disease?', 'title': 'The Gamble: Can Genetically Modified Mosquitoes End Disease?',
'description': 'md5:9ff8b47acbaf7f3ca8c732f5c815be2e', 'description': 'md5:9ff8b47acbaf7f3ca8c732f5c815be2e',
-            'timestamp': 1695960700,
+            'timestamp': 1696008129,
'upload_date': '20230929', 'upload_date': '20230929',
-            'creator': 'Stephanie Nolen, Natalija Gormalova',
+            'creators': ['Stephanie Nolen', 'Natalija Gormalova'],
'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg', 'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
'duration': 1322, 'duration': 1322,
}, },
}, { }, {
# lede_media_block already has sourceId
'url': 'https://www.nytimes.com/2023/11/29/business/dealbook/kamala-harris-biden-voters.html', 'url': 'https://www.nytimes.com/2023/11/29/business/dealbook/kamala-harris-biden-voters.html',
'md5': '3eb5ddb1d6f86254fe4f233826778737', 'md5': '3eb5ddb1d6f86254fe4f233826778737',
'info_dict': { 'info_dict': {
@@ -207,7 +209,7 @@ class NYTimesArticleIE(NYTimesBaseIE):
'timestamp': 1701290997, 'timestamp': 1701290997,
'upload_date': '20231129', 'upload_date': '20231129',
'uploader': 'By The New York Times', 'uploader': 'By The New York Times',
-            'creator': 'Katie Rogers',
+            'creators': ['Katie Rogers'],
'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg', 'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
'duration': 97.631, 'duration': 97.631,
}, },
@@ -222,10 +224,22 @@ class NYTimesArticleIE(NYTimesBaseIE):
'title': 'Drunk and Asleep on the Job: Air Traffic Controllers Pushed to the Brink', 'title': 'Drunk and Asleep on the Job: Air Traffic Controllers Pushed to the Brink',
'description': 'md5:549e5a5e935bf7d048be53ba3d2c863d', 'description': 'md5:549e5a5e935bf7d048be53ba3d2c863d',
'upload_date': '20231202', 'upload_date': '20231202',
-            'creator': 'Emily Steel, Sydney Ember',
+            'creators': ['Emily Steel', 'Sydney Ember'],
'timestamp': 1701511264, 'timestamp': 1701511264,
}, },
'playlist_count': 3, 'playlist_count': 3,
}, {
# lede_media_block does not have sourceId
'url': 'https://www.nytimes.com/2025/04/30/well/move/hip-mobility-routine.html',
'info_dict': {
'id': 'hip-mobility-routine',
'title': 'Tight Hips? These Moves Can Help.',
'description': 'Sitting all day is hard on your hips. Try this simple routine for better mobility.',
'creators': ['Alyssa Ages', 'Theodore Tae'],
'timestamp': 1746003629,
'upload_date': '20250430',
},
'playlist_count': 7,
}, { }, {
'url': 'https://www.nytimes.com/2023/12/02/business/media/netflix-squid-game-challenge.html', 'url': 'https://www.nytimes.com/2023/12/02/business/media/netflix-squid-game-challenge.html',
'only_matching': True, 'only_matching': True,
@@ -256,14 +270,18 @@ def _extract_content_from_block(self, block):

    def _real_extract(self, url):
        page_id = self._match_id(url)
-        webpage = self._download_webpage(url, page_id)
+        webpage = self._download_webpage(url, page_id, impersonate=True)
        art_json = self._search_json(
            r'window\.__preloadedData\s*=', webpage, 'media details', page_id,
            transform_source=lambda x: x.replace('undefined', 'null'))['initialData']['data']['article']
+        content = art_json['sprinkledBody']['content']

-        blocks = traverse_obj(art_json, (
-            'sprinkledBody', 'content', ..., ('ledeMedia', None),
-            lambda _, v: v['__typename'] in ('Video', 'Audio')))
+        blocks = []
+        block_filter = lambda k, v: k == 'media' and v['__typename'] in ('Video', 'Audio')
+        if lede_media_block := traverse_obj(content, (..., 'ledeMedia', block_filter, any)):
+            lede_media_block.setdefault('sourceId', art_json.get('sourceId'))
+            blocks.append(lede_media_block)
+        blocks.extend(traverse_obj(content, (..., block_filter)))
        if not blocks:
            raise ExtractorError('Unable to extract any media blocks from webpage')

@@ -273,8 +291,7 @@ def _real_extract(self, url):
                'sprinkledBody', 'content', ..., 'summary', 'content', ..., 'text', {str}),
                get_all=False) or self._html_search_meta(['og:description', 'twitter:description'], webpage),
            'timestamp': traverse_obj(art_json, ('firstPublished', {parse_iso8601})),
-            'creator': ', '.join(
-                traverse_obj(art_json, ('bylines', ..., 'creators', ..., 'displayName'))),  # TODO: change to 'creators' (list)
+            'creators': traverse_obj(art_json, ('bylines', ..., 'creators', ..., 'displayName', {str})),
            'thumbnails': self._extract_thumbnails(traverse_obj(
                art_json, ('promotionalMedia', 'assetCrops', ..., 'renditions', ...))),
        }

yt_dlp/extractor/playsuisse.py

@@ -7,11 +7,13 @@
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
int_or_none, int_or_none,
join_nonempty,
parse_qs, parse_qs,
traverse_obj, traverse_obj,
update_url_query, update_url_query,
urlencode_postdata, urlencode_postdata,
) )
from ..utils.traversal import unpack
class PlaySuisseIE(InfoExtractor): class PlaySuisseIE(InfoExtractor):
@@ -26,12 +28,12 @@ class PlaySuisseIE(InfoExtractor):
{ {
# episode in a series # episode in a series
'url': 'https://www.playsuisse.ch/watch/763182?episodeId=763211', 'url': 'https://www.playsuisse.ch/watch/763182?episodeId=763211',
-            'md5': '82df2a470b2dfa60c2d33772a8a60cf8',
+            'md5': 'e20d1ede6872a03b41905ca1060a1ef2',
'info_dict': { 'info_dict': {
'id': '763211', 'id': '763211',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Knochen', 'title': 'Knochen',
-                'description': 'md5:8ea7a8076ba000cd9e8bc132fd0afdd8',
+                'description': 'md5:3bdd80e2ce20227c47aab1df2a79a519',
'duration': 3344, 'duration': 3344,
'series': 'Wilder', 'series': 'Wilder',
'season': 'Season 1', 'season': 'Season 1',
@@ -42,24 +44,33 @@ class PlaySuisseIE(InfoExtractor):
            },
        }, {
            # film
-            'url': 'https://www.playsuisse.ch/watch/808675',
-            'md5': '818b94c1d2d7c4beef953f12cb8f3e75',
+            'url': 'https://www.playsuisse.ch/detail/2573198',
+            'md5': '1f115bb0a5191477b1a5771643a4283d',
            'info_dict': {
-                'id': '808675',
+                'id': '2573198',
                'ext': 'mp4',
-                'title': 'Der Läufer',
-                'description': 'md5:9f61265c7e6dcc3e046137a792b275fd',
-                'duration': 5280,
+                'title': 'Azor',
+                'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+                'genres': ['Fiction'],
+                'creators': ['Andreas Fontana'],
+                'cast': ['Fabrizio Rongione', 'Stéphanie Cléau', 'Gilles Privat', 'Alexandre Trocki'],
+                'location': 'France; Argentine',
+                'release_year': 2021,
+                'duration': 5981,
                'thumbnail': 're:https://playsuisse-img.akamaized.net/',
            },
        }, {
            # series (treated as a playlist)
            'url': 'https://www.playsuisse.ch/detail/1115687',
            'info_dict': {
-                'description': 'md5:e4a2ae29a8895823045b5c3145a02aa3',
                'id': '1115687',
                'series': 'They all came out to Montreux',
                'title': 'They all came out to Montreux',
+                'description': 'md5:0fefd8c5b4468a0bb35e916887681520',
+                'genres': ['Documentary'],
+                'creators': ['Oliver Murray'],
+                'location': 'Switzerland',
+                'release_year': 2021,
            },
            'playlist': [{
                'info_dict': {
@@ -120,6 +131,12 @@ class PlaySuisseIE(InfoExtractor):
            id
            name
            description
+            descriptionLong
+            year
+            contentTypes
+            directors
+            mainCast
+            productionCountries
            duration
            episodeNumber
            seasonNumber
@@ -215,9 +232,7 @@ def _perform_login(self, username, password):
if not self._ID_TOKEN: if not self._ID_TOKEN:
raise ExtractorError('Login failed') raise ExtractorError('Login failed')
-    def _get_media_data(self, media_id):
+    def _get_media_data(self, media_id, locale=None):
# NOTE In the web app, the "locale" header is used to switch between languages,
# However this doesn't seem to take effect when passing the header here.
response = self._download_json( response = self._download_json(
'https://www.playsuisse.ch/api/graphql', 'https://www.playsuisse.ch/api/graphql',
media_id, data=json.dumps({ media_id, data=json.dumps({
@@ -225,7 +240,7 @@ def _get_media_data(self, media_id):
'query': self._GRAPHQL_QUERY, 'query': self._GRAPHQL_QUERY,
'variables': {'assetId': media_id}, 'variables': {'assetId': media_id},
}).encode(), }).encode(),
-            headers={'Content-Type': 'application/json', 'locale': 'de'})
+            headers={'Content-Type': 'application/json', 'locale': locale or 'de'})
return response['data']['assetV2'] return response['data']['assetV2']
@@ -234,7 +249,7 @@ def _real_extract(self, url):
self.raise_login_required(method='password') self.raise_login_required(method='password')
media_id = self._match_id(url) media_id = self._match_id(url)
-        media_data = self._get_media_data(media_id)
+        media_data = self._get_media_data(media_id, traverse_obj(parse_qs(url), ('locale', 0)))
info = self._extract_single(media_data) info = self._extract_single(media_data)
if media_data.get('episodes'): if media_data.get('episodes'):
info.update({ info.update({
@@ -257,15 +272,22 @@ def _extract_single(self, media_data):
            self._merge_subtitles(subs, target=subtitles)

        return {
-            'id': media_data['id'],
-            'title': media_data.get('name'),
-            'description': media_data.get('description'),
            'thumbnails': thumbnails,
-            'duration': int_or_none(media_data.get('duration')),
            'formats': formats,
            'subtitles': subtitles,
-            'series': media_data.get('seriesName'),
-            'season_number': int_or_none(media_data.get('seasonNumber')),
-            'episode': media_data.get('name') if media_data.get('episodeNumber') else None,
-            'episode_number': int_or_none(media_data.get('episodeNumber')),
+            **traverse_obj(media_data, {
+                'id': ('id', {str}),
+                'title': ('name', {str}),
+                'description': (('descriptionLong', 'description'), {str}, any),
+                'genres': ('contentTypes', ..., {str}),
+                'creators': ('directors', ..., {str}),
+                'cast': ('mainCast', ..., {str}),
+                'location': ('productionCountries', ..., {str}, all, {unpack(join_nonempty, delim='; ')}, filter),
+                'release_year': ('year', {str}, {lambda x: x[:4]}, {int_or_none}),
+                'duration': ('duration', {int_or_none}),
+                'series': ('seriesName', {str}),
+                'season_number': ('seasonNumber', {int_or_none}),
+                'episode': ('name', {str}, {lambda x: x if media_data['episodeNumber'] is not None else None}),
+                'episode_number': ('episodeNumber', {int_or_none}),
+            }),
        }
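_get_media_data() now honours a locale: the extractor forwards an optional ?locale= query parameter from the page URL as the GraphQL 'locale' header, still defaulting to 'de'. A minimal illustration of that passthrough, with the standard library standing in for yt-dlp's parse_qs/traverse_obj helpers:

from urllib.parse import parse_qs, urlparse

def graphql_headers(url):
    # Take ?locale= from the watch URL if present, otherwise keep the old 'de' default
    locale = (parse_qs(urlparse(url).query).get('locale') or ['de'])[0]
    return {'Content-Type': 'application/json', 'locale': locale}

print(graphql_headers('https://www.playsuisse.ch/watch/763182?episodeId=763211&locale=fr'))
# {'Content-Type': 'application/json', 'locale': 'fr'}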

yt_dlp/extractor/sprout.py

@@ -1,61 +0,0 @@
from .adobepass import AdobePassIE
from ..utils import (
int_or_none,
smuggle_url,
update_url_query,
)
class SproutIE(AdobePassIE):
_VALID_URL = r'https?://(?:www\.)?(?:sproutonline|universalkids)\.com/(?:watch|(?:[^/]+/)*videos)/(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'https://www.universalkids.com/shows/remy-and-boo/season/1/videos/robot-bike-race',
'info_dict': {
'id': 'bm0foJFaTKqb',
'ext': 'mp4',
'title': 'Robot Bike Race',
'description': 'md5:436b1d97117cc437f54c383f4debc66d',
'timestamp': 1606148940,
'upload_date': '20201123',
'uploader': 'NBCU-MPAT',
},
'params': {
'skip_download': True,
},
}, {
'url': 'http://www.sproutonline.com/watch/cowboy-adventure',
'only_matching': True,
}, {
'url': 'https://www.universalkids.com/watch/robot-bike-race',
'only_matching': True,
}]
_GEO_COUNTRIES = ['US']
def _real_extract(self, url):
display_id = self._match_id(url)
mpx_metadata = self._download_json(
# http://nbcuunikidsprod.apps.nbcuni.com/networks/universalkids/content/videos/
'https://www.universalkids.com/_api/videos/' + display_id,
display_id)['mpxMetadata']
media_pid = mpx_metadata['mediaPid']
theplatform_url = 'https://link.theplatform.com/s/HNK2IC/' + media_pid
query = {
'mbr': 'true',
'manifest': 'm3u',
}
if mpx_metadata.get('entitlement') == 'auth':
query['auth'] = self._extract_mvpd_auth(url, media_pid, 'sprout', 'sprout')
theplatform_url = smuggle_url(
update_url_query(theplatform_url, query), {
'force_smil_url': True,
'geo_countries': self._GEO_COUNTRIES,
})
return {
'_type': 'url_transparent',
'id': media_pid,
'url': theplatform_url,
'series': mpx_metadata.get('seriesName'),
'season_number': int_or_none(mpx_metadata.get('seasonNumber')),
'episode_number': int_or_none(mpx_metadata.get('episodeNumber')),
'ie_key': 'ThePlatform',
}

yt_dlp/extractor/svt.py

@@ -471,8 +471,7 @@ def _real_extract(self, url):
        webpage = self._download_webpage(url, display_id)
        title = self._og_search_title(webpage)

-        urql_state = self._search_json(
-            r'window\.svt\.(?:nyh\.)?urqlState\s*=', webpage, 'json data', display_id)
+        urql_state = self._search_json(r'urqlState\s*[=:]', webpage, 'json data', display_id)

        data = traverse_obj(urql_state, (..., 'data', {str}, {json.loads}), get_all=False) or {}

yt_dlp/extractor/wat.py

@@ -2,9 +2,11 @@
 from ..utils import (
     ExtractorError,
     int_or_none,
+    join_nonempty,
     try_get,
     unified_strdate,
 )
+from ..utils.traversal import traverse_obj


 class WatIE(InfoExtractor):
@@ -70,8 +72,14 @@ def _real_extract(self, url):
        error_desc = video_info.get('error_desc')
        if error_desc:
-            if video_info.get('error_code') == 'GEOBLOCKED':
+            error_code = video_info.get('error_code')
+            if error_code == 'GEOBLOCKED':
                self.raise_geo_restricted(error_desc, video_info.get('geoList'))
+            elif error_code == 'DELIVERY_ERROR':
+                if traverse_obj(video_data, ('delivery', 'code')) == 500:
+                    self.report_drm(video_id)
+                error_desc = join_nonempty(
+                    error_desc, traverse_obj(video_data, ('delivery', 'error', {str})), delim=': ')
            raise ExtractorError(error_desc, expected=True)

        title = video_info['title']

View File

@ -37,6 +37,7 @@ class YoutubeClipIE(YoutubeTabBaseInfoExtractor):
'chapters': 'count:20', 'chapters': 'count:20',
'comment_count': int, 'comment_count': int,
'heatmap': 'count:100', 'heatmap': 'count:100',
'media_type': 'clip',
}, },
}] }]
@ -59,6 +60,7 @@ def _real_extract(self, url):
'url': f'https://www.youtube.com/watch?v={video_id}', 'url': f'https://www.youtube.com/watch?v={video_id}',
'ie_key': YoutubeIE.ie_key(), 'ie_key': YoutubeIE.ie_key(),
'id': clip_id, 'id': clip_id,
'media_type': 'clip',
'section_start': int(clip_data['startTimeMs']) / 1000, 'section_start': int(clip_data['startTimeMs']) / 1000,
'section_end': int(clip_data['endTimeMs']) / 1000, 'section_end': int(clip_data['endTimeMs']) / 1000,
'_format_sort_fields': ( # https protocol is prioritized for ffmpeg compatibility '_format_sort_fields': ( # https protocol is prioritized for ffmpeg compatibility

View File

@ -35,6 +35,7 @@ class YoutubeYtBeIE(YoutubeBaseInfoExtractor):
'duration': 59, 'duration': 59,
'comment_count': int, 'comment_count': int,
'channel_follower_count': int, 'channel_follower_count': int,
'media_type': 'short',
}, },
'params': { 'params': {
'noplaylist': True, 'noplaylist': True,

View File

@ -376,6 +376,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'Afrojack', 'uploader': 'Afrojack',
'uploader_url': 'https://www.youtube.com/@Afrojack', 'uploader_url': 'https://www.youtube.com/@Afrojack',
'uploader_id': '@Afrojack', 'uploader_id': '@Afrojack',
'media_type': 'video',
}, },
'params': { 'params': {
'youtube_include_dash_manifest': True, 'youtube_include_dash_manifest': True,
@ -413,10 +414,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'channel_is_verified': True, 'channel_is_verified': True,
'heatmap': 'count:100', 'heatmap': 'count:100',
'timestamp': 1401991663, 'timestamp': 1401991663,
'media_type': 'video',
}, },
}, },
{ {
-            'note': 'Age-gate video with embed allowed in public site',
+            'note': 'Formerly an age-gate video with embed allowed in public site',
'url': 'https://youtube.com/watch?v=HsUATh_Nc2U', 'url': 'https://youtube.com/watch?v=HsUATh_Nc2U',
'info_dict': { 'info_dict': {
'id': 'HsUATh_Nc2U', 'id': 'HsUATh_Nc2U',
@ -424,8 +426,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'title': 'Godzilla 2 (Official Video)', 'title': 'Godzilla 2 (Official Video)',
'description': 'md5:bf77e03fcae5529475e500129b05668a', 'description': 'md5:bf77e03fcae5529475e500129b05668a',
'upload_date': '20200408', 'upload_date': '20200408',
-                'age_limit': 18,
-                'availability': 'needs_auth',
+                'age_limit': 0,
+                'availability': 'public',
'channel_id': 'UCYQT13AtrJC0gsM1far_zJg', 'channel_id': 'UCYQT13AtrJC0gsM1far_zJg',
'channel': 'FlyingKitty', 'channel': 'FlyingKitty',
'channel_url': 'https://www.youtube.com/channel/UCYQT13AtrJC0gsM1far_zJg', 'channel_url': 'https://www.youtube.com/channel/UCYQT13AtrJC0gsM1far_zJg',
@ -443,8 +445,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': '@FlyingKitty900', 'uploader_id': '@FlyingKitty900',
'comment_count': int, 'comment_count': int,
'channel_is_verified': True, 'channel_is_verified': True,
'media_type': 'video',
}, },
'skip': 'Age-restricted; requires authentication',
}, },
{ {
'note': 'Age-gate video embedable only with clientScreen=EMBED', 'note': 'Age-gate video embedable only with clientScreen=EMBED',
@ -507,6 +509,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'Herr Lurik', 'uploader': 'Herr Lurik',
'uploader_url': 'https://www.youtube.com/@HerrLurik', 'uploader_url': 'https://www.youtube.com/@HerrLurik',
'uploader_id': '@HerrLurik', 'uploader_id': '@HerrLurik',
'media_type': 'video',
}, },
}, },
{ {
@ -546,6 +549,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'deadmau5', 'uploader': 'deadmau5',
'uploader_url': 'https://www.youtube.com/@deadmau5', 'uploader_url': 'https://www.youtube.com/@deadmau5',
'uploader_id': '@deadmau5', 'uploader_id': '@deadmau5',
'media_type': 'video',
}, },
'expected_warnings': [ 'expected_warnings': [
'DASH manifest missing', 'DASH manifest missing',
@ -581,6 +585,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': '@Olympics', 'uploader_id': '@Olympics',
'channel_is_verified': True, 'channel_is_verified': True,
'timestamp': 1440707674, 'timestamp': 1440707674,
'media_type': 'livestream',
}, },
'params': { 'params': {
'skip_download': 'requires avconv', 'skip_download': 'requires avconv',
@ -615,6 +620,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_url': 'https://www.youtube.com/@AllenMeow', 'uploader_url': 'https://www.youtube.com/@AllenMeow',
'uploader_id': '@AllenMeow', 'uploader_id': '@AllenMeow',
'timestamp': 1299776999, 'timestamp': 1299776999,
'media_type': 'video',
}, },
}, },
# url_encoded_fmt_stream_map is empty string # url_encoded_fmt_stream_map is empty string
@ -809,6 +815,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'like_count': int, 'like_count': int,
'age_limit': 0, 'age_limit': 0,
'channel_follower_count': int, 'channel_follower_count': int,
'media_type': 'video',
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
@ -868,6 +875,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': '@BKCHarvard', 'uploader_id': '@BKCHarvard',
'uploader_url': 'https://www.youtube.com/@BKCHarvard', 'uploader_url': 'https://www.youtube.com/@BKCHarvard',
'timestamp': 1422422076, 'timestamp': 1422422076,
'media_type': 'video',
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
@ -904,6 +912,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'channel_is_verified': True, 'channel_is_verified': True,
'heatmap': 'count:100', 'heatmap': 'count:100',
'timestamp': 1447987198, 'timestamp': 1447987198,
'media_type': 'video',
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
@ -968,6 +977,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'comment_count': int, 'comment_count': int,
'channel_is_verified': True, 'channel_is_verified': True,
'timestamp': 1484761047, 'timestamp': 1484761047,
'media_type': 'video',
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
@ -1070,6 +1080,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'tags': 'count:11', 'tags': 'count:11',
'live_status': 'not_live', 'live_status': 'not_live',
'channel_follower_count': int, 'channel_follower_count': int,
'media_type': 'video',
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
@ -1124,6 +1135,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_url': 'https://www.youtube.com/@ElevageOrVert', 'uploader_url': 'https://www.youtube.com/@ElevageOrVert',
'uploader_id': '@ElevageOrVert', 'uploader_id': '@ElevageOrVert',
'timestamp': 1497343210, 'timestamp': 1497343210,
'media_type': 'video',
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
@ -1163,6 +1175,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'channel_is_verified': True, 'channel_is_verified': True,
'heatmap': 'count:100', 'heatmap': 'count:100',
'timestamp': 1377976349, 'timestamp': 1377976349,
'media_type': 'video',
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
@ -1207,6 +1220,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'channel_follower_count': int, 'channel_follower_count': int,
'uploader': 'The Cinematic Orchestra', 'uploader': 'The Cinematic Orchestra',
'comment_count': int, 'comment_count': int,
'media_type': 'video',
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
@ -1275,6 +1289,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_url': 'https://www.youtube.com/@walkaroundjapan7124', 'uploader_url': 'https://www.youtube.com/@walkaroundjapan7124',
'uploader_id': '@walkaroundjapan7124', 'uploader_id': '@walkaroundjapan7124',
'timestamp': 1605884416, 'timestamp': 1605884416,
'media_type': 'video',
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
@ -1371,6 +1386,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'channel_is_verified': True, 'channel_is_verified': True,
'heatmap': 'count:100', 'heatmap': 'count:100',
'timestamp': 1395685455, 'timestamp': 1395685455,
'media_type': 'video',
}, 'params': {'format': 'mhtml', 'skip_download': True}, }, 'params': {'format': 'mhtml', 'skip_download': True},
}, { }, {
# Ensure video upload_date is in UTC timezone (video was uploaded 1641170939) # Ensure video upload_date is in UTC timezone (video was uploaded 1641170939)
@ -1401,6 +1417,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': '@LeonNguyen', 'uploader_id': '@LeonNguyen',
'heatmap': 'count:100', 'heatmap': 'count:100',
'timestamp': 1641170939, 'timestamp': 1641170939,
'media_type': 'video',
}, },
}, { }, {
# date text is premiered video, ensure upload date in UTC (published 1641172509) # date text is premiered video, ensure upload date in UTC (published 1641172509)
@ -1434,6 +1451,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'channel_is_verified': True, 'channel_is_verified': True,
'heatmap': 'count:100', 'heatmap': 'count:100',
'timestamp': 1641172509, 'timestamp': 1641172509,
'media_type': 'video',
}, },
}, },
{ # continuous livestream. { # continuous livestream.
@ -1495,6 +1513,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'Lesmiscore', 'uploader': 'Lesmiscore',
'uploader_url': 'https://www.youtube.com/@lesmiscore', 'uploader_url': 'https://www.youtube.com/@lesmiscore',
'timestamp': 1648005313, 'timestamp': 1648005313,
'media_type': 'short',
}, },
}, { }, {
# Prefer primary title+description language metadata by default # Prefer primary title+description language metadata by default
@ -1523,6 +1542,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': '@coletdjnz', 'uploader_id': '@coletdjnz',
'uploader': 'cole-dlp-test-acc', 'uploader': 'cole-dlp-test-acc',
'timestamp': 1662677394, 'timestamp': 1662677394,
'media_type': 'video',
}, },
'params': {'skip_download': True}, 'params': {'skip_download': True},
}, { }, {
@ -1551,6 +1571,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'cole-dlp-test-acc', 'uploader': 'cole-dlp-test-acc',
'timestamp': 1659073275, 'timestamp': 1659073275,
'like_count': int, 'like_count': int,
'media_type': 'video',
}, },
'params': {'skip_download': True, 'extractor_args': {'youtube': {'lang': ['fr']}}}, 'params': {'skip_download': True, 'extractor_args': {'youtube': {'lang': ['fr']}}},
'expected_warnings': [r'Preferring "fr" translated fields'], 'expected_warnings': [r'Preferring "fr" translated fields'],
@ -1587,6 +1608,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'comment_count': int, 'comment_count': int,
'channel_is_verified': True, 'channel_is_verified': True,
'heatmap': 'count:100', 'heatmap': 'count:100',
'media_type': 'video',
}, },
'params': {'extractor_args': {'youtube': {'player_client': ['ios']}}, 'format': '233-1'}, 'params': {'extractor_args': {'youtube': {'player_client': ['ios']}}, 'format': '233-1'},
}, { }, {
@ -1687,6 +1709,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'comment_count': int, 'comment_count': int,
'channel_is_verified': True, 'channel_is_verified': True,
'heatmap': 'count:100', 'heatmap': 'count:100',
'media_type': 'video',
}, },
'params': { 'params': {
'extractor_args': {'youtube': {'player_client': ['ios'], 'player_skip': ['webpage']}}, 'extractor_args': {'youtube': {'player_client': ['ios'], 'player_skip': ['webpage']}},
@ -1719,6 +1742,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'channel_follower_count': int, 'channel_follower_count': int,
'categories': ['People & Blogs'], 'categories': ['People & Blogs'],
'tags': [], 'tags': [],
'media_type': 'short',
}, },
}, },
] ]
@ -1754,6 +1778,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': '@ChristopherSykesDocumentaries', 'uploader_id': '@ChristopherSykesDocumentaries',
'heatmap': 'count:100', 'heatmap': 'count:100',
'timestamp': 1211825920, 'timestamp': 1211825920,
'media_type': 'video',
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
@@ -3787,7 +3812,10 @@ def is_bad_format(fmt):
            'tags': keywords,
            'playable_in_embed': get_first(playability_statuses, 'playableInEmbed'),
            'live_status': live_status,
-            'media_type': 'livestream' if get_first(video_details, 'isLiveContent') else None,
+            'media_type': (
+                'livestream' if get_first(video_details, 'isLiveContent')
+                else 'short' if get_first(microformats, 'isShortsEligible')
+                else 'video'),
            'release_timestamp': live_start_time,
            '_format_sort_fields': (  # source_preference is lower for potentially damaged formats
                'quality', 'res', 'fps', 'hdr:12', 'source', 'vcodec', 'channels', 'acodec', 'lang', 'proto'),
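The media_type change above turns the field into a three-way classification instead of livestream-or-None. Restated as a tiny helper (the two flags are the isLiveContent/isShortsEligible values read from the player's videoDetails and microformat data, as referenced in the diff):

def classify_media_type(is_live_content, is_shorts_eligible):
    if is_live_content:
        return 'livestream'
    if is_shorts_eligible:
        return 'short'
    return 'video'

assert classify_media_type(False, True) == 'short'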