1
0
mirror of https://github.com/yt-dlp/yt-dlp.git synced 2026-01-22 14:51:23 +00:00

Merge branch 'master' of github.com:yt-dlp/yt-dlp

This commit is contained in:
lonm
2025-01-13 14:49:20 +00:00
209 changed files with 6365 additions and 3670 deletions

View File

@@ -208,6 +208,10 @@ from .bandcamp import (
BandcampUserIE,
BandcampWeeklyIE,
)
from .bandlab import (
BandlabIE,
BandlabPlaylistIE,
)
from .bannedvideo import BannedVideoIE
from .bbc import (
BBCIE,
@@ -278,6 +282,7 @@ from .bleacherreport import (
from .blerp import BlerpIE
from .blogger import BloggerIE
from .bloomberg import BloombergIE
from .bluesky import BlueskyIE
from .bokecc import BokeCCIE
from .bongacams import BongaCamsIE
from .boosty import BoostyIE
@@ -363,7 +368,10 @@ from .ccc import (
)
from .ccma import CCMAIE
from .cctv import CCTVIE
from .cda import CDAIE
from .cda import (
CDAIE,
CDAFolderIE,
)
from .cellebrite import CellebriteIE
from .ceskatelevize import CeskaTelevizeIE
from .cgtn import CGTNIE
@@ -398,8 +406,6 @@ from .cmt import CMTIE
from .cnbc import CNBCVideoIE
from .cnn import (
CNNIE,
CNNArticleIE,
CNNBlogsIE,
CNNIndonesiaIE,
)
from .comedycentral import (
@@ -549,6 +555,7 @@ from .dropout import (
DropoutIE,
DropoutSeasonIE,
)
from .drtalks import DrTalksIE
from .drtuber import DrTuberIE
from .drtv import (
DRTVIE,
@@ -706,6 +713,7 @@ from .gab import (
GabTVIE,
)
from .gaia import GaiaIE
from .gamedevtv import GameDevTVDashboardIE
from .gamejolt import (
GameJoltCommunityIE,
GameJoltGameIE,
@@ -939,6 +947,10 @@ from .kaltura import KalturaIE
from .kankanews import KankaNewsIE
from .karaoketv import KaraoketvIE
from .kelbyone import KelbyOneIE
from .kenh14 import (
Kenh14PlaylistIE,
Kenh14VideoIE,
)
from .khanacademy import (
KhanAcademyIE,
KhanAcademyUnitIE,
@@ -1128,12 +1140,6 @@ from .microsoftembed import (
MicrosoftMediusIE,
)
from .microsoftstream import MicrosoftStreamIE
from .mildom import (
MildomClipIE,
MildomIE,
MildomUserVodIE,
MildomVodIE,
)
from .minds import (
MindsChannelIE,
MindsGroupIE,
@@ -1153,6 +1159,7 @@ from .mitele import MiTeleIE
from .mixch import (
MixchArchiveIE,
MixchIE,
MixchMovieIE,
)
from .mixcloud import (
MixcloudIE,
@@ -1514,8 +1521,8 @@ from .pgatour import PGATourIE
from .philharmoniedeparis import PhilharmonieDeParisIE
from .phoenix import PhoenixIE
from .photobucket import PhotobucketIE
from .pialive import PiaLiveIE
from .piapro import PiaproIE
from .piaulizaportal import PIAULIZAPortalIE
from .picarto import (
PicartoIE,
PicartoVodIE,
@@ -1545,16 +1552,13 @@ from .pluralsight import (
PluralsightIE,
)
from .plutotv import PlutoTVIE
from .plvideo import PlVideoIE
from .podbayfm import (
PodbayFMChannelIE,
PodbayFMIE,
)
from .podchaser import PodchaserIE
from .podomatic import PodomaticIE
from .pokemon import (
PokemonIE,
PokemonWatchIE,
)
from .pokergo import (
PokerGoCollectionIE,
PokerGoIE,
@@ -1645,6 +1649,7 @@ from .radiokapital import (
RadioKapitalIE,
RadioKapitalShowIE,
)
from .radioradicale import RadioRadicaleIE
from .radiozet import RadioZetPodcastIE
from .radlive import (
RadLiveChannelIE,
@@ -1936,9 +1941,7 @@ from .spotify import (
)
from .spreaker import (
SpreakerIE,
SpreakerPageIE,
SpreakerShowIE,
SpreakerShowPageIE,
)
from .springboardplatform import SpringboardPlatformIE
from .sprout import SproutIE
@@ -2249,6 +2252,10 @@ from .ufctv import (
)
from .ukcolumn import UkColumnIE
from .uktvplay import UKTVPlayIE
from .uliza import (
UlizaPlayerIE,
UlizaPortalIE,
)
from .umg import UMGDeIE
from .unistra import UnistraIE
from .unity import UnityIE
@@ -2277,10 +2284,6 @@ from .utreon import UtreonIE
from .varzesh3 import Varzesh3IE
from .vbox7 import Vbox7IE
from .veo import VeoIE
from .veoh import (
VeohIE,
VeohUserIE,
)
from .vesti import VestiIE
from .vevo import (
VevoIE,
@@ -2353,10 +2356,6 @@ from .vimm import (
VimmIE,
VimmRecordingIE,
)
from .vine import (
VineIE,
VineUserIE,
)
from .viously import ViouslyIE
from .viqeo import ViqeoIE
from .viu import (

View File

@@ -6,7 +6,6 @@ import hmac
import io
import json
import re
import struct
import time
import urllib.parse
import uuid
@@ -18,10 +17,8 @@ from ..networking.exceptions import TransportError
from ..utils import (
ExtractorError,
OnDemandPagedList,
bytes_to_intlist,
decode_base_n,
int_or_none,
intlist_to_bytes,
time_seconds,
traverse_obj,
update_url_query,
@@ -72,15 +69,15 @@ class AbemaLicenseRH(RequestHandler):
})
res = decode_base_n(license_response['k'], table=self._STRTABLE)
encvideokey = bytes_to_intlist(struct.pack('>QQ', res >> 64, res & 0xffffffffffffffff))
encvideokey = list(res.to_bytes(16, 'big'))
h = hmac.new(
binascii.unhexlify(self._HKEY),
(license_response['cid'] + self.ie._DEVICE_ID).encode(),
digestmod=hashlib.sha256)
enckey = bytes_to_intlist(h.digest())
enckey = list(h.digest())
return intlist_to_bytes(aes_ecb_decrypt(encvideokey, enckey))
return bytes(aes_ecb_decrypt(encvideokey, enckey))
class AbemaTVBaseIE(InfoExtractor):

View File

@@ -11,11 +11,9 @@ from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
ass_subtitles_timecode,
bytes_to_intlist,
bytes_to_long,
float_or_none,
int_or_none,
intlist_to_bytes,
join_nonempty,
long_to_bytes,
parse_iso8601,
@@ -198,16 +196,16 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
links_url = try_get(options, lambda x: x['video']['url']) or (video_base_url + 'link')
self._K = ''.join(random.choices('0123456789abcdef', k=16))
message = bytes_to_intlist(json.dumps({
message = list(json.dumps({
'k': self._K,
't': token,
}))
}).encode())
# Sometimes authentication fails for no good reason, retry with
# a different random padding
links_data = None
for _ in range(3):
padded_message = intlist_to_bytes(pkcs1pad(message, 128))
padded_message = bytes(pkcs1pad(message, 128))
n, e = self._RSA_KEY
encrypted_message = long_to_bytes(pow(bytes_to_long(padded_message), e, n))
authorization = base64.b64encode(encrypted_message).decode()
@@ -234,7 +232,7 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
error = self._parse_json(e.cause.response.read(), video_id)
message = error.get('message')
if e.cause.code == 403 and error.get('code') == 'player-bad-geolocation-country':
if e.cause.status == 403 and error.get('code') == 'player-bad-geolocation-country':
self.raise_geo_restricted(msg=message)
raise ExtractorError(message)
else:

View File

@@ -1362,7 +1362,7 @@ class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should en
def _download_webpage_handle(self, *args, **kwargs):
headers = self.geo_verification_headers()
headers.update(kwargs.get('headers', {}))
headers.update(kwargs.get('headers') or {})
kwargs['headers'] = headers
return super()._download_webpage_handle(
*args, **kwargs)

View File

@@ -33,21 +33,21 @@ class AfreecaTVBaseIE(InfoExtractor):
}
response = self._download_json(
'https://login.afreecatv.com/app/LoginAction.php', None,
'https://login.sooplive.co.kr/app/LoginAction.php', None,
'Logging in', data=urlencode_postdata(login_form))
_ERRORS = {
-4: 'Your account has been suspended due to a violation of our terms and policies.',
-5: 'https://member.afreecatv.com/app/user_delete_progress.php',
-6: 'https://login.afreecatv.com/membership/changeMember.php',
-8: "Hello! AfreecaTV here.\nThe username you have entered belongs to \n an account that requires a legal guardian's consent. \nIf you wish to use our services without restriction, \nplease make sure to go through the necessary verification process.",
-9: 'https://member.afreecatv.com/app/pop_login_block.php',
-11: 'https://login.afreecatv.com/afreeca/second_login.php',
-12: 'https://member.afreecatv.com/app/user_security.php',
-5: 'https://member.sooplive.co.kr/app/user_delete_progress.php',
-6: 'https://login.sooplive.co.kr/membership/changeMember.php',
-8: "Hello! Soop here.\nThe username you have entered belongs to \n an account that requires a legal guardian's consent. \nIf you wish to use our services without restriction, \nplease make sure to go through the necessary verification process.",
-9: 'https://member.sooplive.co.kr/app/pop_login_block.php',
-11: 'https://login.sooplive.co.kr/afreeca/second_login.php',
-12: 'https://member.sooplive.co.kr/app/user_security.php',
0: 'The username does not exist or you have entered the wrong password.',
-1: 'The username does not exist or you have entered the wrong password.',
-3: 'You have entered your username/password incorrectly.',
-7: 'You cannot use your Global AfreecaTV account to access Korean AfreecaTV.',
-7: 'You cannot use your Global Soop account to access Korean Soop.',
-10: 'Sorry for the inconvenience. \nYour account has been blocked due to an unauthorized access. \nPlease contact our Help Center for assistance.',
-32008: 'You have failed to log in. Please contact our Help Center.',
}
@@ -61,76 +61,48 @@ class AfreecaTVBaseIE(InfoExtractor):
def _call_api(self, endpoint, display_id, data=None, headers=None, query=None):
return self._download_json(Request(
f'https://api.m.afreecatv.com/{endpoint}',
f'https://api.m.sooplive.co.kr/{endpoint}',
data=data, headers=headers, query=query,
extensions={'legacy_ssl': True}), display_id,
'Downloading API JSON', 'Unable to download API JSON')
@staticmethod
def _fixup_thumb(thumb_url):
if not url_or_none(thumb_url):
return None
# Core would determine_ext as 'php' from the url, so we need to provide the real ext
# See: https://github.com/yt-dlp/yt-dlp/issues/11537
return [{'url': thumb_url, 'ext': 'jpg'}]
class AfreecaTVIE(AfreecaTVBaseIE):
IE_NAME = 'afreecatv'
IE_DESC = 'afreecatv.com'
_VALID_URL = r'''(?x)
https?://
(?:
(?:(?:live|afbbs|www)\.)?afreeca(?:tv)?\.com(?::\d+)?
(?:
/app/(?:index|read_ucc_bbs)\.cgi|
/player/[Pp]layer\.(?:swf|html)
)\?.*?\bnTitleNo=|
vod\.afreecatv\.com/(PLAYER/STATION|player)/
)
(?P<id>\d+)/?(?:$|[?#&])
'''
IE_NAME = 'soop'
IE_DESC = 'sooplive.co.kr'
_VALID_URL = r'https?://vod\.(?:sooplive\.co\.kr|afreecatv\.com)/(?:PLAYER/STATION|player)/(?P<id>\d+)/?(?:$|[?#&])'
_TESTS = [{
'url': 'http://live.afreecatv.com:8079/app/index.cgi?szType=read_ucc_bbs&szBjId=dailyapril&nStationNo=16711924&nBbsNo=18605867&nTitleNo=36164052&szSkin=',
'md5': 'f72c89fe7ecc14c1b5ce506c4996046e',
'url': 'https://vod.sooplive.co.kr/player/96753363',
'info_dict': {
'id': '36164052',
'id': '20230108_9FF5BEE1_244432674_1',
'ext': 'mp4',
'title': '데일리 에이프릴 요정들의 시상식!',
'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
'uploader': 'dailyapril',
'uploader_id': 'dailyapril',
'upload_date': '20160503',
'uploader_id': 'rlantnghks',
'uploader': '페이즈으',
'duration': 10840,
'thumbnail': r're:https?://videoimg\.sooplive\.co/.kr/.+',
'upload_date': '20230108',
'timestamp': 1673218805,
'title': '젠지 페이즈',
},
'skip': 'Video is gone',
}, {
'url': 'http://afbbs.afreecatv.com:8080/app/read_ucc_bbs.cgi?nStationNo=16711924&nTitleNo=36153164&szBjId=dailyapril&nBbsNo=18605867',
'info_dict': {
'id': '36153164',
'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'",
'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
'uploader': 'dailyapril',
'uploader_id': 'dailyapril',
'params': {
'skip_download': True,
},
'playlist_count': 2,
'playlist': [{
'md5': 'd8b7c174568da61d774ef0203159bf97',
'info_dict': {
'id': '36153164_1',
'ext': 'mp4',
'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'",
'upload_date': '20160502',
},
}, {
'md5': '58f2ce7f6044e34439ab2d50612ab02b',
'info_dict': {
'id': '36153164_2',
'ext': 'mp4',
'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'",
'upload_date': '20160502',
},
}],
'skip': 'Video is gone',
}, {
# non standard key
'url': 'http://vod.afreecatv.com/PLAYER/STATION/20515605',
'url': 'http://vod.sooplive.co.kr/PLAYER/STATION/20515605',
'info_dict': {
'id': '20170411_BE689A0E_190960999_1_2_h',
'ext': 'mp4',
'title': '혼자사는여자집',
'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
'thumbnail': r're:https?://(?:video|st)img\.sooplive\.co\.kr/.+',
'uploader': '♥이슬이',
'uploader_id': 'dasl8121',
'upload_date': '20170411',
@@ -142,12 +114,12 @@ class AfreecaTVIE(AfreecaTVBaseIE):
},
}, {
# adult content
'url': 'https://vod.afreecatv.com/player/97267690',
'url': 'https://vod.sooplive.co.kr/player/97267690',
'info_dict': {
'id': '20180327_27901457_202289533_1',
'ext': 'mp4',
'title': '[생]빨개요♥ (part 1)',
'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
'thumbnail': r're:https?://(?:video|st)img\.sooplive\.co\.kr/.+',
'uploader': '[SA]서아',
'uploader_id': 'bjdyrksu',
'upload_date': '20180327',
@@ -157,36 +129,17 @@ class AfreecaTVIE(AfreecaTVBaseIE):
'skip_download': True,
},
'skip': 'The VOD does not exist',
}, {
'url': 'http://www.afreecatv.com/player/Player.swf?szType=szBjId=djleegoon&nStationNo=11273158&nBbsNo=13161095&nTitleNo=36327652',
'only_matching': True,
}, {
'url': 'https://vod.afreecatv.com/player/96753363',
'info_dict': {
'id': '20230108_9FF5BEE1_244432674_1',
'ext': 'mp4',
'uploader_id': 'rlantnghks',
'uploader': '페이즈으',
'duration': 10840,
'thumbnail': r're:https?://videoimg\.afreecatv\.com/.+',
'upload_date': '20230108',
'timestamp': 1673218805,
'title': '젠지 페이즈',
},
'params': {
'skip_download': True,
},
}, {
# adult content
'url': 'https://vod.afreecatv.com/player/70395877',
'url': 'https://vod.sooplive.co.kr/player/70395877',
'only_matching': True,
}, {
# subscribers only
'url': 'https://vod.afreecatv.com/player/104647403',
'url': 'https://vod.sooplive.co.kr/player/104647403',
'only_matching': True,
}, {
# private
'url': 'https://vod.afreecatv.com/player/81669846',
'url': 'https://vod.sooplive.co.kr/player/81669846',
'only_matching': True,
}]
@@ -209,8 +162,8 @@ class AfreecaTVIE(AfreecaTVBaseIE):
'title': ('title', {str}),
'uploader': ('writer_nick', {str}),
'uploader_id': ('bj_id', {str}),
'duration': ('total_file_duration', {functools.partial(int_or_none, scale=1000)}),
'thumbnail': ('thumb', {url_or_none}),
'duration': ('total_file_duration', {int_or_none(scale=1000)}),
'thumbnails': ('thumb', {self._fixup_thumb}),
})
entries = []
@@ -233,7 +186,7 @@ class AfreecaTVIE(AfreecaTVBaseIE):
'title': f'{common_info.get("title") or "Untitled"} (part {file_num})',
'formats': formats,
**traverse_obj(file_element, {
'duration': ('duration', {functools.partial(int_or_none, scale=1000)}),
'duration': ('duration', {int_or_none(scale=1000)}),
'timestamp': ('file_start', {unified_timestamp}),
}),
})
@@ -262,11 +215,11 @@ class AfreecaTVIE(AfreecaTVBaseIE):
class AfreecaTVCatchStoryIE(AfreecaTVBaseIE):
IE_NAME = 'afreecatv:catchstory'
IE_DESC = 'afreecatv.com catch story'
_VALID_URL = r'https?://vod\.afreecatv\.com/player/(?P<id>\d+)/catchstory'
IE_NAME = 'soop:catchstory'
IE_DESC = 'sooplive.co.kr catch story'
_VALID_URL = r'https?://vod\.(?:sooplive\.co\.kr|afreecatv\.com)/player/(?P<id>\d+)/catchstory'
_TESTS = [{
'url': 'https://vod.afreecatv.com/player/103247/catchstory',
'url': 'https://vod.sooplive.co.kr/player/103247/catchstory',
'info_dict': {
'id': '103247',
},
@@ -281,29 +234,28 @@ class AfreecaTVCatchStoryIE(AfreecaTVBaseIE):
return self.playlist_result(self._entries(data), video_id)
@staticmethod
def _entries(data):
def _entries(self, data):
# 'files' is always a list with 1 element
yield from traverse_obj(data, (
'data', lambda _, v: v['story_type'] == 'catch',
'catch_list', lambda _, v: v['files'][0]['file'], {
'id': ('files', 0, 'file_info_key', {str}),
'url': ('files', 0, 'file', {url_or_none}),
'duration': ('files', 0, 'duration', {functools.partial(int_or_none, scale=1000)}),
'duration': ('files', 0, 'duration', {int_or_none(scale=1000)}),
'title': ('title', {str}),
'uploader': ('writer_nick', {str}),
'uploader_id': ('writer_id', {str}),
'thumbnail': ('thumb', {url_or_none}),
'thumbnails': ('thumb', {self._fixup_thumb}),
'timestamp': ('write_timestamp', {int_or_none}),
}))
class AfreecaTVLiveIE(AfreecaTVBaseIE):
IE_NAME = 'afreecatv:live'
IE_DESC = 'afreecatv.com livestreams'
_VALID_URL = r'https?://play\.afreeca(?:tv)?\.com/(?P<id>[^/]+)(?:/(?P<bno>\d+))?'
IE_NAME = 'soop:live'
IE_DESC = 'sooplive.co.kr livestreams'
_VALID_URL = r'https?://play\.(?:sooplive\.co\.kr|afreecatv\.com)/(?P<id>[^/?#]+)(?:/(?P<bno>\d+))?'
_TESTS = [{
'url': 'https://play.afreecatv.com/pyh3646/237852185',
'url': 'https://play.sooplive.co.kr/pyh3646/237852185',
'info_dict': {
'id': '237852185',
'ext': 'mp4',
@@ -315,30 +267,30 @@ class AfreecaTVLiveIE(AfreecaTVBaseIE):
},
'skip': 'Livestream has ended',
}, {
'url': 'https://play.afreecatv.com/pyh3646/237852185',
'url': 'https://play.sooplive.co.kr/pyh3646/237852185',
'only_matching': True,
}, {
'url': 'https://play.afreecatv.com/pyh3646',
'url': 'https://play.sooplive.co.kr/pyh3646',
'only_matching': True,
}]
_LIVE_API_URL = 'https://live.afreecatv.com/afreeca/player_live_api.php'
_LIVE_API_URL = 'https://live.sooplive.co.kr/afreeca/player_live_api.php'
_WORKING_CDNS = [
'gcp_cdn', # live-global-cdn-v02.afreecatv.com
'gs_cdn_pc_app', # pc-app.stream.afreecatv.com
'gs_cdn_mobile_web', # mobile-web.stream.afreecatv.com
'gs_cdn_pc_web', # pc-web.stream.afreecatv.com
'gcp_cdn', # live-global-cdn-v02.sooplive.co.kr
'gs_cdn_pc_app', # pc-app.stream.sooplive.co.kr
'gs_cdn_mobile_web', # mobile-web.stream.sooplive.co.kr
'gs_cdn_pc_web', # pc-web.stream.sooplive.co.kr
]
_BAD_CDNS = [
'gs_cdn', # chromecast.afreeca.gscdn.com (cannot resolve)
'gs_cdn_chromecast', # chromecast.stream.afreecatv.com (HTTP Error 400)
'azure_cdn', # live-global-cdn-v01.afreecatv.com (cannot resolve)
'aws_cf', # live-global-cdn-v03.afreecatv.com (cannot resolve)
'kt_cdn', # kt.stream.afreecatv.com (HTTP Error 400)
'gs_cdn_chromecast', # chromecast.stream.sooplive.co.kr (HTTP Error 400)
'azure_cdn', # live-global-cdn-v01.sooplive.co.kr (cannot resolve)
'aws_cf', # live-global-cdn-v03.sooplive.co.kr (cannot resolve)
'kt_cdn', # kt.stream.sooplive.co.kr (HTTP Error 400)
]
def _extract_formats(self, channel_info, broadcast_no, aid):
stream_base_url = channel_info.get('RMD') or 'https://livestream-manager.afreecatv.com'
stream_base_url = channel_info.get('RMD') or 'https://livestream-manager.sooplive.co.kr'
# If user has not passed CDN IDs, try API-provided CDN ID followed by other working CDN IDs
default_cdn_ids = orderedSet([
@@ -358,7 +310,7 @@ class AfreecaTVLiveIE(AfreecaTVBaseIE):
try:
return self._extract_m3u8_formats(
m3u8_url, broadcast_no, 'mp4', m3u8_id='hls', query={'aid': aid},
headers={'Referer': 'https://play.afreecatv.com/'})
headers={'Referer': 'https://play.sooplive.co.kr/'})
except ExtractorError as e:
if attempt == len(cdn_ids):
raise
@@ -374,7 +326,13 @@ class AfreecaTVLiveIE(AfreecaTVBaseIE):
broadcaster_id = channel_info.get('BJID') or broadcaster_id
broadcast_no = channel_info.get('BNO') or broadcast_no
if not broadcast_no:
raise UserNotLive(video_id=broadcaster_id)
result = channel_info.get('RESULT')
if result == 0:
raise UserNotLive(video_id=broadcaster_id)
elif result == -6:
self.raise_login_required(
'This channel is streaming for subscribers only', method='password')
raise ExtractorError('Unable to extract broadcast number')
password = self.get_param('videopassword')
if channel_info.get('BPWD') == 'Y' and password is None:
@@ -403,7 +361,7 @@ class AfreecaTVLiveIE(AfreecaTVBaseIE):
formats = self._extract_formats(channel_info, broadcast_no, aid)
station_info = traverse_obj(self._download_json(
'https://st.afreecatv.com/api/get_station_status.php', broadcast_no,
'https://st.sooplive.co.kr/api/get_station_status.php', broadcast_no,
'Downloading channel metadata', 'Unable to download channel metadata',
query={'szBjId': broadcaster_id}, fatal=False), {dict}) or {}
@@ -419,11 +377,11 @@ class AfreecaTVLiveIE(AfreecaTVBaseIE):
}
class AfreecaTVUserIE(InfoExtractor):
IE_NAME = 'afreecatv:user'
_VALID_URL = r'https?://bj\.afreeca(?:tv)?\.com/(?P<id>[^/]+)/vods/?(?P<slug_type>[^/]+)?'
class AfreecaTVUserIE(AfreecaTVBaseIE):
IE_NAME = 'soop:user'
_VALID_URL = r'https?://ch\.(?:sooplive\.co\.kr|afreecatv\.com)/(?P<id>[^/?#]+)/vods/?(?P<slug_type>[^/?#]+)?'
_TESTS = [{
'url': 'https://bj.afreecatv.com/ryuryu24/vods/review',
'url': 'https://ch.sooplive.co.kr/ryuryu24/vods/review',
'info_dict': {
'_type': 'playlist',
'id': 'ryuryu24',
@@ -431,7 +389,7 @@ class AfreecaTVUserIE(InfoExtractor):
},
'playlist_count': 218,
}, {
'url': 'https://bj.afreecatv.com/parang1995/vods/highlight',
'url': 'https://ch.sooplive.co.kr/parang1995/vods/highlight',
'info_dict': {
'_type': 'playlist',
'id': 'parang1995',
@@ -439,7 +397,7 @@ class AfreecaTVUserIE(InfoExtractor):
},
'playlist_count': 997,
}, {
'url': 'https://bj.afreecatv.com/ryuryu24/vods',
'url': 'https://ch.sooplive.co.kr/ryuryu24/vods',
'info_dict': {
'_type': 'playlist',
'id': 'ryuryu24',
@@ -447,7 +405,7 @@ class AfreecaTVUserIE(InfoExtractor):
},
'playlist_count': 221,
}, {
'url': 'https://bj.afreecatv.com/ryuryu24/vods/balloonclip',
'url': 'https://ch.sooplive.co.kr/ryuryu24/vods/balloonclip',
'info_dict': {
'_type': 'playlist',
'id': 'ryuryu24',
@@ -459,12 +417,12 @@ class AfreecaTVUserIE(InfoExtractor):
def _fetch_page(self, user_id, user_type, page):
page += 1
info = self._download_json(f'https://bjapi.afreecatv.com/api/{user_id}/vods/{user_type}', user_id,
info = self._download_json(f'https://chapi.sooplive.co.kr/api/{user_id}/vods/{user_type}', user_id,
query={'page': page, 'per_page': self._PER_PAGE, 'orderby': 'reg_date'},
note=f'Downloading {user_type} video page {page}')
for item in info['data']:
yield self.url_result(
f'https://vod.afreecatv.com/player/{item["title_no"]}/', AfreecaTVIE, item['title_no'])
f'https://vod.sooplive.co.kr/player/{item["title_no"]}/', AfreecaTVIE, item['title_no'])
def _real_extract(self, url):
user_id, user_type = self._match_valid_url(url).group('id', 'slug_type')

View File

@@ -71,7 +71,7 @@ class AllstarBaseIE(InfoExtractor):
'thumbnails': (('clipImageThumb', 'clipImageSource'), {'url': {media_url_or_none}}),
'duration': ('clipLength', {int_or_none}),
'filesize': ('clipSizeBytes', {int_or_none}),
'timestamp': ('createdDate', {functools.partial(int_or_none, scale=1000)}),
'timestamp': ('createdDate', {int_or_none(scale=1000)}),
'uploader': ('username', {str}),
'uploader_id': ('user', '_id', {str}),
'view_count': ('views', {int_or_none}),

View File

@@ -8,10 +8,8 @@ import time
from .common import InfoExtractor
from ..aes import aes_encrypt
from ..utils import (
bytes_to_intlist,
determine_ext,
int_or_none,
intlist_to_bytes,
join_nonempty,
smuggle_url,
strip_jsonp,
@@ -33,24 +31,6 @@ class AnvatoIE(InfoExtractor):
_AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce' # from anvplayer.min.js
_TESTS = [{
# from https://www.nfl.com/videos/baker-mayfield-s-game-changing-plays-from-3-td-game-week-14
'url': 'anvato:GXvEgwyJeWem8KCYXfeoHWknwP48Mboj:899441',
'md5': '921919dab3cd0b849ff3d624831ae3e2',
'info_dict': {
'id': '899441',
'ext': 'mp4',
'title': 'Baker Mayfield\'s game-changing plays from 3-TD game Week 14',
'description': 'md5:85e05a3cc163f8c344340f220521136d',
'upload_date': '20201215',
'timestamp': 1608009755,
'thumbnail': r're:^https?://.*\.jpg',
'uploader': 'NFL',
'tags': ['Baltimore Ravens at Cleveland Browns (2020-REG-14)', 'Baker Mayfield', 'Game Highlights',
'Player Highlights', 'Cleveland Browns', 'league'],
'duration': 157,
'categories': ['Entertainment', 'Game', 'Highlights'],
},
}, {
# from https://ktla.com/news/99-year-old-woman-learns-to-fly-in-torrance-checks-off-bucket-list-dream/
'url': 'anvato:X8POa4zpGZMmeiq0wqiO8IP5rMqQM9VN:8032455',
'md5': '837718bcfb3a7778d022f857f7a9b19e',
@@ -241,31 +221,6 @@ class AnvatoIE(InfoExtractor):
'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582',
}
def _generate_nfl_token(self, anvack, mcp_id):
reroute = self._download_json(
'https://api.nfl.com/v1/reroute', mcp_id, data=b'grant_type=client_credentials',
headers={'X-Domain-Id': 100}, note='Fetching token info')
token_type = reroute.get('token_type') or 'Bearer'
auth_token = f'{token_type} {reroute["access_token"]}'
response = self._download_json(
'https://api.nfl.com/v3/shield/', mcp_id, data=json.dumps({
'query': '''{
viewer {
mediaToken(anvack: "%s", id: %s) {
token
}
}
}''' % (anvack, mcp_id), # noqa: UP031
}).encode(), headers={
'Authorization': auth_token,
'Content-Type': 'application/json',
}, note='Fetching NFL API token')
return traverse_obj(response, ('data', 'viewer', 'mediaToken', 'token'))
_TOKEN_GENERATORS = {
'GXvEgwyJeWem8KCYXfeoHWknwP48Mboj': _generate_nfl_token,
}
def _server_time(self, access_key, video_id):
return int_or_none(traverse_obj(self._download_json(
f'{self._API_BASE_URL}/server_time', video_id, query={'anvack': access_key},
@@ -277,8 +232,8 @@ class AnvatoIE(InfoExtractor):
server_time = self._server_time(access_key, video_id)
input_data = f'{server_time}~{md5_text(video_data_url)}~{md5_text(server_time)}'
auth_secret = intlist_to_bytes(aes_encrypt(
bytes_to_intlist(input_data[:64]), bytes_to_intlist(self._AUTH_KEY)))
auth_secret = bytes(aes_encrypt(
list(input_data[:64].encode()), list(self._AUTH_KEY)))
query = {
'X-Anvato-Adst-Auth': base64.b64encode(auth_secret).decode('ascii'),
'rtyp': 'fp',
@@ -290,8 +245,6 @@ class AnvatoIE(InfoExtractor):
}
if extracted_token is not None:
api['anvstk2'] = extracted_token
elif self._TOKEN_GENERATORS.get(access_key) is not None:
api['anvstk2'] = self._TOKEN_GENERATORS[access_key](self, access_key, video_id)
elif self._ANVACK_TABLE.get(access_key) is not None:
api['anvstk'] = md5_text(f'{access_key}|{anvrid}|{server_time}|{self._ANVACK_TABLE[access_key]}')
else:

View File

@@ -205,6 +205,26 @@ class ArchiveOrgIE(InfoExtractor):
},
},
],
}, {
# The reviewbody is None for one of the reviews; just need to extract data without crashing
'url': 'https://archive.org/details/gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn',
'info_dict': {
'id': 'gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn',
'ext': 'mp3',
'title': 'Stuck Inside of Mobile with the Memphis Blues Again',
'creators': ['Grateful Dead'],
'duration': 338.31,
'track': 'Stuck Inside of Mobile with the Memphis Blues Again',
'description': 'md5:764348a470b986f1217ffd38d6ac7b72',
'display_id': 'gd95-04-02d1t04.shn',
'location': 'Pyramid Arena',
'uploader': 'jon@archive.org',
'album': '1995-04-02 - Pyramid Arena',
'upload_date': '20040519',
'track_number': 4,
'release_date': '19950402',
'timestamp': 1084927901,
},
}]
@staticmethod
@@ -335,7 +355,7 @@ class ArchiveOrgIE(InfoExtractor):
info['comments'].append({
'id': review.get('review_id'),
'author': review.get('reviewer'),
'text': str_or_none(review.get('reviewtitle'), '') + '\n\n' + review.get('reviewbody'),
'text': join_nonempty('reviewtitle', 'reviewbody', from_dict=review, delim='\n\n'),
'timestamp': unified_timestamp(review.get('createdate')),
'parent': 'root'})

View File

@@ -299,7 +299,7 @@ class ARDBetaMediathekIE(InfoExtractor):
'info_dict': {
'id': '94834686',
'ext': 'mp4',
'duration': 2700,
'duration': 2670,
'episode': '7 Tage ... unter harten Jungs',
'description': 'md5:0f215470dcd2b02f59f4bd10c963f072',
'upload_date': '20231005',
@@ -307,10 +307,28 @@ class ARDBetaMediathekIE(InfoExtractor):
'display_id': 'N2I2YmM5MzgtNWFlOS00ZGFlLTg2NzMtYzNjM2JlNjk4MDg3',
'series': '7 Tage ...',
'channel': 'HR',
'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:f6e6d5ffac41925c?w=960&ch=fa32ba69bc87989a',
'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:430c86d233afa42d?w=960&ch=fa32ba69bc87989a',
'title': '7 Tage ... unter harten Jungs',
'_old_archive_ids': ['ardbetamediathek N2I2YmM5MzgtNWFlOS00ZGFlLTg2NzMtYzNjM2JlNjk4MDg3'],
},
}, {
'url': 'https://www.ardmediathek.de/video/lokalzeit-aus-duesseldorf/lokalzeit-aus-duesseldorf-oder-31-10-2024/wdr-duesseldorf/Y3JpZDovL3dkci5kZS9CZWl0cmFnLXNvcGhvcmEtOWFkMTc0ZWMtMDA5ZS00ZDEwLWFjYjctMGNmNTdhNzVmNzUz',
'info_dict': {
'id': '13847165',
'chapters': 'count:8',
'ext': 'mp4',
'channel': 'WDR',
'display_id': 'Y3JpZDovL3dkci5kZS9CZWl0cmFnLXNvcGhvcmEtOWFkMTc0ZWMtMDA5ZS00ZDEwLWFjYjctMGNmNTdhNzVmNzUz',
'episode': 'Lokalzeit aus Düsseldorf | 31.10.2024',
'series': 'Lokalzeit aus Düsseldorf',
'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:f02ec9bd9b7bd5f6?w=960&ch=612491dcd5e09b0c',
'title': 'Lokalzeit aus Düsseldorf | 31.10.2024',
'upload_date': '20241031',
'timestamp': 1730399400,
'description': 'md5:12db30b3b706314efe3778b8df1a7058',
'duration': 1759,
'_old_archive_ids': ['ardbetamediathek Y3JpZDovL3dkci5kZS9CZWl0cmFnLXNvcGhvcmEtOWFkMTc0ZWMtMDA5ZS00ZDEwLWFjYjctMGNmNTdhNzVmNzUz'],
},
}, {
'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
'only_matching': True,
@@ -455,6 +473,12 @@ class ARDBetaMediathekIE(InfoExtractor):
'subtitles': subtitles,
'is_live': is_live,
'age_limit': age_limit,
**traverse_obj(media_data, {
'chapters': ('pluginData', 'jumpmarks@all', 'chapterArray', lambda _, v: int_or_none(v['chapterTime']), {
'start_time': ('chapterTime', {int_or_none}),
'title': ('chapterTitle', {str}),
}),
}),
**traverse_obj(media_data, ('meta', {
'title': 'title',
'description': 'synopsis',

View File

@@ -1,4 +1,3 @@
import functools
import json
import random
import re
@@ -10,7 +9,6 @@ from ..utils import (
ExtractorError,
extract_attributes,
float_or_none,
get_element_html_by_id,
int_or_none,
parse_filesize,
str_or_none,
@@ -21,7 +19,7 @@ from ..utils import (
url_or_none,
urljoin,
)
from ..utils.traversal import traverse_obj
from ..utils.traversal import find_element, traverse_obj
class BandcampIE(InfoExtractor):
@@ -45,6 +43,8 @@ class BandcampIE(InfoExtractor):
'uploader_url': 'https://youtube-dl.bandcamp.com',
'uploader_id': 'youtube-dl',
'thumbnail': 'https://f4.bcbits.com/img/a3216802731_5.jpg',
'artists': ['youtube-dl "\'/\\ä↭'],
'album_artists': ['youtube-dl "\'/\\ä↭'],
},
'skip': 'There is a limit of 200 free downloads / month for the test song',
}, {
@@ -271,6 +271,18 @@ class BandcampAlbumIE(BandcampIE): # XXX: Do not subclass from concrete IE
'timestamp': 1311756226,
'upload_date': '20110727',
'uploader': 'Blazo',
'thumbnail': 'https://f4.bcbits.com/img/a1721150828_5.jpg',
'album_artists': ['Blazo'],
'uploader_url': 'https://blazo.bandcamp.com',
'release_date': '20110727',
'release_timestamp': 1311724800.0,
'track': 'Intro',
'uploader_id': 'blazo',
'track_number': 1,
'album': 'Jazz Format Mixtape vol.1',
'artists': ['Blazo'],
'duration': 19.335,
'track_id': '1353101989',
},
},
{
@@ -282,6 +294,18 @@ class BandcampAlbumIE(BandcampIE): # XXX: Do not subclass from concrete IE
'timestamp': 1311757238,
'upload_date': '20110727',
'uploader': 'Blazo',
'track': 'Kero One - Keep It Alive (Blazo remix)',
'release_date': '20110727',
'track_id': '38097443',
'track_number': 2,
'duration': 181.467,
'uploader_url': 'https://blazo.bandcamp.com',
'album': 'Jazz Format Mixtape vol.1',
'uploader_id': 'blazo',
'album_artists': ['Blazo'],
'artists': ['Blazo'],
'thumbnail': 'https://f4.bcbits.com/img/a1721150828_5.jpg',
'release_timestamp': 1311724800.0,
},
},
],
@@ -289,6 +313,7 @@ class BandcampAlbumIE(BandcampIE): # XXX: Do not subclass from concrete IE
'title': 'Jazz Format Mixtape vol.1',
'id': 'jazz-format-mixtape-vol-1',
'uploader_id': 'blazo',
'description': 'md5:38052a93217f3ffdc033cd5dbbce2989',
},
'params': {
'playlistend': 2,
@@ -363,10 +388,10 @@ class BandcampWeeklyIE(BandcampIE): # XXX: Do not subclass from concrete IE
_VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)'
_TESTS = [{
'url': 'https://bandcamp.com/?show=224',
'md5': 'b00df799c733cf7e0c567ed187dea0fd',
'md5': '61acc9a002bed93986b91168aa3ab433',
'info_dict': {
'id': '224',
'ext': 'opus',
'ext': 'mp3',
'title': 'BC Weekly April 4th 2017 - Magic Moments',
'description': 'md5:5d48150916e8e02d030623a48512c874',
'duration': 5829.77,
@@ -376,7 +401,7 @@ class BandcampWeeklyIE(BandcampIE): # XXX: Do not subclass from concrete IE
'episode_id': '224',
},
'params': {
'format': 'opus-lo',
'format': 'mp3-128',
},
}, {
'url': 'https://bandcamp.com/?blah/blah@&show=228',
@@ -484,7 +509,7 @@ class BandcampUserIE(InfoExtractor):
or re.findall(r'<div[^>]+trackTitle["\'][^"\']+["\']([^"\']+)', webpage))
yield from traverse_obj(webpage, (
{functools.partial(get_element_html_by_id, 'music-grid')}, {extract_attributes},
{find_element(id='music-grid', html=True)}, {extract_attributes},
'data-client-items', {json.loads}, ..., 'page_url', {str}))
def _real_extract(self, url):
@@ -493,4 +518,4 @@ class BandcampUserIE(InfoExtractor):
return self.playlist_from_matches(
self._yield_items(webpage), uploader, f'Discography of {uploader}',
getter=functools.partial(urljoin, url))
getter=urljoin(url))

437
yt_dlp/extractor/bandlab.py Normal file
View File

@@ -0,0 +1,437 @@
from .common import InfoExtractor
from ..utils import (
ExtractorError,
float_or_none,
format_field,
int_or_none,
parse_iso8601,
parse_qs,
truncate_string,
url_or_none,
)
from ..utils.traversal import traverse_obj, value
class BandlabBaseIE(InfoExtractor):
def _call_api(self, endpoint, asset_id, **kwargs):
headers = kwargs.pop('headers', None) or {}
return self._download_json(
f'https://www.bandlab.com/api/v1.3/{endpoint}/{asset_id}',
asset_id, headers={
'accept': 'application/json',
'referer': 'https://www.bandlab.com/',
'x-client-id': 'BandLab-Web',
'x-client-version': '10.1.124',
**headers,
}, **kwargs)
def _parse_revision(self, revision_data, url=None):
return {
'vcodec': 'none',
'media_type': 'revision',
'extractor_key': BandlabIE.ie_key(),
'extractor': BandlabIE.IE_NAME,
**traverse_obj(revision_data, {
'webpage_url': (
'id', ({value(url)}, {format_field(template='https://www.bandlab.com/revision/%s')}), filter, any),
'id': (('revisionId', 'id'), {str}, any),
'title': ('song', 'name', {str}),
'track': ('song', 'name', {str}),
'url': ('mixdown', 'file', {url_or_none}),
'thumbnail': ('song', 'picture', 'url', {url_or_none}),
'description': ('description', {str}),
'uploader': ('creator', 'name', {str}),
'uploader_id': ('creator', 'username', {str}),
'timestamp': ('createdOn', {parse_iso8601}),
'duration': ('mixdown', 'duration', {float_or_none}),
'view_count': ('counters', 'plays', {int_or_none}),
'like_count': ('counters', 'likes', {int_or_none}),
'comment_count': ('counters', 'comments', {int_or_none}),
'genres': ('genres', ..., 'name', {str}),
}),
}
def _parse_track(self, track_data, url=None):
return {
'vcodec': 'none',
'media_type': 'track',
'extractor_key': BandlabIE.ie_key(),
'extractor': BandlabIE.IE_NAME,
**traverse_obj(track_data, {
'webpage_url': (
'id', ({value(url)}, {format_field(template='https://www.bandlab.com/post/%s')}), filter, any),
'id': (('revisionId', 'id'), {str}, any),
'url': ('track', 'sample', 'audioUrl', {url_or_none}),
'title': ('track', 'name', {str}),
'track': ('track', 'name', {str}),
'description': ('caption', {str}),
'thumbnail': ('track', 'picture', ('original', 'url'), {url_or_none}, any),
'view_count': ('counters', 'plays', {int_or_none}),
'like_count': ('counters', 'likes', {int_or_none}),
'comment_count': ('counters', 'comments', {int_or_none}),
'duration': ('track', 'sample', 'duration', {float_or_none}),
'uploader': ('creator', 'name', {str}),
'uploader_id': ('creator', 'username', {str}),
'timestamp': ('createdOn', {parse_iso8601}),
}),
}
def _parse_video(self, video_data, url=None):
return {
'media_type': 'video',
'extractor_key': BandlabIE.ie_key(),
'extractor': BandlabIE.IE_NAME,
**traverse_obj(video_data, {
'id': ('id', {str}),
'webpage_url': (
'id', ({value(url)}, {format_field(template='https://www.bandlab.com/post/%s')}), filter, any),
'url': ('video', 'url', {url_or_none}),
'title': ('caption', {lambda x: x.replace('\n', ' ')}, {truncate_string(left=50)}),
'description': ('caption', {str}),
'thumbnail': ('video', 'picture', 'url', {url_or_none}),
'view_count': ('video', 'counters', 'plays', {int_or_none}),
'like_count': ('video', 'counters', 'likes', {int_or_none}),
'comment_count': ('counters', 'comments', {int_or_none}),
'duration': ('video', 'duration', {float_or_none}),
'uploader': ('creator', 'name', {str}),
'uploader_id': ('creator', 'username', {str}),
}),
}
class BandlabIE(BandlabBaseIE):
_VALID_URL = [
r'https?://(?:www\.)?bandlab.com/(?P<url_type>track|post|revision)/(?P<id>[\da-f_-]+)',
r'https?://(?:www\.)?bandlab.com/(?P<url_type>embed)/\?(?:[^#]*&)?id=(?P<id>[\da-f-]+)',
]
_EMBED_REGEX = [rf'<iframe[^>]+src=[\'"](?P<url>{_VALID_URL[1]})[\'"]']
_TESTS = [{
'url': 'https://www.bandlab.com/track/04b37e88dba24967b9dac8eb8567ff39_07d7f906fc96ee11b75e000d3a428fff',
'md5': '46f7b43367dd268bbcf0bbe466753b2c',
'info_dict': {
'id': '02d7f906-fc96-ee11-b75e-000d3a428fff',
'ext': 'm4a',
'uploader_id': 'ender_milze',
'track': 'sweet black',
'description': 'composed by juanjn3737',
'timestamp': 1702171963,
'view_count': int,
'like_count': int,
'duration': 54.629999999999995,
'title': 'sweet black',
'upload_date': '20231210',
'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/songs/fa082beb-b856-4730-9170-a57e4e32cc2c/',
'genres': ['Lofi'],
'uploader': 'ender milze',
'comment_count': int,
'media_type': 'revision',
},
}, {
# Same track as above but post URL
'url': 'https://www.bandlab.com/post/07d7f906-fc96-ee11-b75e-000d3a428fff',
'md5': '46f7b43367dd268bbcf0bbe466753b2c',
'info_dict': {
'id': '02d7f906-fc96-ee11-b75e-000d3a428fff',
'ext': 'm4a',
'uploader_id': 'ender_milze',
'track': 'sweet black',
'description': 'composed by juanjn3737',
'timestamp': 1702171973,
'view_count': int,
'like_count': int,
'duration': 54.629999999999995,
'title': 'sweet black',
'upload_date': '20231210',
'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/songs/fa082beb-b856-4730-9170-a57e4e32cc2c/',
'genres': ['Lofi'],
'uploader': 'ender milze',
'comment_count': int,
'media_type': 'revision',
},
}, {
# SharedKey Example
'url': 'https://www.bandlab.com/track/048916c2-c6da-ee11-85f9-6045bd2e11f9?sharedKey=0NNWX8qYAEmI38lWAzCNDA',
'md5': '15174b57c44440e2a2008be9cae00250',
'info_dict': {
'id': '038916c2-c6da-ee11-85f9-6045bd2e11f9',
'ext': 'm4a',
'comment_count': int,
'genres': ['Other'],
'uploader_id': 'user8353034818103753',
'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/songs/51b18363-da23-4b9b-a29c-2933a3e561ca/',
'timestamp': 1709625771,
'track': 'PodcastMaerchen4b',
'duration': 468.14,
'view_count': int,
'description': 'Podcast: Neues aus der Märchenwelt',
'like_count': int,
'upload_date': '20240305',
'uploader': 'Erna Wageneder',
'title': 'PodcastMaerchen4b',
'media_type': 'revision',
},
}, {
# Different Revision selected
'url': 'https://www.bandlab.com/track/130343fc-148b-ea11-96d2-0003ffd1fc09?revId=110343fc-148b-ea11-96d2-0003ffd1fc09',
'md5': '74e055ef9325d63f37088772fbfe4454',
'info_dict': {
'id': '110343fc-148b-ea11-96d2-0003ffd1fc09',
'ext': 'm4a',
'timestamp': 1588273294,
'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/users/b612e533-e4f7-4542-9f50-3fcfd8dd822c/',
'description': 'Final Revision.',
'title': 'Replay ( Instrumental)',
'uploader': 'David R Sparks',
'uploader_id': 'davesnothome69',
'view_count': int,
'comment_count': int,
'track': 'Replay ( Instrumental)',
'genres': ['Rock'],
'upload_date': '20200430',
'like_count': int,
'duration': 279.43,
'media_type': 'revision',
},
}, {
# Video
'url': 'https://www.bandlab.com/post/5cdf9036-3857-ef11-991a-6045bd36e0d9',
'md5': '8caa2ef28e86c1dacf167293cfdbeba9',
'info_dict': {
'id': '5cdf9036-3857-ef11-991a-6045bd36e0d9',
'ext': 'mp4',
'duration': 44.705,
'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/videos/67c6cef1-cef6-40d3-831e-a55bc1dcb972/',
'comment_count': int,
'title': 'backing vocals',
'uploader_id': 'marliashya',
'uploader': 'auraa',
'like_count': int,
'description': 'backing vocals',
'media_type': 'video',
},
}, {
# Embed Example
'url': 'https://www.bandlab.com/embed/?blur=false&id=014de0a4-7d82-ea11-a94c-0003ffd19c0f',
'md5': 'a4ad05cb68c54faaed9b0a8453a8cf4a',
'info_dict': {
'id': '014de0a4-7d82-ea11-a94c-0003ffd19c0f',
'ext': 'm4a',
'comment_count': int,
'genres': ['Electronic'],
'uploader': 'Charlie Henson',
'timestamp': 1587328674,
'upload_date': '20200419',
'view_count': int,
'track': 'Positronic Meltdown',
'duration': 318.55,
'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/songs/87165bc3-5439-496e-b1f7-a9f13b541ff2/',
'description': 'Checkout my tracks at AOMX http://aomxsounds.com/',
'uploader_id': 'microfreaks',
'title': 'Positronic Meltdown',
'like_count': int,
'media_type': 'revision',
},
}, {
# Track without revisions available
'url': 'https://www.bandlab.com/track/55767ac51789ea11a94c0003ffd1fc09_2f007b0a37b94ec7a69bc25ae15108a5',
'md5': 'f05d68a3769952c2d9257c473e14c15f',
'info_dict': {
'id': '55767ac51789ea11a94c0003ffd1fc09_2f007b0a37b94ec7a69bc25ae15108a5',
'ext': 'm4a',
'track': 'insame',
'like_count': int,
'duration': 84.03,
'title': 'insame',
'view_count': int,
'comment_count': int,
'uploader': 'Sorakime',
'uploader_id': 'sorakime',
'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/users/572a351a-0f3a-4c6a-ac39-1a5defdeeb1c/',
'timestamp': 1691162128,
'upload_date': '20230804',
'media_type': 'track',
},
}, {
'url': 'https://www.bandlab.com/revision/014de0a4-7d82-ea11-a94c-0003ffd19c0f',
'only_matching': True,
}]
_WEBPAGE_TESTS = [{
'url': 'https://phantomluigi.github.io/',
'info_dict': {
'id': 'e14223c3-7871-ef11-bdfd-000d3a980db3',
'ext': 'm4a',
'view_count': int,
'upload_date': '20240913',
'uploader_id': 'phantommusicofficial',
'timestamp': 1726194897,
'uploader': 'Phantom',
'comment_count': int,
'genres': ['Progresive Rock'],
'description': 'md5:a38cd668f7a2843295ef284114f18429',
'duration': 225.23,
'like_count': int,
'title': 'Vermilion Pt. 2 (Cover)',
'track': 'Vermilion Pt. 2 (Cover)',
'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/songs/62b10750-7aef-4f42-ad08-1af52f577e97/',
'media_type': 'revision',
},
}]
def _real_extract(self, url):
display_id, url_type = self._match_valid_url(url).group('id', 'url_type')
qs = parse_qs(url)
revision_id = traverse_obj(qs, (('revId', 'id'), 0, any))
if url_type == 'revision':
revision_id = display_id
revision_data = None
if not revision_id:
post_data = self._call_api(
'posts', display_id, note='Downloading post data',
query=traverse_obj(qs, {'sharedKey': ('sharedKey', 0)}))
revision_id = traverse_obj(post_data, (('revisionId', ('revision', 'id')), {str}, any))
revision_data = traverse_obj(post_data, ('revision', {dict}))
if not revision_data and not revision_id:
post_type = post_data.get('type')
if post_type == 'Video':
return self._parse_video(post_data, url=url)
if post_type == 'Track':
return self._parse_track(post_data, url=url)
raise ExtractorError(f'Could not extract data for post type {post_type!r}')
if not revision_data:
revision_data = self._call_api(
'revisions', revision_id, note='Downloading revision data', query={'edit': 'false'})
return self._parse_revision(revision_data, url=url)
class BandlabPlaylistIE(BandlabBaseIE):
_VALID_URL = [
r'https?://(?:www\.)?bandlab.com/(?:[\w]+/)?(?P<type>albums|collections)/(?P<id>[\da-f-]+)',
r'https?://(?:www\.)?bandlab.com/(?P<type>embed)/collection/\?(?:[^#]*&)?id=(?P<id>[\da-f-]+)',
]
_EMBED_REGEX = [rf'<iframe[^>]+src=[\'"](?P<url>{_VALID_URL[1]})[\'"]']
_TESTS = [{
'url': 'https://www.bandlab.com/davesnothome69/albums/89b79ea6-de42-ed11-b495-00224845aac7',
'info_dict': {
'thumbnail': 'https://bl-prod-images.azureedge.net/v1.3/albums/69507ff3-579a-45be-afca-9e87eddec944/',
'release_date': '20221003',
'title': 'Remnants',
'album': 'Remnants',
'like_count': int,
'album_type': 'LP',
'description': 'A collection of some feel good, rock hits.',
'comment_count': int,
'view_count': int,
'id': '89b79ea6-de42-ed11-b495-00224845aac7',
'uploader': 'David R Sparks',
'uploader_id': 'davesnothome69',
},
'playlist_count': 10,
}, {
'url': 'https://www.bandlab.com/slytheband/collections/955102d4-1040-ef11-86c3-000d3a42581b',
'info_dict': {
'id': '955102d4-1040-ef11-86c3-000d3a42581b',
'timestamp': 1720762659,
'view_count': int,
'title': 'My Shit 🖤',
'uploader_id': 'slytheband',
'uploader': '𝓢𝓛𝓨',
'upload_date': '20240712',
'like_count': int,
'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/collections/2c64ca12-b180-4b76-8587-7a8da76bddc8/',
},
'playlist_count': 15,
}, {
# Embeds can contain both albums and collections with the same URL pattern. This is an album
'url': 'https://www.bandlab.com/embed/collection/?id=12cc6f7f-951b-ee11-907c-00224844f303',
'info_dict': {
'id': '12cc6f7f-951b-ee11-907c-00224844f303',
'release_date': '20230706',
'description': 'This is a collection of songs I created when I had an Amiga computer.',
'view_count': int,
'title': 'Mark Salud The Amiga Collection',
'uploader_id': 'mssirmooth1962',
'comment_count': int,
'thumbnail': 'https://bl-prod-images.azureedge.net/v1.3/albums/d618bd7b-0537-40d5-bdd8-61b066e77d59/',
'like_count': int,
'uploader': 'Mark Salud',
'album': 'Mark Salud The Amiga Collection',
'album_type': 'LP',
},
'playlist_count': 24,
}, {
# Tracks without revision id
'url': 'https://www.bandlab.com/embed/collection/?id=e98aafb5-d932-ee11-b8f0-00224844c719',
'info_dict': {
'like_count': int,
'uploader_id': 'sorakime',
'comment_count': int,
'uploader': 'Sorakime',
'view_count': int,
'description': 'md5:4ec31c568a5f5a5a2b17572ea64c3825',
'release_date': '20230812',
'title': 'Art',
'album': 'Art',
'album_type': 'Album',
'id': 'e98aafb5-d932-ee11-b8f0-00224844c719',
'thumbnail': 'https://bl-prod-images.azureedge.net/v1.3/albums/20c890de-e94a-4422-828a-2da6377a13c8/',
},
'playlist_count': 13,
}, {
'url': 'https://www.bandlab.com/albums/89b79ea6-de42-ed11-b495-00224845aac7',
'only_matching': True,
}]
def _entries(self, album_data):
for post in traverse_obj(album_data, ('posts', lambda _, v: v['type'])):
post_type = post['type']
if post_type == 'Revision':
yield self._parse_revision(post.get('revision'))
elif post_type == 'Track':
yield self._parse_track(post)
elif post_type == 'Video':
yield self._parse_video(post)
else:
self.report_warning(f'Skipping unknown post type: "{post_type}"')
def _real_extract(self, url):
playlist_id, playlist_type = self._match_valid_url(url).group('id', 'type')
endpoints = {
'albums': ['albums'],
'collections': ['collections'],
'embed': ['collections', 'albums'],
}.get(playlist_type)
for endpoint in endpoints:
playlist_data = self._call_api(
endpoint, playlist_id, note=f'Downloading {endpoint[:-1]} data',
fatal=False, expected_status=404)
if not playlist_data.get('errorCode'):
playlist_type = endpoint
break
if error_code := playlist_data.get('errorCode'):
raise ExtractorError(f'Could not find playlist data. Error code: "{error_code}"')
return self.playlist_result(
self._entries(playlist_data), playlist_id,
**traverse_obj(playlist_data, {
'title': ('name', {str}),
'description': ('description', {str}),
'uploader': ('creator', 'name', {str}),
'uploader_id': ('creator', 'username', {str}),
'timestamp': ('createdOn', {parse_iso8601}),
'release_date': ('releaseDate', {lambda x: x.replace('-', '')}, filter),
'thumbnail': ('picture', ('original', 'url'), {url_or_none}, any),
'like_count': ('counters', 'likes', {int_or_none}),
'comment_count': ('counters', 'comments', {int_or_none}),
'view_count': ('counters', 'plays', {int_or_none}),
}),
**(traverse_obj(playlist_data, {
'album': ('name', {str}),
'album_type': ('type', {str}),
}) if playlist_type == 'albums' else {}))

View File

@@ -1284,9 +1284,9 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
**traverse_obj(model, {
'title': ('title', {str}),
'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}),
'description': ('synopses', ('long', 'medium', 'short'), {str}, {lambda x: x or None}, any),
'description': ('synopses', ('long', 'medium', 'short'), {str}, filter, any),
'duration': ('versions', 0, 'duration', {int}),
'timestamp': ('versions', 0, 'availableFrom', {functools.partial(int_or_none, scale=1000)}),
'timestamp': ('versions', 0, 'availableFrom', {int_or_none(scale=1000)}),
}),
}
@@ -1386,7 +1386,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
formats = traverse_obj(media_data, ('playlist', lambda _, v: url_or_none(v['url']), {
'url': ('url', {url_or_none}),
'ext': ('format', {str}),
'tbr': ('bitrate', {functools.partial(int_or_none, scale=1000)}),
'tbr': ('bitrate', {int_or_none(scale=1000)}),
}))
if formats:
entry = {
@@ -1398,7 +1398,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'title': ('title', {str}),
'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}),
'description': ('synopses', ('long', 'medium', 'short'), {str}, any),
'timestamp': ('firstPublished', {functools.partial(int_or_none, scale=1000)}),
'timestamp': ('firstPublished', {int_or_none(scale=1000)}),
}),
}
done = True
@@ -1428,7 +1428,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
if not entry.get('timestamp'):
entry['timestamp'] = traverse_obj(next_data, (
..., 'contents', is_type('timestamp'), 'model',
'timestamp', {functools.partial(int_or_none, scale=1000)}, any))
'timestamp', {int_or_none(scale=1000)}, any))
entries.append(entry)
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)

View File

@@ -1,18 +1,33 @@
import re
from .common import InfoExtractor
from ..utils import extract_attributes
from ..utils import ExtractorError, extract_attributes
class BFMTVBaseIE(InfoExtractor):
_VALID_URL_BASE = r'https?://(?:www\.|rmc\.)?bfmtv\.com/'
_VALID_URL_TMPL = _VALID_URL_BASE + r'(?:[^/]+/)*[^/?&#]+_%s[A-Z]-(?P<id>\d{12})\.html'
_VIDEO_BLOCK_REGEX = r'(<div[^>]+class="video_block[^"]*"[^>]*>)'
_VIDEO_BLOCK_REGEX = r'(<div[^>]+class="video_block[^"]*"[^>]*>.*?</div>)'
_VIDEO_ELEMENT_REGEX = r'(<video-js[^>]+>)'
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'
def _brightcove_url_result(self, video_id, video_block):
account_id = video_block.get('accountid') or '876450612001'
player_id = video_block.get('playerid') or 'I2qBTln4u'
def _extract_video(self, video_block):
video_element = self._search_regex(
self._VIDEO_ELEMENT_REGEX, video_block, 'video element', default=None)
if video_element:
video_element_attrs = extract_attributes(video_element)
video_id = video_element_attrs.get('data-video-id')
if not video_id:
return
account_id = video_element_attrs.get('data-account') or '876450610001'
player_id = video_element_attrs.get('adjustplayer') or '19dszYXgm'
else:
video_block_attrs = extract_attributes(video_block)
video_id = video_block_attrs.get('videoid')
if not video_id:
return
account_id = video_block_attrs.get('accountid') or '876630703001'
player_id = video_block_attrs.get('playerid') or 'KbPwEbuHx'
return self.url_result(
self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id),
'BrightcoveNew', video_id)
@@ -40,23 +55,25 @@ class BFMTVIE(BFMTVBaseIE):
def _real_extract(self, url):
bfmtv_id = self._match_id(url)
webpage = self._download_webpage(url, bfmtv_id)
video_block = extract_attributes(self._search_regex(
video = self._extract_video(self._search_regex(
self._VIDEO_BLOCK_REGEX, webpage, 'video block'))
return self._brightcove_url_result(video_block['videoid'], video_block)
if not video:
raise ExtractorError('Failed to extract video')
return video
class BFMTVLiveIE(BFMTVIE): # XXX: Do not subclass from concrete IE
class BFMTVLiveIE(BFMTVBaseIE):
IE_NAME = 'bfmtv:live'
_VALID_URL = BFMTVBaseIE._VALID_URL_BASE + '(?P<id>(?:[^/]+/)?en-direct)'
_TESTS = [{
'url': 'https://www.bfmtv.com/en-direct/',
'info_dict': {
'id': '5615950982001',
'id': '6346069778112',
'ext': 'mp4',
'title': r're:^le direct BFMTV WEB \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
'title': r're:^Le Live BFM TV \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
'uploader_id': '876450610001',
'upload_date': '20220926',
'timestamp': 1664207191,
'upload_date': '20240202',
'timestamp': 1706887572,
'live_status': 'is_live',
'thumbnail': r're:https://.+/image\.jpg',
'tags': [],
@@ -69,6 +86,15 @@ class BFMTVLiveIE(BFMTVIE): # XXX: Do not subclass from concrete IE
'only_matching': True,
}]
def _real_extract(self, url):
bfmtv_id = self._match_id(url)
webpage = self._download_webpage(url, bfmtv_id)
video = self._extract_video(self._search_regex(
self._VIDEO_BLOCK_REGEX, webpage, 'video block'))
if not video:
raise ExtractorError('Failed to extract video')
return video
class BFMTVArticleIE(BFMTVBaseIE):
IE_NAME = 'bfmtv:article'
@@ -102,18 +128,16 @@ class BFMTVArticleIE(BFMTVBaseIE):
},
}]
def _entries(self, webpage):
for video_block_el in re.findall(self._VIDEO_BLOCK_REGEX, webpage):
video = self._extract_video(video_block_el)
if video:
yield video
def _real_extract(self, url):
bfmtv_id = self._match_id(url)
webpage = self._download_webpage(url, bfmtv_id)
entries = []
for video_block_el in re.findall(self._VIDEO_BLOCK_REGEX, webpage):
video_block = extract_attributes(video_block_el)
video_id = video_block.get('videoid')
if not video_id:
continue
entries.append(self._brightcove_url_result(video_id, video_block))
return self.playlist_result(
entries, bfmtv_id, self._og_search_title(webpage, fatal=False),
self._entries(webpage), bfmtv_id, self._og_search_title(webpage, fatal=False),
self._html_search_meta(['og:description', 'description'], webpage))

View File

@@ -1,4 +1,3 @@
import functools
from .common import InfoExtractor
from ..utils import (
@@ -50,7 +49,7 @@ class BibelTVBaseIE(InfoExtractor):
**traverse_obj(data, {
'title': 'title',
'description': 'description',
'duration': ('duration', {functools.partial(int_or_none, scale=1000)}),
'duration': ('duration', {int_or_none(scale=1000)}),
'timestamp': ('schedulingStart', {parse_iso8601}),
'season_number': 'seasonNumber',
'episode_number': 'episodeNumber',

View File

@@ -18,7 +18,6 @@ from ..utils import (
InAdvancePagedList,
OnDemandPagedList,
bool_or_none,
clean_html,
determine_ext,
filter_dict,
float_or_none,
@@ -63,7 +62,7 @@ class BilibiliBaseIE(InfoExtractor):
'support_formats', lambda _, v: v['quality'] not in parsed_qualities))], delim=', ')
if missing_formats:
self.to_screen(
f'Format(s) {missing_formats} are missing; you have to login or '
f'Format(s) {missing_formats} are missing; you have to '
f'become a premium member to download them. {self._login_hint()}')
def extract_formats(self, play_info):
@@ -109,7 +108,7 @@ class BilibiliBaseIE(InfoExtractor):
fragments = traverse_obj(play_info, ('durl', lambda _, v: url_or_none(v['url']), {
'url': ('url', {url_or_none}),
'duration': ('length', {functools.partial(float_or_none, scale=1000)}),
'duration': ('length', {float_or_none(scale=1000)}),
'filesize': ('size', {int_or_none}),
}))
if fragments:
@@ -124,7 +123,7 @@ class BilibiliBaseIE(InfoExtractor):
'quality': ('quality', {int_or_none}),
'format_id': ('quality', {str_or_none}),
'format_note': ('quality', {lambda x: format_names.get(x)}),
'duration': ('timelength', {functools.partial(float_or_none, scale=1000)}),
'duration': ('timelength', {float_or_none(scale=1000)}),
}),
**parse_resolution(format_names.get(play_info.get('quality'))),
})
@@ -165,14 +164,18 @@ class BilibiliBaseIE(InfoExtractor):
params['w_rid'] = hashlib.md5(f'{query}{self._get_wbi_key(video_id)}'.encode()).hexdigest()
return params
def _download_playinfo(self, bvid, cid, headers=None, qn=None):
params = {'bvid': bvid, 'cid': cid, 'fnval': 4048}
if qn:
params['qn'] = qn
def _download_playinfo(self, bvid, cid, headers=None, query=None):
params = {'bvid': bvid, 'cid': cid, 'fnval': 4048, **(query or {})}
if self.is_logged_in:
params.pop('try_look', None)
if qn := params.get('qn'):
note = f'Downloading video format {qn} for cid {cid}'
else:
note = f'Downloading video formats for cid {cid}'
return self._download_json(
'https://api.bilibili.com/x/player/wbi/playurl', bvid,
query=self._sign_wbi(params, bvid), headers=headers,
note=f'Downloading video formats for cid {cid} {qn or ""}')['data']
query=self._sign_wbi(params, bvid), headers=headers, note=note)['data']
def json2srt(self, json_data):
srt_data = ''
@@ -191,7 +194,7 @@ class BilibiliBaseIE(InfoExtractor):
}
video_info = self._download_json(
'https://api.bilibili.com/x/player/v2', video_id,
'https://api.bilibili.com/x/player/wbi/v2', video_id,
query={'aid': aid, 'cid': cid} if aid else {'bvid': video_id, 'cid': cid},
note=f'Extracting subtitle info {cid}', headers=self._HEADERS)
if traverse_obj(video_info, ('data', 'need_login_subtitle')):
@@ -207,7 +210,7 @@ class BilibiliBaseIE(InfoExtractor):
def _get_chapters(self, aid, cid):
chapters = aid and cid and self._download_json(
'https://api.bilibili.com/x/player/v2', aid, query={'aid': aid, 'cid': cid},
'https://api.bilibili.com/x/player/wbi/v2', aid, query={'aid': aid, 'cid': cid},
note='Extracting chapters', fatal=False, headers=self._HEADERS)
return traverse_obj(chapters, ('data', 'view_points', ..., {
'title': 'content',
@@ -286,7 +289,7 @@ class BilibiliBaseIE(InfoExtractor):
('data', 'interaction', 'graph_version', {int_or_none}))
cid_edges = self._get_divisions(video_id, graph_version, {1: {'cid': cid}}, 1)
for cid, edges in cid_edges.items():
play_info = self._download_playinfo(video_id, cid, headers=headers)
play_info = self._download_playinfo(video_id, cid, headers=headers, query={'try_look': 1})
yield {
**metainfo,
'id': f'{video_id}_{cid}',
@@ -639,40 +642,29 @@ class BiliBiliIE(BilibiliBaseIE):
headers['Referer'] = url
initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id)
if traverse_obj(initial_state, ('error', 'trueCode')) == -403:
self.raise_login_required()
if traverse_obj(initial_state, ('error', 'trueCode')) == -404:
raise ExtractorError(
'This video may be deleted or geo-restricted. '
'You might want to try a VPN or a proxy server (with --proxy)', expected=True)
is_festival = 'videoData' not in initial_state
if is_festival:
video_data = initial_state['videoInfo']
else:
play_info_obj = self._search_json(
r'window\.__playinfo__\s*=', webpage, 'play info', video_id, fatal=False)
if not play_info_obj:
if traverse_obj(initial_state, ('error', 'trueCode')) == -403:
self.raise_login_required()
if traverse_obj(initial_state, ('error', 'trueCode')) == -404:
raise ExtractorError(
'This video may be deleted or geo-restricted. '
'You might want to try a VPN or a proxy server (with --proxy)', expected=True)
play_info = traverse_obj(play_info_obj, ('data', {dict}))
if not play_info:
if traverse_obj(play_info_obj, 'code') == 87007:
toast = get_element_by_class('tips-toast', webpage) or ''
msg = clean_html(
f'{get_element_by_class("belongs-to", toast) or ""}'
+ (get_element_by_class('level', toast) or ''))
raise ExtractorError(
f'This is a supporter-only video: {msg}. {self._login_hint()}', expected=True)
raise ExtractorError('Failed to extract play info')
video_data = initial_state['videoData']
video_id, title = video_data['bvid'], video_data.get('title')
# Bilibili anthologies are similar to playlists but all videos share the same video ID as the anthology itself.
page_list_json = not is_festival and traverse_obj(
page_list_json = (not is_festival and traverse_obj(
self._download_json(
'https://api.bilibili.com/x/player/pagelist', video_id,
fatal=False, query={'bvid': video_id, 'jsonp': 'jsonp'},
note='Extracting videos in anthology', headers=headers),
'data', expected_type=list) or []
'data', expected_type=list)) or []
is_anthology = len(page_list_json) > 1
part_id = int_or_none(parse_qs(url).get('p', [None])[-1])
@@ -691,8 +683,6 @@ class BiliBiliIE(BilibiliBaseIE):
festival_info = {}
if is_festival:
play_info = self._download_playinfo(video_id, cid, headers=headers)
festival_info = traverse_obj(initial_state, {
'uploader': ('videoInfo', 'upName'),
'uploader_id': ('videoInfo', 'upMid', {str_or_none}),
@@ -727,62 +717,79 @@ class BiliBiliIE(BilibiliBaseIE):
self._get_interactive_entries(video_id, cid, metainfo, headers=headers), **metainfo,
duration=traverse_obj(initial_state, ('videoData', 'duration', {int_or_none})),
__post_extractor=self.extract_comments(aid))
else:
formats = self.extract_formats(play_info)
if not traverse_obj(play_info, ('dash')):
# we only have legacy formats and need additional work
has_qn = lambda x: x in traverse_obj(formats, (..., 'quality'))
for qn in traverse_obj(play_info, ('accept_quality', lambda _, v: not has_qn(v), {int})):
formats.extend(traverse_obj(
self.extract_formats(self._download_playinfo(video_id, cid, headers=headers, qn=qn)),
lambda _, v: not has_qn(v['quality'])))
self._check_missing_formats(play_info, formats)
flv_formats = traverse_obj(formats, lambda _, v: v['fragments'])
if flv_formats and len(flv_formats) < len(formats):
# Flv and mp4 are incompatible due to `multi_video` workaround, so drop one
if not self._configuration_arg('prefer_multi_flv'):
dropped_fmts = ', '.join(
f'{f.get("format_note")} ({f.get("format_id")})' for f in flv_formats)
formats = traverse_obj(formats, lambda _, v: not v.get('fragments'))
if dropped_fmts:
self.to_screen(
f'Dropping incompatible flv format(s) {dropped_fmts} since mp4 is available. '
'To extract flv, pass --extractor-args "bilibili:prefer_multi_flv"')
else:
formats = traverse_obj(
# XXX: Filtering by extractor-arg is for testing purposes
formats, lambda _, v: v['quality'] == int(self._configuration_arg('prefer_multi_flv')[0]),
) or [max(flv_formats, key=lambda x: x['quality'])]
play_info = None
if self.is_logged_in:
play_info = traverse_obj(
self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id, default=None),
('data', {dict}))
if not play_info:
play_info = self._download_playinfo(video_id, cid, headers=headers, query={'try_look': 1})
formats = self.extract_formats(play_info)
if traverse_obj(formats, (0, 'fragments')):
# We have flv formats, which are individual short videos with their own timestamps and metainfo
# Binary concatenation corrupts their timestamps, so we need a `multi_video` workaround
return {
**metainfo,
'_type': 'multi_video',
'entries': [{
'id': f'{metainfo["id"]}_{idx}',
'title': metainfo['title'],
'http_headers': metainfo['http_headers'],
'formats': [{
**fragment,
'format_id': formats[0].get('format_id'),
}],
'subtitles': self.extract_subtitles(video_id, cid) if idx == 0 else None,
'__post_extractor': self.extract_comments(aid) if idx == 0 else None,
} for idx, fragment in enumerate(formats[0]['fragments'])],
'duration': float_or_none(play_info.get('timelength'), scale=1000),
}
else:
return {
**metainfo,
'formats': formats,
'duration': float_or_none(play_info.get('timelength'), scale=1000),
'chapters': self._get_chapters(aid, cid),
'subtitles': self.extract_subtitles(video_id, cid),
'__post_extractor': self.extract_comments(aid),
}
if video_data.get('is_upower_exclusive'):
high_level = traverse_obj(initial_state, ('elecFullInfo', 'show_info', 'high_level', {dict})) or {}
msg = f'{join_nonempty("title", "sub_title", from_dict=high_level, delim="")}. {self._login_hint()}'
if not formats:
raise ExtractorError(f'This is a supporter-only video: {msg}', expected=True)
if '试看' in traverse_obj(play_info, ('accept_description', ..., {str})):
self.report_warning(
f'This is a supporter-only video, only the preview will be extracted: {msg}',
video_id=video_id)
if not traverse_obj(play_info, 'dash'):
# we only have legacy formats and need additional work
has_qn = lambda x: x in traverse_obj(formats, (..., 'quality'))
for qn in traverse_obj(play_info, ('accept_quality', lambda _, v: not has_qn(v), {int})):
formats.extend(traverse_obj(
self.extract_formats(self._download_playinfo(video_id, cid, headers=headers, query={'qn': qn})),
lambda _, v: not has_qn(v['quality'])))
self._check_missing_formats(play_info, formats)
flv_formats = traverse_obj(formats, lambda _, v: v['fragments'])
if flv_formats and len(flv_formats) < len(formats):
# Flv and mp4 are incompatible due to `multi_video` workaround, so drop one
if not self._configuration_arg('prefer_multi_flv'):
dropped_fmts = ', '.join(
f'{f.get("format_note")} ({f.get("format_id")})' for f in flv_formats)
formats = traverse_obj(formats, lambda _, v: not v.get('fragments'))
if dropped_fmts:
self.to_screen(
f'Dropping incompatible flv format(s) {dropped_fmts} since mp4 is available. '
'To extract flv, pass --extractor-args "bilibili:prefer_multi_flv"')
else:
formats = traverse_obj(
# XXX: Filtering by extractor-arg is for testing purposes
formats, lambda _, v: v['quality'] == int(self._configuration_arg('prefer_multi_flv')[0]),
) or [max(flv_formats, key=lambda x: x['quality'])]
if traverse_obj(formats, (0, 'fragments')):
# We have flv formats, which are individual short videos with their own timestamps and metainfo
# Binary concatenation corrupts their timestamps, so we need a `multi_video` workaround
return {
**metainfo,
'_type': 'multi_video',
'entries': [{
'id': f'{metainfo["id"]}_{idx}',
'title': metainfo['title'],
'http_headers': metainfo['http_headers'],
'formats': [{
**fragment,
'format_id': formats[0].get('format_id'),
}],
'subtitles': self.extract_subtitles(video_id, cid) if idx == 0 else None,
'__post_extractor': self.extract_comments(aid) if idx == 0 else None,
} for idx, fragment in enumerate(formats[0]['fragments'])],
'duration': float_or_none(play_info.get('timelength'), scale=1000),
}
return {
**metainfo,
'formats': formats,
'duration': float_or_none(play_info.get('timelength'), scale=1000),
'chapters': self._get_chapters(aid, cid),
'subtitles': self.extract_subtitles(video_id, cid),
'__post_extractor': self.extract_comments(aid),
}
class BiliBiliBangumiIE(BilibiliBaseIE):
@@ -860,10 +867,16 @@ class BiliBiliBangumiIE(BilibiliBaseIE):
self.raise_login_required('This video is for premium members only')
headers['Referer'] = url
play_info = self._download_json(
'https://api.bilibili.com/pgc/player/web/v2/playurl', episode_id,
'Extracting episode', query={'fnval': '4048', 'ep_id': episode_id},
headers=headers)
play_info = (
self._search_json(
r'playurlSSRData\s*=', webpage, 'embedded page info', episode_id,
end_pattern='\n', default=None)
or self._download_json(
'https://api.bilibili.com/pgc/player/web/v2/playurl', episode_id,
'Extracting episode', query={'fnval': 12240, 'ep_id': episode_id},
headers=headers))
premium_only = play_info.get('code') == -10403
play_info = traverse_obj(play_info, ('result', 'video_info', {dict})) or {}
@@ -1585,7 +1598,7 @@ class BilibiliPlaylistIE(BilibiliSpaceListBaseIE):
'title': ('title', {str}),
'uploader': ('upper', 'name', {str}),
'uploader_id': ('upper', 'mid', {str_or_none}),
'timestamp': ('ctime', {int_or_none}, {lambda x: x or None}),
'timestamp': ('ctime', {int_or_none}, filter),
'thumbnail': ('cover', {url_or_none}),
})),
}

388
yt_dlp/extractor/bluesky.py Normal file
View File

@@ -0,0 +1,388 @@
from .common import InfoExtractor
from ..utils import (
ExtractorError,
format_field,
int_or_none,
mimetype2ext,
orderedSet,
parse_iso8601,
truncate_string,
update_url_query,
url_basename,
url_or_none,
variadic,
)
from ..utils.traversal import traverse_obj
class BlueskyIE(InfoExtractor):
_VALID_URL = [
r'https?://(?:www\.)?(?:bsky\.app|main\.bsky\.dev)/profile/(?P<handle>[\w.:%-]+)/post/(?P<id>\w+)',
r'at://(?P<handle>[\w.:%-]+)/app\.bsky\.feed\.post/(?P<id>\w+)',
]
_TESTS = [{
'url': 'https://bsky.app/profile/blu3blue.bsky.social/post/3l4omssdl632g',
'md5': '375539c1930ab05d15585ed772ab54fd',
'info_dict': {
'id': '3l4omssdl632g',
'ext': 'mp4',
'uploader': 'Blu3Blu3Lilith',
'uploader_id': 'blu3blue.bsky.social',
'uploader_url': 'https://bsky.app/profile/blu3blue.bsky.social',
'channel_id': 'did:plc:pzdr5ylumf7vmvwasrpr5bf2',
'channel_url': 'https://bsky.app/profile/did:plc:pzdr5ylumf7vmvwasrpr5bf2',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'title': 'OMG WE HAVE VIDEOS NOW',
'description': 'OMG WE HAVE VIDEOS NOW',
'upload_date': '20240921',
'timestamp': 1726940605,
'like_count': int,
'repost_count': int,
'comment_count': int,
'tags': [],
},
}, {
'url': 'https://bsky.app/profile/bsky.app/post/3l3vgf77uco2g',
'md5': 'b9e344fdbce9f2852c668a97efefb105',
'info_dict': {
'id': '3l3vgf77uco2g',
'ext': 'mp4',
'uploader': 'Bluesky',
'uploader_id': 'bsky.app',
'uploader_url': 'https://bsky.app/profile/bsky.app',
'channel_id': 'did:plc:z72i7hdynmk6r22z27h6tvur',
'channel_url': 'https://bsky.app/profile/did:plc:z72i7hdynmk6r22z27h6tvur',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'title': 'Bluesky now has video! Update your app to versi...',
'alt_title': 'Bluesky video feature announcement',
'description': r're:(?s)Bluesky now has video! .{239}',
'upload_date': '20240911',
'timestamp': 1726074716,
'like_count': int,
'repost_count': int,
'comment_count': int,
'tags': [],
'subtitles': {
'en': 'mincount:1',
},
},
}, {
'url': 'https://main.bsky.dev/profile/souris.moe/post/3l4qhp7bcs52c',
'md5': '5f2df8c200b5633eb7fb2c984d29772f',
'info_dict': {
'id': '3l4qhp7bcs52c',
'ext': 'mp4',
'uploader': 'souris',
'uploader_id': 'souris.moe',
'uploader_url': 'https://bsky.app/profile/souris.moe',
'channel_id': 'did:plc:tj7g244gl5v6ai6cm4f4wlqp',
'channel_url': 'https://bsky.app/profile/did:plc:tj7g244gl5v6ai6cm4f4wlqp',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'title': 'Bluesky video #3l4qhp7bcs52c',
'upload_date': '20240922',
'timestamp': 1727003838,
'like_count': int,
'repost_count': int,
'comment_count': int,
'tags': [],
},
}, {
'url': 'https://bsky.app/profile/de1.pds.tentacle.expert/post/3l3w4tnezek2e',
'md5': '1af9c7fda061cf7593bbffca89e43d1c',
'info_dict': {
'id': '3l3w4tnezek2e',
'ext': 'mp4',
'uploader': 'clean',
'uploader_id': 'de1.pds.tentacle.expert',
'uploader_url': 'https://bsky.app/profile/de1.pds.tentacle.expert',
'channel_id': 'did:web:de1.tentacle.expert',
'channel_url': 'https://bsky.app/profile/did:web:de1.tentacle.expert',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'title': 'Bluesky video #3l3w4tnezek2e',
'upload_date': '20240911',
'timestamp': 1726098823,
'like_count': int,
'repost_count': int,
'comment_count': int,
'tags': [],
},
}, {
'url': 'https://bsky.app/profile/yunayuispink.bsky.social/post/3l7gqcfes742o',
'info_dict': {
'id': 'XxK3t_5V3ao',
'ext': 'mp4',
'uploader': 'yunayu',
'uploader_id': '@yunayuispink',
'uploader_url': 'https://www.youtube.com/@yunayuispink',
'channel': 'yunayu',
'channel_id': 'UCPLvXnHa7lTyNoR_dGsU14w',
'channel_url': 'https://www.youtube.com/channel/UCPLvXnHa7lTyNoR_dGsU14w',
'thumbnail': 'https://i.ytimg.com/vi_webp/XxK3t_5V3ao/maxresdefault.webp',
'description': r're:Have a good goodx10000day',
'title': '5min vs 5hours drawing',
'availability': 'public',
'live_status': 'not_live',
'playable_in_embed': True,
'upload_date': '20241026',
'timestamp': 1729967784,
'duration': 321,
'age_limit': 0,
'like_count': int,
'view_count': int,
'comment_count': int,
'channel_follower_count': int,
'categories': ['Entertainment'],
'tags': [],
},
'add_ie': ['Youtube'],
}, {
'url': 'https://bsky.app/profile/endshark.bsky.social/post/3jzxjkcemae2m',
'info_dict': {
'id': '222792849',
'ext': 'mp3',
'uploader': 'LASERBAT',
'uploader_id': 'laserbatx',
'uploader_url': 'https://laserbatx.bandcamp.com',
'artists': ['LASERBAT'],
'album_artists': ['LASERBAT'],
'album': 'Hari Nezumi [EP]',
'track': 'Forward to the End',
'title': 'LASERBAT - Forward to the End',
'thumbnail': 'https://f4.bcbits.com/img/a2507705510_5.jpg',
'duration': 228.571,
'track_id': '222792849',
'release_date': '20230423',
'upload_date': '20230423',
'timestamp': 1682276040.0,
'release_timestamp': 1682276040.0,
'track_number': 1,
},
'add_ie': ['Bandcamp'],
}, {
'url': 'https://bsky.app/profile/dannybhoix.bsky.social/post/3l6oe5mtr2c2j',
'md5': 'b9e344fdbce9f2852c668a97efefb105',
'info_dict': {
'id': '3l3vgf77uco2g',
'ext': 'mp4',
'uploader': 'Bluesky',
'uploader_id': 'bsky.app',
'uploader_url': 'https://bsky.app/profile/bsky.app',
'channel_id': 'did:plc:z72i7hdynmk6r22z27h6tvur',
'channel_url': 'https://bsky.app/profile/did:plc:z72i7hdynmk6r22z27h6tvur',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'title': 'Bluesky now has video! Update your app to versi...',
'alt_title': 'Bluesky video feature announcement',
'description': r're:(?s)Bluesky now has video! .{239}',
'upload_date': '20240911',
'timestamp': 1726074716,
'like_count': int,
'repost_count': int,
'comment_count': int,
'tags': [],
'subtitles': {
'en': 'mincount:1',
},
},
}, {
'url': 'https://bsky.app/profile/alt.bun.how/post/3l7rdfxhyds2f',
'md5': '8775118b235cf9fa6b5ad30f95cda75c',
'info_dict': {
'id': '3l7rdfxhyds2f',
'ext': 'mp4',
'uploader': 'cinnamon',
'uploader_id': 'alt.bun.how',
'uploader_url': 'https://bsky.app/profile/alt.bun.how',
'channel_id': 'did:plc:7x6rtuenkuvxq3zsvffp2ide',
'channel_url': 'https://bsky.app/profile/did:plc:7x6rtuenkuvxq3zsvffp2ide',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'title': 'crazy that i look like this tbh',
'description': 'crazy that i look like this tbh',
'upload_date': '20241030',
'timestamp': 1730332128,
'like_count': int,
'repost_count': int,
'comment_count': int,
'tags': ['sexual'],
'age_limit': 18,
},
}, {
'url': 'at://did:plc:ia76kvnndjutgedggx2ibrem/app.bsky.feed.post/3l6zrz6zyl2dr',
'md5': '71b0eb6d85d03145e6af6642c7fc6d78',
'info_dict': {
'id': '3l6zrz6zyl2dr',
'ext': 'mp4',
'uploader': 'mary🐇',
'uploader_id': 'mary.my.id',
'uploader_url': 'https://bsky.app/profile/mary.my.id',
'channel_id': 'did:plc:ia76kvnndjutgedggx2ibrem',
'channel_url': 'https://bsky.app/profile/did:plc:ia76kvnndjutgedggx2ibrem',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'title': 'Bluesky video #3l6zrz6zyl2dr',
'upload_date': '20241021',
'timestamp': 1729523172,
'like_count': int,
'repost_count': int,
'comment_count': int,
'tags': [],
},
}, {
'url': 'https://bsky.app/profile/purpleicetea.bsky.social/post/3l7gv55dc2o2w',
'info_dict': {
'id': '3l7gv55dc2o2w',
},
'playlist': [{
'info_dict': {
'id': '3l7gv55dc2o2w',
'ext': 'mp4',
'upload_date': '20241026',
'description': 'One of my favorite videos',
'comment_count': int,
'uploader_url': 'https://bsky.app/profile/purpleicetea.bsky.social',
'uploader': 'Purple.Ice.Tea',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'channel_url': 'https://bsky.app/profile/did:plc:bjh5ffwya5f53dfy47dezuwx',
'like_count': int,
'channel_id': 'did:plc:bjh5ffwya5f53dfy47dezuwx',
'repost_count': int,
'timestamp': 1729973202,
'tags': [],
'uploader_id': 'purpleicetea.bsky.social',
'title': 'One of my favorite videos',
},
}, {
'info_dict': {
'id': '3l77u64l7le2e',
'ext': 'mp4',
'title': 'hearing people on twitter say that bluesky isn\'...',
'like_count': int,
'uploader_id': 'thafnine.net',
'uploader_url': 'https://bsky.app/profile/thafnine.net',
'upload_date': '20241024',
'channel_url': 'https://bsky.app/profile/did:plc:6ttyq36rhiyed7wu3ws7dmqj',
'description': r're:(?s)hearing people on twitter say that bluesky .{93}',
'tags': [],
'alt_title': 'md5:9b1ee1937fb3d1a81e932f9ec14d560e',
'uploader': 'T9',
'channel_id': 'did:plc:6ttyq36rhiyed7wu3ws7dmqj',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'timestamp': 1729731642,
'comment_count': int,
'repost_count': int,
},
}],
}]
_BLOB_URL_TMPL = '{}/xrpc/com.atproto.sync.getBlob'
def _get_service_endpoint(self, did, video_id):
if did.startswith('did:web:'):
url = f'https://{did[8:]}/.well-known/did.json'
else:
url = f'https://plc.directory/{did}'
services = self._download_json(
url, video_id, 'Fetching service endpoint', 'Falling back to bsky.social', fatal=False)
return traverse_obj(
services, ('service', lambda _, x: x['type'] == 'AtprotoPersonalDataServer',
'serviceEndpoint', {url_or_none}, any)) or 'https://bsky.social'
def _real_extract(self, url):
handle, video_id = self._match_valid_url(url).group('handle', 'id')
post = self._download_json(
'https://public.api.bsky.app/xrpc/app.bsky.feed.getPostThread',
video_id, query={
'uri': f'at://{handle}/app.bsky.feed.post/{video_id}',
'depth': 0,
'parentHeight': 0,
})['thread']['post']
entries = []
# app.bsky.embed.video.view/app.bsky.embed.external.view
entries.extend(self._extract_videos(post, video_id))
# app.bsky.embed.recordWithMedia.view
entries.extend(self._extract_videos(
post, video_id, embed_path=('embed', 'media'), record_subpath=('embed', 'media')))
# app.bsky.embed.record.view
if nested_post := traverse_obj(post, ('embed', 'record', ('record', None), {dict}, any)):
entries.extend(self._extract_videos(
nested_post, video_id, embed_path=('embeds', 0), record_path='value'))
if not entries:
raise ExtractorError('No video could be found in this post', expected=True)
if len(entries) == 1:
return entries[0]
return self.playlist_result(entries, video_id)
@staticmethod
def _build_profile_url(path):
return format_field(path, None, 'https://bsky.app/profile/%s', default=None)
def _extract_videos(self, root, video_id, embed_path='embed', record_path='record', record_subpath='embed'):
embed_path = variadic(embed_path, (str, bytes, dict, set))
record_path = variadic(record_path, (str, bytes, dict, set))
record_subpath = variadic(record_subpath, (str, bytes, dict, set))
entries = []
if external_uri := traverse_obj(root, (
((*record_path, *record_subpath), embed_path), 'external', 'uri', {url_or_none}, any)):
entries.append(self.url_result(external_uri))
if playlist := traverse_obj(root, (*embed_path, 'playlist', {url_or_none})):
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
playlist, video_id, 'mp4', m3u8_id='hls', fatal=False)
else:
return entries
video_cid = traverse_obj(
root, (*embed_path, 'cid', {str}),
(*record_path, *record_subpath, 'video', 'ref', '$link', {str}))
did = traverse_obj(root, ('author', 'did', {str}))
if did and video_cid:
endpoint = self._get_service_endpoint(did, video_id)
formats.append({
'format_id': 'blob',
'url': update_url_query(
self._BLOB_URL_TMPL.format(endpoint), {'did': did, 'cid': video_cid}),
**traverse_obj(root, (*embed_path, 'aspectRatio', {
'width': ('width', {int_or_none}),
'height': ('height', {int_or_none}),
})),
**traverse_obj(root, (*record_path, *record_subpath, 'video', {
'filesize': ('size', {int_or_none}),
'ext': ('mimeType', {mimetype2ext}),
})),
})
for sub_data in traverse_obj(root, (
*record_path, *record_subpath, 'captions', lambda _, v: v['file']['ref']['$link'])):
subtitles.setdefault(sub_data.get('lang') or 'und', []).append({
'url': update_url_query(
self._BLOB_URL_TMPL.format(endpoint), {'did': did, 'cid': sub_data['file']['ref']['$link']}),
'ext': traverse_obj(sub_data, ('file', 'mimeType', {mimetype2ext})),
})
entries.append({
'id': video_id,
'formats': formats,
'subtitles': subtitles,
**traverse_obj(root, {
'id': ('uri', {url_basename}),
'thumbnail': (*embed_path, 'thumbnail', {url_or_none}),
'alt_title': (*embed_path, 'alt', {str}, filter),
'uploader': ('author', 'displayName', {str}),
'uploader_id': ('author', 'handle', {str}),
'uploader_url': ('author', 'handle', {self._build_profile_url}),
'channel_id': ('author', 'did', {str}),
'channel_url': ('author', 'did', {self._build_profile_url}),
'like_count': ('likeCount', {int_or_none}),
'repost_count': ('repostCount', {int_or_none}),
'comment_count': ('replyCount', {int_or_none}),
'timestamp': ('indexedAt', {parse_iso8601}),
'tags': ('labels', ..., 'val', {str}, all, {orderedSet}),
'age_limit': (
'labels', ..., 'val', {lambda x: 18 if x in ('sexual', 'porn', 'graphic-media') else None}, any),
'description': (*record_path, 'text', {str}, filter),
'title': (*record_path, 'text', {lambda x: x.replace('\n', ' ')}, {truncate_string(left=50)}),
}),
})
return entries

View File

@@ -1,35 +1,20 @@
import functools
import re
from .common import InfoExtractor
from ..utils import (
clean_html,
extract_attributes,
get_element_text_and_html_by_tag,
get_elements_by_class,
join_nonempty,
js_to_json,
mimetype2ext,
unified_strdate,
url_or_none,
urljoin,
variadic,
)
from ..utils.traversal import traverse_obj
def html_get_element(tag=None, cls=None):
assert tag or cls, 'One of tag or class is required'
if cls:
func = functools.partial(get_elements_by_class, cls, tag=tag)
else:
func = functools.partial(get_element_text_and_html_by_tag, tag)
def html_get_element_wrapper(html):
return variadic(func(html))[0]
return html_get_element_wrapper
from ..utils.traversal import (
find_element,
traverse_obj,
)
class BpbIE(InfoExtractor):
@@ -41,12 +26,12 @@ class BpbIE(InfoExtractor):
'info_dict': {
'id': '297',
'ext': 'mp4',
'creator': 'Kooperative Berlin',
'description': 'md5:f4f75885ba009d3e2b156247a8941ce6',
'release_date': '20160115',
'creators': ['Kooperative Berlin'],
'description': r're:Joachim Gauck, .*\n\nKamera: .*',
'release_date': '20150716',
'series': 'Interview auf dem Geschichtsforum 1989 | 2009',
'tags': ['Friedliche Revolution', 'Erinnerungskultur', 'Vergangenheitspolitik', 'DDR 1949 - 1990', 'Freiheitsrecht', 'BStU', 'Deutschland'],
'thumbnail': 'https://www.bpb.de/cache/images/7/297_teaser_16x9_1240.jpg?8839D',
'tags': [],
'thumbnail': r're:https?://www\.bpb\.de/cache/images/7/297_teaser_16x9_1240\.jpg.*',
'title': 'Joachim Gauck zu 1989 und die Erinnerung an die DDR',
'uploader': 'Bundeszentrale für politische Bildung',
},
@@ -55,11 +40,12 @@ class BpbIE(InfoExtractor):
'info_dict': {
'id': '522184',
'ext': 'mp4',
'creator': 'Institute for Strategic Dialogue Germany gGmbH (ISD)',
'creators': ['Institute for Strategic Dialogue Germany gGmbH (ISD)'],
'description': 'md5:f83c795ff8f825a69456a9e51fc15903',
'release_date': '20230621',
'tags': ['Desinformation', 'Ukraine', 'Russland', 'Geflüchtete'],
'thumbnail': 'https://www.bpb.de/cache/images/4/522184_teaser_16x9_1240.png?EABFB',
'series': 'Narrative über den Krieg Russlands gegen die Ukraine (NUK)',
'tags': [],
'thumbnail': r're:https://www\.bpb\.de/cache/images/4/522184_teaser_16x9_1240\.png.*',
'title': 'md5:9b01ccdbf58dbf9e5c9f6e771a803b1c',
'uploader': 'Bundeszentrale für politische Bildung',
},
@@ -68,11 +54,12 @@ class BpbIE(InfoExtractor):
'info_dict': {
'id': '518789',
'ext': 'mp4',
'creator': 'Institute for Strategic Dialogue Germany gGmbH (ISD)',
'creators': ['Institute for Strategic Dialogue Germany gGmbH (ISD)'],
'description': 'md5:85228aed433e84ff0ff9bc582abd4ea8',
'release_date': '20230302',
'tags': ['Desinformation', 'Ukraine', 'Russland', 'Geflüchtete'],
'thumbnail': 'https://www.bpb.de/cache/images/9/518789_teaser_16x9_1240.jpeg?56D0D',
'series': 'Narrative über den Krieg Russlands gegen die Ukraine (NUK)',
'tags': [],
'thumbnail': r're:https://www\.bpb\.de/cache/images/9/518789_teaser_16x9_1240\.jpeg.*',
'title': 'md5:3e956f264bb501f6383f10495a401da4',
'uploader': 'Bundeszentrale für politische Bildung',
},
@@ -84,12 +71,12 @@ class BpbIE(InfoExtractor):
'info_dict': {
'id': '315813',
'ext': 'mp3',
'creator': 'Axel Schröder',
'creators': ['Axel Schröder'],
'description': 'md5:eda9d1af34e5912efef5baf54fba4427',
'release_date': '20200921',
'series': 'Auf Endlagersuche. Der deutsche Weg zu einem sicheren Atommülllager',
'tags': ['Atomenergie', 'Endlager', 'hoch-radioaktiver Abfall', 'Endlagersuche', 'Atommüll', 'Atomendlager', 'Gorleben', 'Deutschland'],
'thumbnail': 'https://www.bpb.de/cache/images/3/315813_teaser_16x9_1240.png?92A94',
'thumbnail': r're:https://www\.bpb\.de/cache/images/3/315813_teaser_16x9_1240\.png.*',
'title': 'Folge 1: Eine Einführung',
'uploader': 'Bundeszentrale für politische Bildung',
},
@@ -98,12 +85,12 @@ class BpbIE(InfoExtractor):
'info_dict': {
'id': '517806',
'ext': 'mp3',
'creator': 'Bundeszentrale für politische Bildung',
'creators': ['Bundeszentrale für politische Bildung'],
'description': 'md5:594689600e919912aade0b2871cc3fed',
'release_date': '20230127',
'series': 'Vorträge des Fachtags "Modernisierer. Grenzgänger. Anstifter. Sechs Jahrzehnte \'Neue Rechte\'"',
'tags': ['Rechtsextremismus', 'Konservatismus', 'Konservativismus', 'neue Rechte', 'Rechtspopulismus', 'Schnellroda', 'Deutschland'],
'thumbnail': 'https://www.bpb.de/cache/images/6/517806_teaser_16x9_1240.png?7A7A0',
'thumbnail': r're:https://www\.bpb\.de/cache/images/6/517806_teaser_16x9_1240\.png.*',
'title': 'Die Weltanschauung der "Neuen Rechten"',
'uploader': 'Bundeszentrale für politische Bildung',
},
@@ -147,7 +134,7 @@ class BpbIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title_result = traverse_obj(webpage, ({html_get_element(cls='opening-header__title')}, {self._TITLE_RE.match}))
title_result = traverse_obj(webpage, ({find_element(cls='opening-header__title')}, {self._TITLE_RE.match}))
json_lds = list(self._yield_json_ld(webpage, video_id, fatal=False))
return {
@@ -156,15 +143,15 @@ class BpbIE(InfoExtractor):
# This metadata could be interpreted otherwise, but it fits "series" the most
'series': traverse_obj(title_result, ('series', {str.strip})) or None,
'description': join_nonempty(*traverse_obj(webpage, [(
{html_get_element(cls='opening-intro')},
[{html_get_element(tag='bpb-accordion-item')}, {html_get_element(cls='text-content')}],
{find_element(cls='opening-intro')},
[{find_element(tag='bpb-accordion-item')}, {find_element(cls='text-content')}],
), {clean_html}]), delim='\n\n') or None,
'creator': self._html_search_meta('author', webpage),
'creators': traverse_obj(self._html_search_meta('author', webpage), all),
'uploader': self._html_search_meta('publisher', webpage),
'release_date': unified_strdate(self._html_search_meta('date', webpage)),
'tags': traverse_obj(json_lds, (..., 'keywords', {lambda x: x.split(',')}, ...)),
**traverse_obj(self._parse_vue_attributes('bpb-player', webpage, video_id), {
'formats': (':sources', ..., {self._process_source}),
'thumbnail': ('poster', {lambda x: urljoin(url, x)}),
'thumbnail': ('poster', {urljoin(url)}),
}),
}

View File

@@ -145,10 +145,9 @@ class BravoTVIE(AdobePassIE):
tp_metadata = self._download_json(
update_url_query(tp_url, {'format': 'preview'}), video_id, fatal=False)
seconds_or_none = lambda x: float_or_none(x, 1000)
chapters = traverse_obj(tp_metadata, ('chapters', ..., {
'start_time': ('startTime', {seconds_or_none}),
'end_time': ('endTime', {seconds_or_none}),
'start_time': ('startTime', {float_or_none(scale=1000)}),
'end_time': ('endTime', {float_or_none(scale=1000)}),
}))
# prune pointless single chapters that span the entire duration from short videos
if len(chapters) == 1 and not traverse_obj(chapters, (0, 'end_time')):
@@ -168,8 +167,8 @@ class BravoTVIE(AdobePassIE):
**merge_dicts(traverse_obj(tp_metadata, {
'title': 'title',
'description': 'description',
'duration': ('duration', {seconds_or_none}),
'timestamp': ('pubDate', {seconds_or_none}),
'duration': ('duration', {float_or_none(scale=1000)}),
'timestamp': ('pubDate', {float_or_none(scale=1000)}),
'season_number': (('pl1$seasonNumber', 'nbcu$seasonNumber'), {int_or_none}),
'episode_number': (('pl1$episodeNumber', 'nbcu$episodeNumber'), {int_or_none}),
'series': (('pl1$show', 'nbcu$show'), (None, ...), {str}),

View File

@@ -31,6 +31,7 @@ from ..utils import (
update_url_query,
url_or_none,
)
from ..utils.traversal import traverse_obj
class BrightcoveLegacyIE(InfoExtractor):
@@ -935,8 +936,8 @@ class BrightcoveNewIE(BrightcoveNewBaseIE):
if content_type == 'playlist':
return self.playlist_result(
[self._parse_brightcove_metadata(vid, vid.get('id'), headers)
for vid in json_data.get('videos', []) if vid.get('id')],
(self._parse_brightcove_metadata(vid, vid['id'], headers)
for vid in traverse_obj(json_data, ('videos', lambda _, v: v['id']))),
json_data.get('id'), json_data.get('name'),
json_data.get('description'))

View File

@@ -8,11 +8,13 @@ from ..utils import (
bug_reports_message,
clean_html,
format_field,
get_element_text_and_html_by_tag,
int_or_none,
url_or_none,
)
from ..utils.traversal import traverse_obj
from ..utils.traversal import (
find_element,
traverse_obj,
)
class BundestagIE(InfoExtractor):
@@ -115,9 +117,8 @@ class BundestagIE(InfoExtractor):
note='Downloading metadata overlay', fatal=False,
), {
'title': (
{functools.partial(get_element_text_and_html_by_tag, 'h3')}, 0,
{functools.partial(re.sub, r'<span[^>]*>[^<]+</span>', '')}, {clean_html}),
'description': ({functools.partial(get_element_text_and_html_by_tag, 'p')}, 0, {clean_html}),
{find_element(tag='h3')}, {functools.partial(re.sub, r'<span[^>]*>[^<]+</span>', '')}, {clean_html}),
'description': ({find_element(tag='p')}, {clean_html}),
}))
return result

View File

@@ -53,7 +53,7 @@ class CaffeineTVIE(InfoExtractor):
'like_count': ('like_count', {int_or_none}),
'view_count': ('view_count', {int_or_none}),
'comment_count': ('comment_count', {int_or_none}),
'tags': ('tags', ..., {str}, {lambda x: x or None}),
'tags': ('tags', ..., {str}, filter),
'uploader': ('user', 'name', {str}),
'uploader_id': (((None, 'user'), 'username'), {str}, any),
'is_live': ('is_live', {bool}),
@@ -62,7 +62,7 @@ class CaffeineTVIE(InfoExtractor):
'title': ('broadcast_title', {str}),
'duration': ('content_duration', {int_or_none}),
'timestamp': ('broadcast_start_time', {parse_iso8601}),
'thumbnail': ('preview_image_path', {lambda x: urljoin(url, x)}),
'thumbnail': ('preview_image_path', {urljoin(url)}),
}),
'age_limit': {
# assume Apple Store ratings: https://en.wikipedia.org/wiki/Mobile_software_content_rating_system

View File

@@ -4,7 +4,6 @@ import json
import re
import time
import urllib.parse
import xml.etree.ElementTree
from .common import InfoExtractor
from ..networking import HEADRequest
@@ -12,7 +11,6 @@ from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
join_nonempty,
js_to_json,
mimetype2ext,
orderedSet,
@@ -455,8 +453,8 @@ class CBCPlayerIE(InfoExtractor):
chapters = traverse_obj(data, (
'media', 'chapters', lambda _, v: float(v['startTime']) is not None, {
'start_time': ('startTime', {functools.partial(float_or_none, scale=1000)}),
'end_time': ('endTime', {functools.partial(float_or_none, scale=1000)}),
'start_time': ('startTime', {float_or_none(scale=1000)}),
'end_time': ('endTime', {float_or_none(scale=1000)}),
'title': ('name', {str}),
}))
# Filter out pointless single chapters with start_time==0 and no end_time
@@ -467,8 +465,8 @@ class CBCPlayerIE(InfoExtractor):
**traverse_obj(data, {
'title': ('title', {str}),
'description': ('description', {str.strip}),
'thumbnail': ('image', 'url', {url_or_none}, {functools.partial(update_url, query=None)}),
'timestamp': ('publishedAt', {functools.partial(float_or_none, scale=1000)}),
'thumbnail': ('image', 'url', {url_or_none}, {update_url(query=None)}),
'timestamp': ('publishedAt', {float_or_none(scale=1000)}),
'media_type': ('media', 'clipType', {str}),
'series': ('showName', {str}),
'season_number': ('media', 'season', {int_or_none}),
@@ -524,14 +522,13 @@ class CBCGemIE(InfoExtractor):
_TESTS = [{
# This is a normal, public, TV show video
'url': 'https://gem.cbc.ca/media/schitts-creek/s06e01',
'md5': '93dbb31c74a8e45b378cf13bd3f6f11e',
'info_dict': {
'id': 'schitts-creek/s06e01',
'ext': 'mp4',
'title': 'Smoke Signals',
'description': 'md5:929868d20021c924020641769eb3e7f1',
'thumbnail': 'https://images.radio-canada.ca/v1/synps-cbc/episode/perso/cbc_schitts_creek_season_06e01_thumbnail_v01.jpg?im=Resize=(Size)',
'duration': 1314,
'thumbnail': r're:https://images\.radio-canada\.ca/[^#?]+/cbc_schitts_creek_season_06e01_thumbnail_v01\.jpg',
'duration': 1324,
'categories': ['comedy'],
'series': 'Schitt\'s Creek',
'season': 'Season 6',
@@ -539,19 +536,21 @@ class CBCGemIE(InfoExtractor):
'episode': 'Smoke Signals',
'episode_number': 1,
'episode_id': 'schitts-creek/s06e01',
'upload_date': '20210618',
'timestamp': 1623988800,
'release_date': '20200107',
'release_timestamp': 1578427200,
},
'params': {'format': 'bv'},
'skip': 'Geo-restricted to Canada',
}, {
# This video requires an account in the browser, but works fine in yt-dlp
'url': 'https://gem.cbc.ca/media/schitts-creek/s01e01',
'md5': '297a9600f554f2258aed01514226a697',
'info_dict': {
'id': 'schitts-creek/s01e01',
'ext': 'mp4',
'title': 'The Cup Runneth Over',
'description': 'md5:9bca14ea49ab808097530eb05a29e797',
'thumbnail': 'https://images.radio-canada.ca/v1/synps-cbc/episode/perso/cbc_schitts_creek_season_01e01_thumbnail_v01.jpg?im=Resize=(Size)',
'thumbnail': r're:https://images\.radio-canada\.ca/[^#?]+/cbc_schitts_creek_season_01e01_thumbnail_v01\.jpg',
'series': 'Schitt\'s Creek',
'season_number': 1,
'season': 'Season 1',
@@ -560,9 +559,12 @@ class CBCGemIE(InfoExtractor):
'episode_id': 'schitts-creek/s01e01',
'duration': 1309,
'categories': ['comedy'],
'upload_date': '20210617',
'timestamp': 1623902400,
'release_date': '20151124',
'release_timestamp': 1448323200,
},
'params': {'format': 'bv'},
'skip': 'Geo-restricted to Canada',
}, {
'url': 'https://gem.cbc.ca/nadiyas-family-favourites/s01e01',
'only_matching': True,
@@ -631,38 +633,6 @@ class CBCGemIE(InfoExtractor):
return
self._claims_token = self.cache.load(self._NETRC_MACHINE, 'claims_token')
def _find_secret_formats(self, formats, video_id):
""" Find a valid video url and convert it to the secret variant """
base_format = next((f for f in formats if f.get('vcodec') != 'none'), None)
if not base_format:
return
base_url = re.sub(r'(Manifest\(.*?),filter=[\w-]+(.*?\))', r'\1\2', base_format['url'])
url = re.sub(r'(Manifest\(.*?),format=[\w-]+(.*?\))', r'\1\2', base_url)
secret_xml = self._download_xml(url, video_id, note='Downloading secret XML', fatal=False)
if not isinstance(secret_xml, xml.etree.ElementTree.Element):
return
for child in secret_xml:
if child.attrib.get('Type') != 'video':
continue
for video_quality in child:
bitrate = int_or_none(video_quality.attrib.get('Bitrate'))
if not bitrate or 'Index' not in video_quality.attrib:
continue
height = int_or_none(video_quality.attrib.get('MaxHeight'))
yield {
**base_format,
'format_id': join_nonempty('sec', height),
# Note: \g<1> is necessary instead of \1 since bitrate is a number
'url': re.sub(r'(QualityLevels\()\d+(\))', fr'\g<1>{bitrate}\2', base_url),
'width': int_or_none(video_quality.attrib.get('MaxWidth')),
'tbr': bitrate / 1000.0,
'height': height,
}
def _real_extract(self, url):
video_id = self._match_id(url)
video_info = self._download_json(
@@ -676,7 +646,6 @@ class CBCGemIE(InfoExtractor):
else:
headers = {}
m3u8_info = self._download_json(video_info['playSession']['url'], video_id, headers=headers)
m3u8_url = m3u8_info.get('url')
if m3u8_info.get('errorCode') == 1:
self.raise_geo_restricted(countries=['CA'])
@@ -685,9 +654,9 @@ class CBCGemIE(InfoExtractor):
elif m3u8_info.get('errorCode') != 0:
raise ExtractorError(f'{self.IE_NAME} said: {m3u8_info.get("errorCode")} - {m3u8_info.get("message")}')
formats = self._extract_m3u8_formats(m3u8_url, video_id, m3u8_id='hls')
formats = self._extract_m3u8_formats(
m3u8_info['url'], video_id, 'mp4', m3u8_id='hls', query={'manifestType': ''})
self._remove_duplicate_formats(formats)
formats.extend(self._find_secret_formats(formats, video_id))
for fmt in formats:
if fmt.get('vcodec') == 'none':
@@ -703,20 +672,21 @@ class CBCGemIE(InfoExtractor):
return {
'id': video_id,
'title': video_info['title'],
'description': video_info.get('description'),
'thumbnail': video_info.get('image'),
'series': video_info.get('series'),
'season_number': video_info.get('season'),
'season': f'Season {video_info.get("season")}',
'episode_number': video_info.get('episode'),
'episode': video_info.get('title'),
'episode_id': video_id,
'duration': video_info.get('duration'),
'categories': [video_info.get('category')],
'formats': formats,
'release_timestamp': video_info.get('airDate'),
'timestamp': video_info.get('availableDate'),
**traverse_obj(video_info, {
'title': ('title', {str}),
'episode': ('title', {str}),
'description': ('description', {str}),
'thumbnail': ('image', {url_or_none}),
'series': ('series', {str}),
'season_number': ('season', {int_or_none}),
'episode_number': ('episode', {int_or_none}),
'duration': ('duration', {int_or_none}),
'categories': ('category', {str}, all),
'release_timestamp': ('airDate', {int_or_none(scale=1000)}),
'timestamp': ('availableDate', {int_or_none(scale=1000)}),
}),
}

View File

@@ -96,7 +96,7 @@ class CBSNewsBaseIE(InfoExtractor):
**traverse_obj(item, {
'title': (None, ('fulltitle', 'title')),
'description': 'dek',
'timestamp': ('timestamp', {lambda x: float_or_none(x, 1000)}),
'timestamp': ('timestamp', {float_or_none(scale=1000)}),
'duration': ('duration', {float_or_none}),
'subtitles': ('captions', {get_subtitles}),
'thumbnail': ('images', ('hd', 'sd'), {url_or_none}),

View File

@@ -12,53 +12,86 @@ from ..utils import (
class CCMAIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?ccma\.cat/(?:[^/]+/)*?(?P<type>video|audio)/(?P<id>\d+)'
IE_DESC = '3Cat, TV3 and Catalunya Ràdio'
_VALID_URL = r'https?://(?:www\.)?3cat\.cat/(?:3cat|tv3/sx3)/[^/?#]+/(?P<type>video|audio)/(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.ccma.cat/tv3/alacarta/lespot-de-la-marato-de-tv3/lespot-de-la-marato-de-tv3/video/5630208/',
# ccma.cat/tv3/alacarta/ URLs redirect to 3cat.cat/3cat/
'url': 'https://www.3cat.cat/3cat/lespot-de-la-marato-de-tv3/video/5630208/',
'md5': '7296ca43977c8ea4469e719c609b0871',
'info_dict': {
'id': '5630208',
'ext': 'mp4',
'title': 'L\'espot de La Marató de TV3',
'title': 'L\'espot de La Marató 2016: Ictus i les lesions medul·lars i cerebrals traumàtiques',
'description': 'md5:f12987f320e2f6e988e9908e4fe97765',
'timestamp': 1478608140,
'upload_date': '20161108',
'age_limit': 0,
'alt_title': 'EsportMarató2016WEB_PerPublicar',
'duration': 79,
'thumbnail': 'https://img.3cat.cat/multimedia/jpg/4/6/1478536106664.jpg',
'series': 'Dedicada a l\'ictus i les lesions medul·lars i cerebrals traumàtiques',
'categories': ['Divulgació'],
},
}, {
'url': 'http://www.ccma.cat/catradio/alacarta/programa/el-consell-de-savis-analitza-el-derbi/audio/943685/',
# ccma.cat/catradio/alacarta/ URLs redirect to 3cat.cat/3cat/
'url': 'https://www.3cat.cat/3cat/el-consell-de-savis-analitza-el-derbi/audio/943685/',
'md5': 'fa3e38f269329a278271276330261425',
'info_dict': {
'id': '943685',
'ext': 'mp3',
'title': 'El Consell de Savis analitza el derbi',
'description': 'md5:e2a3648145f3241cb9c6b4b624033e53',
'upload_date': '20170512',
'timestamp': 1494622500,
'upload_date': '20161217',
'timestamp': 1482011700,
'vcodec': 'none',
'categories': ['Esports'],
'series': 'Tot gira',
'duration': 821,
'thumbnail': 'https://img.3cat.cat/multimedia/jpg/8/9/1482002602598.jpg',
},
}, {
'url': 'http://www.ccma.cat/tv3/alacarta/crims/crims-josep-tallada-lespereu-me-capitol-1/video/6031387/',
'md5': 'b43c3d3486f430f3032b5b160d80cbc3',
'url': 'https://www.3cat.cat/3cat/crims-josep-tallada-lespereu-me-part-1/video/6031387/',
'md5': '27493513d08a3e5605814aee9bb778d2',
'info_dict': {
'id': '6031387',
'ext': 'mp4',
'title': 'Crims - Josep Talleda, l\'"Espereu-me" (capítol 1)',
'title': 'T1xC5 - Josep Talleda, l\'"Espereu-me" (part 1)',
'description': 'md5:7cbdafb640da9d0d2c0f62bad1e74e60',
'timestamp': 1582577700,
'timestamp': 1582577919,
'upload_date': '20200224',
'subtitles': 'mincount:4',
'age_limit': 16,
'subtitles': 'mincount:1',
'age_limit': 13,
'series': 'Crims',
'thumbnail': 'https://img.3cat.cat/multimedia/jpg/1/9/1582564376991.jpg',
'duration': 3203,
'categories': ['Divulgació'],
'alt_title': 'Crims - 5 - Josep Talleda, l\'"Espereu-me" (1a part) - Josep Talleda, l\'"Espereu-me" (part 1)',
'episode_number': 5,
'episode': 'Episode 5',
},
}, {
'url': 'https://www.3cat.cat/tv3/sx3/una-mosca-volava-per-la-llum/video/5759227/',
'info_dict': {
'id': '5759227',
'ext': 'mp4',
'title': 'Una mosca volava per la llum',
'alt_title': '17Z004Ç UNA MOSCA VOLAVA PER LA LLUM',
'description': 'md5:9ab64276944b0825336f4147f13f7854',
'series': 'Mic',
'upload_date': '20180411',
'timestamp': 1523440105,
'duration': 160,
'age_limit': 0,
'thumbnail': 'https://img.3cat.cat/multimedia/jpg/6/1/1524071667216.jpg',
'categories': ['Música'],
},
}]
def _real_extract(self, url):
media_type, media_id = self._match_valid_url(url).groups()
media_type, media_id = self._match_valid_url(url).group('type', 'id')
media = self._download_json(
'http://dinamics.ccma.cat/pvideo/media.jsp', media_id, query={
'http://api-media.3cat.cat/pvideo/media.jsp', media_id, query={
'media': media_type,
'idint': media_id,
'format': 'dm',

View File

@@ -12,6 +12,7 @@ from .common import InfoExtractor
from ..compat import compat_ord
from ..utils import (
ExtractorError,
OnDemandPagedList,
float_or_none,
int_or_none,
merge_dicts,
@@ -351,3 +352,50 @@ class CDAIE(InfoExtractor):
extract_format(webpage, resolution)
return merge_dicts(info_dict, info)
class CDAFolderIE(InfoExtractor):
_MAX_PAGE_SIZE = 36
_VALID_URL = r'https?://(?:www\.)?cda\.pl/(?P<channel>\w+)/folder/(?P<id>\d+)'
_TESTS = [
{
'url': 'https://www.cda.pl/domino264/folder/31188385',
'info_dict': {
'id': '31188385',
'title': 'SERIA DRUGA',
},
'playlist_mincount': 13,
},
{
'url': 'https://www.cda.pl/smiechawaTV/folder/2664592/vfilm',
'info_dict': {
'id': '2664592',
'title': 'VideoDowcipy - wszystkie odcinki',
},
'playlist_mincount': 71,
},
{
'url': 'https://www.cda.pl/DeliciousBeauty/folder/19129979/vfilm',
'info_dict': {
'id': '19129979',
'title': 'TESTY KOSMETYKÓW',
},
'playlist_mincount': 139,
}]
def _real_extract(self, url):
folder_id, channel = self._match_valid_url(url).group('id', 'channel')
webpage = self._download_webpage(url, folder_id)
def extract_page_entries(page):
webpage = self._download_webpage(
f'https://www.cda.pl/{channel}/folder/{folder_id}/vfilm/{page + 1}', folder_id,
f'Downloading page {page + 1}', expected_status=404)
items = re.findall(r'<a[^>]+href="/video/([0-9a-z]+)"', webpage)
for video_id in items:
yield self.url_result(f'https://www.cda.pl/video/{video_id}', CDAIE, video_id)
return self.playlist_result(
OnDemandPagedList(extract_page_entries, self._MAX_PAGE_SIZE),
folder_id, self._og_search_title(webpage))

View File

@@ -5,11 +5,12 @@ from ..utils import (
ExtractorError,
lowercase_escape,
url_or_none,
urlencode_postdata,
)
class ChaturbateIE(InfoExtractor):
_VALID_URL = r'https?://(?:[^/]+\.)?chaturbate\.com/(?:fullvideo/?\?.*?\bb=)?(?P<id>[^/?&#]+)'
_VALID_URL = r'https?://(?:[^/]+\.)?chaturbate\.(?P<tld>com|eu|global)/(?:fullvideo/?\?.*?\bb=)?(?P<id>[^/?&#]+)'
_TESTS = [{
'url': 'https://www.chaturbate.com/siswet19/',
'info_dict': {
@@ -29,16 +30,58 @@ class ChaturbateIE(InfoExtractor):
}, {
'url': 'https://en.chaturbate.com/siswet19/',
'only_matching': True,
}, {
'url': 'https://chaturbate.eu/siswet19/',
'only_matching': True,
}, {
'url': 'https://chaturbate.eu/fullvideo/?b=caylin',
'only_matching': True,
}, {
'url': 'https://chaturbate.global/siswet19/',
'only_matching': True,
}]
_ROOM_OFFLINE = 'Room is currently offline'
_ERROR_MAP = {
'offline': 'Room is currently offline',
'private': 'Room is currently in a private show',
'away': 'Performer is currently away',
'password protected': 'Room is password protected',
'hidden': 'Hidden session in progress',
}
def _real_extract(self, url):
video_id = self._match_id(url)
def _extract_from_api(self, video_id, tld):
response = self._download_json(
f'https://chaturbate.{tld}/get_edge_hls_url_ajax/', video_id,
data=urlencode_postdata({'room_slug': video_id}),
headers={
**self.geo_verification_headers(),
'X-Requested-With': 'XMLHttpRequest',
'Accept': 'application/json',
}, fatal=False, impersonate=True) or {}
m3u8_url = response.get('url')
if not m3u8_url:
status = response.get('room_status')
if error := self._ERROR_MAP.get(status):
raise ExtractorError(error, expected=True)
if status == 'public':
self.raise_geo_restricted()
self.report_warning(f'Got status "{status}" from API; falling back to webpage extraction')
return None
return {
'id': video_id,
'title': video_id,
'thumbnail': f'https://roomimg.stream.highwebmedia.com/ri/{video_id}.jpg',
'is_live': True,
'age_limit': 18,
'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True),
}
def _extract_from_html(self, video_id, tld):
webpage = self._download_webpage(
f'https://chaturbate.com/{video_id}/', video_id,
headers=self.geo_verification_headers())
f'https://chaturbate.{tld}/{video_id}/', video_id,
headers=self.geo_verification_headers(), impersonate=True)
found_m3u8_urls = []
@@ -76,8 +119,8 @@ class ChaturbateIE(InfoExtractor):
webpage, 'error', group='error', default=None)
if not error:
if any(p in webpage for p in (
self._ROOM_OFFLINE, 'offline_tipping', 'tip_offline')):
error = self._ROOM_OFFLINE
self._ERROR_MAP['offline'], 'offline_tipping', 'tip_offline')):
error = self._ERROR_MAP['offline']
if error:
raise ExtractorError(error, expected=True)
raise ExtractorError('Unable to find stream URL')
@@ -104,3 +147,7 @@ class ChaturbateIE(InfoExtractor):
'is_live': True,
'formats': formats,
}
def _real_extract(self, url):
video_id, tld = self._match_valid_url(url).group('id', 'tld')
return self._extract_from_api(video_id, tld) or self._extract_from_html(video_id, tld)

View File

@@ -1,5 +1,3 @@
import functools
from .common import InfoExtractor
from ..utils import (
UserNotLive,
@@ -77,7 +75,7 @@ class CHZZKLiveIE(InfoExtractor):
'thumbnails': thumbnails,
**traverse_obj(live_detail, {
'title': ('liveTitle', {str}),
'timestamp': ('openDate', {functools.partial(parse_iso8601, delimiter=' ')}),
'timestamp': ('openDate', {parse_iso8601(delimiter=' ')}),
'concurrent_view_count': ('concurrentUserCount', {int_or_none}),
'view_count': ('accumulateCount', {int_or_none}),
'channel': ('channel', 'channelName', {str}),
@@ -146,23 +144,37 @@ class CHZZKVideoIE(InfoExtractor):
video_meta = self._download_json(
f'https://api.chzzk.naver.com/service/v3/videos/{video_id}', video_id,
note='Downloading video info', errnote='Unable to download video info')['content']
formats, subtitles = self._extract_mpd_formats_and_subtitles(
f'https://apis.naver.com/neonplayer/vodplay/v1/playback/{video_meta["videoId"]}', video_id,
query={
'key': video_meta['inKey'],
'env': 'real',
'lc': 'en_US',
'cpl': 'en_US',
}, note='Downloading video playback', errnote='Unable to download video playback')
live_status = 'was_live' if video_meta.get('liveOpenDate') else 'not_live'
video_status = video_meta.get('vodStatus')
if video_status == 'UPLOAD':
playback = self._parse_json(video_meta['liveRewindPlaybackJson'], video_id)
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
playback['media'][0]['path'], video_id, 'mp4', m3u8_id='hls')
elif video_status == 'ABR_HLS':
formats, subtitles = self._extract_mpd_formats_and_subtitles(
f'https://apis.naver.com/neonplayer/vodplay/v1/playback/{video_meta["videoId"]}',
video_id, query={
'key': video_meta['inKey'],
'env': 'real',
'lc': 'en_US',
'cpl': 'en_US',
})
else:
self.raise_no_formats(
f'Unknown video status detected: "{video_status}"', expected=True, video_id=video_id)
formats, subtitles = [], {}
live_status = 'post_live' if live_status == 'was_live' else None
return {
'id': video_id,
'formats': formats,
'subtitles': subtitles,
'live_status': live_status,
**traverse_obj(video_meta, {
'title': ('videoTitle', {str}),
'thumbnail': ('thumbnailImageUrl', {url_or_none}),
'timestamp': ('publishDateAt', {functools.partial(float_or_none, scale=1000)}),
'timestamp': ('publishDateAt', {float_or_none(scale=1000)}),
'view_count': ('readCount', {int_or_none}),
'duration': ('duration', {int_or_none}),
'channel': ('channel', 'channelName', {str}),

View File

@@ -3,6 +3,7 @@ import re
from .common import InfoExtractor
from ..utils import (
filter_dict,
float_or_none,
int_or_none,
parse_age_limit,
smuggle_url,
@@ -85,7 +86,7 @@ class CineverseIE(CineverseBaseIE):
'title': 'title',
'id': ('details', 'item_id'),
'description': ('details', 'description'),
'duration': ('duration', {lambda x: x / 1000}),
'duration': ('duration', {float_or_none(scale=1000)}),
'cast': ('details', 'cast', {lambda x: x.split(', ')}),
'modified_timestamp': ('details', 'updated_by', 0, 'update_time', 'time', {int_or_none}),
'season_number': ('details', 'season', {int_or_none}),

View File

@@ -8,7 +8,7 @@ class CloudflareStreamIE(InfoExtractor):
_DOMAIN_RE = r'(?:cloudflarestream\.com|(?:videodelivery|bytehighway)\.net)'
_EMBED_RE = rf'(?:embed\.|{_SUBDOMAIN_RE}){_DOMAIN_RE}/embed/[^/?#]+\.js\?(?:[^#]+&)?video='
_ID_RE = r'[\da-f]{32}|eyJ[\w-]+\.[\w-]+\.[\w-]+'
_VALID_URL = rf'https?://(?:{_SUBDOMAIN_RE}{_DOMAIN_RE}/|{_EMBED_RE})(?P<id>{_ID_RE})'
_VALID_URL = rf'https?://(?:{_SUBDOMAIN_RE}(?P<domain>{_DOMAIN_RE})/|{_EMBED_RE})(?P<id>{_ID_RE})'
_EMBED_REGEX = [
rf'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//{_EMBED_RE}(?:{_ID_RE})(?:(?!\1).)*)\1',
rf'<iframe[^>]+\bsrc=["\'](?P<url>https?://{_SUBDOMAIN_RE}{_DOMAIN_RE}/[\da-f]{{32}})',
@@ -19,7 +19,7 @@ class CloudflareStreamIE(InfoExtractor):
'id': '31c9291ab41fac05471db4e73aa11717',
'ext': 'mp4',
'title': '31c9291ab41fac05471db4e73aa11717',
'thumbnail': 'https://videodelivery.net/31c9291ab41fac05471db4e73aa11717/thumbnails/thumbnail.jpg',
'thumbnail': 'https://cloudflarestream.com/31c9291ab41fac05471db4e73aa11717/thumbnails/thumbnail.jpg',
},
'params': {
'skip_download': 'm3u8',
@@ -30,7 +30,7 @@ class CloudflareStreamIE(InfoExtractor):
'id': '0e8e040aec776862e1d632a699edf59e',
'ext': 'mp4',
'title': '0e8e040aec776862e1d632a699edf59e',
'thumbnail': 'https://videodelivery.net/0e8e040aec776862e1d632a699edf59e/thumbnails/thumbnail.jpg',
'thumbnail': 'https://cloudflarestream.com/0e8e040aec776862e1d632a699edf59e/thumbnails/thumbnail.jpg',
},
}, {
'url': 'https://watch.cloudflarestream.com/9df17203414fd1db3e3ed74abbe936c1',
@@ -54,7 +54,7 @@ class CloudflareStreamIE(InfoExtractor):
'id': 'eaef9dea5159cf968be84241b5cedfe7',
'ext': 'mp4',
'title': 'eaef9dea5159cf968be84241b5cedfe7',
'thumbnail': 'https://videodelivery.net/eaef9dea5159cf968be84241b5cedfe7/thumbnails/thumbnail.jpg',
'thumbnail': 'https://cloudflarestream.com/eaef9dea5159cf968be84241b5cedfe7/thumbnails/thumbnail.jpg',
},
'params': {
'skip_download': 'm3u8',
@@ -62,8 +62,9 @@ class CloudflareStreamIE(InfoExtractor):
}]
def _real_extract(self, url):
video_id = self._match_id(url)
domain = 'bytehighway.net' if 'bytehighway.net/' in url else 'videodelivery.net'
video_id, domain = self._match_valid_url(url).group('id', 'domain')
if domain != 'bytehighway.net':
domain = 'cloudflarestream.com'
base_url = f'https://{domain}/{video_id}/'
if '.' in video_id:
video_id = self._parse_json(base64.urlsafe_b64decode(

View File

@@ -1,146 +1,225 @@
import json
import re
from .common import InfoExtractor
from .turner import TurnerBaseIE
from ..utils import merge_dicts, try_call, url_basename
from ..utils import (
clean_html,
extract_attributes,
int_or_none,
merge_dicts,
parse_duration,
parse_iso8601,
parse_resolution,
try_call,
update_url,
url_or_none,
)
from ..utils.traversal import find_elements, traverse_obj
class CNNIE(TurnerBaseIE):
_VALID_URL = r'''(?x)https?://(?:(?P<sub_domain>edition|www|money)\.)?cnn\.com/(?:video/(?:data/.+?|\?)/)?videos?/
(?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))'''
class CNNIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:edition|www|money|cnnespanol)\.)?cnn\.com/(?!audio/)(?P<display_id>[^?#]+?)(?:[?#]|$|/index\.html)'
_TESTS = [{
'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn',
'md5': '3e6121ea48df7e2259fe73a0628605c4',
'url': 'https://www.cnn.com/2024/05/31/sport/video/jadon-sancho-borussia-dortmund-champions-league-exclusive-spt-intl',
'info_dict': {
'id': 'sports/2013/06/09/nadal-1-on-1.cnn',
'id': 'med0e97ad0d154f56e29aa96e57192a14226734b6b',
'display_id': '2024/05/31/sport/video/jadon-sancho-borussia-dortmund-champions-league-exclusive-spt-intl',
'ext': 'mp4',
'title': 'Nadal wins 8th French Open title',
'description': 'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.',
'duration': 135,
'upload_date': '20130609',
'upload_date': '20240531',
'description': 'md5:844bcdb0629e1877a7a466c913f4c19c',
'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/gettyimages-2151936122.jpg?c=original',
'duration': 373.0,
'timestamp': 1717148586,
'title': 'Borussia Dortmund star Jadon Sancho seeks Wembley redemption after 2020 Euros hurt',
'modified_date': '20240531',
'modified_timestamp': 1717150140,
},
'expected_warnings': ['Failed to download m3u8 information'],
}, {
'url': 'http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29',
'md5': 'b5cc60c60a3477d185af8f19a2a26f4e',
'url': 'https://edition.cnn.com/2024/06/11/politics/video/inmates-vote-jail-nevada-murray-dnt-ac360-digvid',
'info_dict': {
'id': 'us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology',
'id': 'me522945c4709b299e5cb8657900a7a21ad3b559f9',
'display_id': '2024/06/11/politics/video/inmates-vote-jail-nevada-murray-dnt-ac360-digvid',
'ext': 'mp4',
'title': "Student's epic speech stuns new freshmen",
'description': 'A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from "2001: A Space Odyssey."',
'upload_date': '20130821',
'description': 'md5:e0120fe5da9ad8259fd707c1cbb64a60',
'title': 'Heres how some inmates in closely divided state are now able to vote from jail',
'timestamp': 1718158269,
'upload_date': '20240612',
'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/still-20701554-13565-571-still.jpg?c=original',
'duration': 202.0,
'modified_date': '20240612',
'modified_timestamp': 1718158509,
},
'expected_warnings': ['Failed to download m3u8 information'],
}, {
'url': 'http://www.cnn.com/video/data/2.0/video/living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln.html',
'md5': 'f14d02ebd264df951feb2400e2c25a1b',
'url': 'https://edition.cnn.com/2024/06/11/style/king-charles-portrait-vandalized/index.html',
'info_dict': {
'id': 'living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln',
'id': 'mef5f52b9e1fe28b1ad192afcbc9206ae984894b68',
'display_id': '2024/06/11/style/king-charles-portrait-vandalized',
'ext': 'mp4',
'title': 'Nashville Ep. 1: Hand crafted skateboards',
'description': 'md5:e7223a503315c9f150acac52e76de086',
'upload_date': '20141222',
'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/still-20701257-8846-816-still.jpg?c=original',
'description': 'md5:19f78338ccec533db0fa8a4511012dae',
'title': 'Video shows King Charles\' portrait being vandalized by activists',
'timestamp': 1718113852,
'upload_date': '20240611',
'duration': 51.0,
'modified_timestamp': 1718116193,
'modified_date': '20240611',
},
'expected_warnings': ['Failed to download m3u8 information'],
}, {
'url': 'http://money.cnn.com/video/news/2016/08/19/netflix-stunning-stats.cnnmoney/index.html',
'md5': '52a515dc1b0f001cd82e4ceda32be9d1',
'url': 'https://edition.cnn.com/videos/media/2022/12/05/robin-meade-final-sign-off-broadcast-hln-mxp-contd-vpx.hln',
'info_dict': {
'id': '/video/news/2016/08/19/netflix-stunning-stats.cnnmoney',
'id': 'mefba13799201b084ea3b1d0f7ca820ae94d4bb5b2',
'display_id': 'videos/media/2022/12/05/robin-meade-final-sign-off-broadcast-hln-mxp-contd-vpx.hln',
'ext': 'mp4',
'title': '5 stunning stats about Netflix',
'description': 'Did you know that Netflix has more than 80 million members? Here are five facts about the online video distributor that you probably didn\'t know.',
'upload_date': '20160819',
'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/221205163510-robin-meade-sign-off.jpg?c=original',
'duration': 158.0,
'title': 'Robin Meade signs off after HLN\'s last broadcast',
'description': 'md5:cff3c62d18d2fbc6c5c75cb029b7353b',
'upload_date': '20221205',
'timestamp': 1670284296,
'modified_timestamp': 1670332404,
'modified_date': '20221206',
},
'params': {
# m3u8 download
'skip_download': True,
'params': {'format': 'direct'},
}, {
'url': 'https://cnnespanol.cnn.com/video/ataque-misil-israel-beirut-libano-octubre-trax',
'info_dict': {
'id': 'me484a43722642aa00627b812fe928f2e99c6e2997',
'ext': 'mp4',
'display_id': 'video/ataque-misil-israel-beirut-libano-octubre-trax',
'timestamp': 1729501452,
'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/ataqeubeirut-1.jpg?c=original',
'description': 'md5:256ee7137d161f776cda429654135e52',
'upload_date': '20241021',
'duration': 31.0,
'title': 'VIDEO | Israel lanza un nuevo ataque sobre Beirut',
'modified_date': '20241021',
'modified_timestamp': 1729501530,
},
}, {
'url': 'http://cnn.com/video/?/video/politics/2015/03/27/pkg-arizona-senator-church-attendance-mandatory.ktvk',
'only_matching': True,
}, {
'url': 'http://cnn.com/video/?/video/us/2015/04/06/dnt-baker-refuses-anti-gay-order.wkmg',
'only_matching': True,
}, {
'url': 'http://edition.cnn.com/videos/arts/2016/04/21/olympic-games-cultural-a-z-brazil.cnn',
'only_matching': True,
'url': 'https://edition.cnn.com/2024/10/16/politics/kamala-harris-fox-news-interview/index.html',
'info_dict': {
'id': '2024/10/16/politics/kamala-harris-fox-news-interview',
},
'playlist_count': 2,
'playlist': [{
'md5': '073ffab87b8bef97c9913e71cc18ef9e',
'info_dict': {
'id': 'me19d548fdd54df0924087039283128ef473ab397d',
'ext': 'mp4',
'title': '\'I\'m not finished\': Harris interview with Fox News gets heated',
'display_id': 'kamala-harris-fox-news-interview-ebof-digvid',
'description': 'md5:e7dd3d1a04df916062230b60ca419a0a',
'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/harris-20241016234916617.jpg?c=original',
'duration': 173.0,
'timestamp': 1729122182,
'upload_date': '20241016',
'modified_timestamp': 1729194706,
'modified_date': '20241017',
},
'params': {'format': 'direct'},
}, {
'md5': '11604ab4af83b650826753f1ccb8ecff',
'info_dict': {
'id': 'med04507d8ca3da827001f63d22af321ec29c7d97b',
'ext': 'mp4',
'title': '\'Wise\': Buttigieg on Harris\' handling of interview question about gender transition surgery',
'display_id': 'pete-buttigieg-harris-fox-newssrc-digvid',
'description': 'md5:602a8a7e853ed5e574acd3159428c98e',
'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/buttigieg-20241017040412074.jpg?c=original',
'duration': 145.0,
'timestamp': 1729137765,
'upload_date': '20241017',
'modified_timestamp': 1729138184,
'modified_date': '20241017',
},
'params': {'format': 'direct'},
}],
}]
_CONFIG = {
# http://edition.cnn.com/.element/apps/cvp/3.0/cfg/spider/cnn/expansion/config.xml
'edition': {
'data_src': 'http://edition.cnn.com/video/data/3.0/video/%s/index.xml',
'media_src': 'http://pmd.cdn.turner.com/cnn/big',
},
# http://money.cnn.com/.element/apps/cvp2/cfg/config.xml
'money': {
'data_src': 'http://money.cnn.com/video/data/4.0/video/%s.xml',
'media_src': 'http://ht3.cdn.turner.com/money/big',
},
}
def _extract_timestamp(self, video_data):
# TODO: fix timestamp extraction
return None
def _real_extract(self, url):
sub_domain, path, page_title = self._match_valid_url(url).groups()
if sub_domain not in ('money', 'edition'):
sub_domain = 'edition'
config = self._CONFIG[sub_domain]
return self._extract_cvp_info(
config['data_src'] % path, page_title, {
'default': {
'media_src': config['media_src'],
},
'f4m': {
'host': 'cnn-vh.akamaihd.net',
},
display_id = self._match_valid_url(url).group('display_id')
webpage = self._download_webpage(url, display_id)
app_id = traverse_obj(
self._search_json(r'window\.env\s*=', webpage, 'window env', display_id, default={}),
('TOP_AUTH_SERVICE_APP_ID', {str}))
entries = []
for player_data in traverse_obj(webpage, (
{find_elements(tag='div', attr='data-component-name', value='video-player', html=True)},
..., {extract_attributes}, all, lambda _, v: v['data-media-id'])):
media_id = player_data['data-media-id']
parent_uri = player_data.get('data-video-resource-parent-uri')
formats, subtitles = [], {}
video_data = {}
if parent_uri:
video_data = self._download_json(
'https://fave.api.cnn.io/v1/video', media_id, fatal=False,
query={
'id': media_id,
'stellarUri': parent_uri,
})
for direct_url in traverse_obj(video_data, ('files', ..., 'fileUri', {url_or_none})):
resolution, bitrate = None, None
if mobj := re.search(r'-(?P<res>\d+x\d+)_(?P<tbr>\d+)k\.mp4', direct_url):
resolution, bitrate = mobj.group('res', 'tbr')
formats.append({
'url': direct_url,
'format_id': 'direct',
'quality': 1,
'tbr': int_or_none(bitrate),
**parse_resolution(resolution),
})
for sub_data in traverse_obj(video_data, (
'closedCaptions', 'types', lambda _, v: url_or_none(v['track']['url']), 'track')):
subtitles.setdefault(sub_data.get('lang') or 'en', []).append({
'url': sub_data['url'],
'name': sub_data.get('label'),
})
if app_id:
media_data = self._download_json(
f'https://medium.ngtv.io/v2/media/{media_id}/desktop', media_id, fatal=False,
query={'appId': app_id})
m3u8_url = traverse_obj(media_data, (
'media', 'desktop', 'unprotected', 'unencrypted', 'url', {url_or_none}))
if m3u8_url:
fmts, subs = self._extract_m3u8_formats_and_subtitles(
m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
entries.append({
**traverse_obj(player_data, {
'title': ('data-headline', {clean_html}),
'description': ('data-description', {clean_html}),
'duration': ('data-duration', {parse_duration}),
'timestamp': ('data-publish-date', {parse_iso8601}),
'thumbnail': (
'data-poster-image-override', {json.loads}, 'big', 'uri', {url_or_none},
{update_url(query='c=original')}),
'display_id': 'data-video-slug',
}),
**traverse_obj(video_data, {
'timestamp': ('dateCreated', 'uts', {int_or_none(scale=1000)}),
'description': ('description', {clean_html}),
'title': ('headline', {str}),
'modified_timestamp': ('lastModified', 'uts', {int_or_none(scale=1000)}),
'duration': ('trt', {int_or_none}),
}),
'id': media_id,
'formats': formats,
'subtitles': subtitles,
})
if len(entries) == 1:
return {
**entries[0],
'display_id': display_id,
}
class CNNBlogsIE(InfoExtractor):
_VALID_URL = r'https?://[^\.]+\.blogs\.cnn\.com/.+'
_TEST = {
'url': 'http://reliablesources.blogs.cnn.com/2014/02/09/criminalizing-journalism/',
'md5': '3e56f97b0b6ffb4b79f4ea0749551084',
'info_dict': {
'id': 'bestoftv/2014/02/09/criminalizing-journalism.cnn',
'ext': 'mp4',
'title': 'Criminalizing journalism?',
'description': 'Glenn Greenwald responds to comments made this week on Capitol Hill that journalists could be criminal accessories.',
'upload_date': '20140209',
},
'expected_warnings': ['Failed to download m3u8 information'],
'add_ie': ['CNN'],
}
def _real_extract(self, url):
webpage = self._download_webpage(url, url_basename(url))
cnn_url = self._html_search_regex(r'data-url="(.+?)"', webpage, 'cnn url')
return self.url_result(cnn_url, CNNIE.ie_key())
class CNNArticleIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!videos?/)'
_TEST = {
'url': 'http://www.cnn.com/2014/12/21/politics/obama-north-koreas-hack-not-war-but-cyber-vandalism/',
'md5': '689034c2a3d9c6dc4aa72d65a81efd01',
'info_dict': {
'id': 'bestoftv/2014/12/21/ip-north-korea-obama.cnn',
'ext': 'mp4',
'title': 'Obama: Cyberattack not an act of war',
'description': 'md5:0a802a40d2376f60e6b04c8d5bcebc4b',
'upload_date': '20141221',
},
'expected_warnings': ['Failed to download m3u8 information'],
'add_ie': ['CNN'],
}
def _real_extract(self, url):
webpage = self._download_webpage(url, url_basename(url))
cnn_url = self._html_search_regex(r"video:\s*'([^']+)'", webpage, 'cnn url')
return self.url_result('http://cnn.com/video/?/video/' + cnn_url, CNNIE.ie_key())
return self.playlist_result(entries, display_id)
class CNNIndonesiaIE(InfoExtractor):

View File

@@ -25,7 +25,6 @@ import xml.etree.ElementTree
from ..compat import (
compat_etree_fromstring,
compat_expanduser,
compat_os_name,
urllib_req_to_req,
)
from ..cookies import LenientSimpleCookie
@@ -47,6 +46,7 @@ from ..utils import (
FormatSorter,
GeoRestrictedError,
GeoUtils,
ISO639Utils,
LenientJSONDecoder,
Popen,
RegexNotFoundError,
@@ -278,6 +278,7 @@ class InfoExtractor:
thumbnails: A list of dictionaries, with the following entries:
* "id" (optional, string) - Thumbnail format ID
* "url"
* "ext" (optional, string) - actual image extension if not given in URL
* "preference" (optional, int) - quality of the image
* "width" (optional, int)
* "height" (optional, int)
@@ -333,7 +334,7 @@ class InfoExtractor:
like_count: Number of positive ratings of the video
dislike_count: Number of negative ratings of the video
repost_count: Number of reposts of the video
average_rating: Average rating give by users, the scale used depends on the webpage
average_rating: Average rating given by users, the scale used depends on the webpage
comment_count: Number of comments on the video
comments: A list of comments, each with one or more of the following
properties (all but one of text or html optional):
@@ -520,7 +521,7 @@ class InfoExtractor:
or _extract_from_webpage as necessary. While these are normally classmethods,
_extract_from_webpage is allowed to be an instance method.
_extract_from_webpage may raise self.StopExtraction() to stop further
_extract_from_webpage may raise self.StopExtraction to stop further
processing of the webpage and obtain exclusive rights to it. This is useful
when the extractor cannot reliably be matched using just the URL,
e.g. invidious/peertube instances
@@ -1027,7 +1028,7 @@ class InfoExtractor:
filename = sanitize_filename(f'{basen}.dump', restricted=True)
# Working around MAX_PATH limitation on Windows (see
# http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
if compat_os_name == 'nt':
if os.name == 'nt':
absfilepath = os.path.abspath(filename)
if len(absfilepath) > 259:
filename = fR'\\?\{absfilepath}'
@@ -1408,6 +1409,13 @@ class InfoExtractor:
return None, None
self.write_debug(f'Using netrc for {netrc_machine} authentication')
# compat: <=py3.10: netrc cannot parse tokens as empty strings, will return `""` instead
# Ref: https://github.com/yt-dlp/yt-dlp/issues/11413
# https://github.com/python/cpython/commit/15409c720be0503131713e3d3abc1acd0da07378
if sys.version_info < (3, 11):
return tuple(x if x != '""' else '' for x in info[::2])
return info[0], info[2]
def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
@@ -1570,7 +1578,9 @@ class InfoExtractor:
if default is not NO_DEFAULT:
fatal = False
for mobj in re.finditer(JSON_LD_RE, html):
json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
json_ld_item = self._parse_json(
mobj.group('json_ld'), video_id, fatal=fatal,
errnote=False if default is not NO_DEFAULT else None)
for json_ld in variadic(json_ld_item):
if isinstance(json_ld, dict):
yield json_ld
@@ -1844,12 +1854,26 @@ class InfoExtractor:
@staticmethod
def _remove_duplicate_formats(formats):
format_urls = set()
seen_urls = set()
seen_fragment_urls = set()
unique_formats = []
for f in formats:
if f['url'] not in format_urls:
format_urls.add(f['url'])
fragments = f.get('fragments')
if callable(fragments):
unique_formats.append(f)
elif fragments:
fragment_urls = frozenset(
fragment.get('url') or urljoin(f['fragment_base_url'], fragment['path'])
for fragment in fragments)
if fragment_urls not in seen_fragment_urls:
seen_fragment_urls.add(fragment_urls)
unique_formats.append(f)
elif f['url'] not in seen_urls:
seen_urls.add(f['url'])
unique_formats.append(f)
formats[:] = unique_formats
def _is_valid_url(self, url, video_id, item='video', headers={}):
@@ -3071,7 +3095,11 @@ class InfoExtractor:
url_pattern = stream.attrib['Url']
stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
stream_name = stream.get('Name')
stream_language = stream.get('Language', 'und')
# IsmFD expects ISO 639 Set 2 language codes (3-character length)
# See: https://github.com/yt-dlp/yt-dlp/issues/11356
stream_language = stream.get('Language') or 'und'
if len(stream_language) != 3:
stream_language = ISO639Utils.short2long(stream_language) or 'und'
for track in stream.findall('QualityLevel'):
KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
@@ -3753,7 +3781,7 @@ class InfoExtractor:
""" Merge subtitle dictionaries, language by language. """
if target is None:
target = {}
for d in dicts:
for d in filter(None, dicts):
for lang, subs in d.items():
target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
return target
@@ -3775,7 +3803,7 @@ class InfoExtractor:
def mark_watched(self, *args, **kwargs):
if not self.get_param('mark_watched', False):
return
if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
if (self.supports_login() and self._get_login_info()[0] is not None) or self._cookies_passed:
self._mark_watched(*args, **kwargs)
def _mark_watched(self, *args, **kwargs):

View File

@@ -12,6 +12,7 @@ from ..utils import (
parse_iso8601,
strip_or_none,
try_get,
urljoin,
)
@@ -112,8 +113,7 @@ class CondeNastIE(InfoExtractor):
m_paths = re.finditer(
r'(?s)<p class="cne-thumb-title">.*?<a href="(/watch/.+?)["\?]', webpage)
paths = orderedSet(m.group(1) for m in m_paths)
build_url = lambda path: urllib.parse.urljoin(base_url, path)
entries = [self.url_result(build_url(path), 'CondeNast') for path in paths]
entries = [self.url_result(urljoin(base_url, path), 'CondeNast') for path in paths]
return self.playlist_result(entries, playlist_title=title)
def _extract_video_params(self, webpage, display_id):

View File

@@ -456,7 +456,7 @@ class CrunchyrollBetaIE(CrunchyrollCmsBaseIE):
}),
}),
**traverse_obj(metadata, {
'duration': ('duration_ms', {lambda x: float_or_none(x, 1000)}),
'duration': ('duration_ms', {float_or_none(scale=1000)}),
'timestamp': ('upload_date', {parse_iso8601}),
'series': ('series_title', {str}),
'series_id': ('series_id', {str}),
@@ -484,7 +484,7 @@ class CrunchyrollBetaIE(CrunchyrollCmsBaseIE):
}),
}),
**traverse_obj(metadata, {
'duration': ('duration_ms', {lambda x: float_or_none(x, 1000)}),
'duration': ('duration_ms', {float_or_none(scale=1000)}),
'age_limit': ('maturity_ratings', -1, {parse_age_limit}),
}),
}

View File

@@ -1,14 +1,27 @@
import json
import re
import urllib.parse
from .common import InfoExtractor
from ..utils import orderedSet
from .ninecninemedia import NineCNineMediaIE
from ..utils import extract_attributes, orderedSet
from ..utils.traversal import find_element, traverse_obj
class CTVNewsIE(InfoExtractor):
_VALID_URL = r'https?://(?:.+?\.)?ctvnews\.ca/(?:video\?(?:clip|playlist|bin)Id=|.*?)(?P<id>[0-9.]+)'
_BASE_REGEX = r'https?://(?:[^.]+\.)?ctvnews\.ca/'
_VIDEO_ID_RE = r'(?P<id>\d{5,})'
_PLAYLIST_ID_RE = r'(?P<id>\d\.\d{5,})'
_VALID_URL = [
rf'{_BASE_REGEX}video/c{_VIDEO_ID_RE}',
rf'{_BASE_REGEX}video(?:-gallery)?/?\?clipId={_VIDEO_ID_RE}',
rf'{_BASE_REGEX}video/?\?(?:playlist|bin)Id={_PLAYLIST_ID_RE}',
rf'{_BASE_REGEX}(?!video/)[^?#]*?{_PLAYLIST_ID_RE}/?(?:$|[?#])',
rf'{_BASE_REGEX}(?!video/)[^?#]+\?binId={_PLAYLIST_ID_RE}',
]
_TESTS = [{
'url': 'http://www.ctvnews.ca/video?clipId=901995',
'md5': '9b8624ba66351a23e0b6e1391971f9af',
'md5': 'b608f466c7fa24b9666c6439d766ab7e',
'info_dict': {
'id': '901995',
'ext': 'flv',
@@ -16,6 +29,33 @@ class CTVNewsIE(InfoExtractor):
'description': 'md5:958dd3b4f5bbbf0ed4d045c790d89285',
'timestamp': 1467286284,
'upload_date': '20160630',
'categories': [],
'season_number': 0,
'season': 'Season 0',
'tags': [],
'series': 'CTV News National | Archive | Stories 2',
'season_id': '57981',
'thumbnail': r're:https?://.*\.jpg$',
'duration': 764.631,
},
}, {
'url': 'https://barrie.ctvnews.ca/video/c3030933-here_s-what_s-making-news-for-nov--15?binId=1272429',
'md5': '8b8c2b33c5c1803e3c26bc74ff8694d5',
'info_dict': {
'id': '3030933',
'ext': 'flv',
'title': 'Heres whats making news for Nov. 15',
'description': 'Here are the top stories were working on for CTV News at 11 for Nov. 15',
'thumbnail': 'http://images2.9c9media.com/image_asset/2021_2_22_a602e68e-1514-410e-a67a-e1f7cccbacab_png_2000x1125.jpg',
'season_id': '58104',
'season_number': 0,
'tags': [],
'season': 'Season 0',
'categories': [],
'series': 'CTV News Barrie',
'upload_date': '20241116',
'duration': 42.943,
'timestamp': 1731722452,
},
}, {
'url': 'http://www.ctvnews.ca/video?playlistId=1.2966224',
@@ -31,6 +71,72 @@ class CTVNewsIE(InfoExtractor):
'id': '1.2876780',
},
'playlist_mincount': 100,
}, {
'url': 'https://www.ctvnews.ca/it-s-been-23-years-since-toronto-called-in-the-army-after-a-major-snowstorm-1.5736957',
'info_dict':
{
'id': '1.5736957',
},
'playlist_mincount': 6,
}, {
'url': 'https://www.ctvnews.ca/business/respondents-to-bank-of-canada-questionnaire-largely-oppose-creating-a-digital-loonie-1.6665797',
'md5': '24bc4b88cdc17d8c3fc01dfc228ab72c',
'info_dict': {
'id': '2695026',
'ext': 'flv',
'season_id': '89852',
'series': 'From CTV News Channel',
'description': 'md5:796a985a23cacc7e1e2fafefd94afd0a',
'season': '2023',
'title': 'Bank of Canada asks public about digital currency',
'categories': [],
'tags': [],
'upload_date': '20230526',
'season_number': 2023,
'thumbnail': 'http://images2.9c9media.com/image_asset/2019_3_28_35f5afc3-10f6-4d92-b194-8b9a86f55c6a_png_1920x1080.jpg',
'timestamp': 1685105157,
'duration': 253.553,
},
}, {
'url': 'https://stox.ctvnews.ca/video-gallery?clipId=582589',
'md5': '135cc592df607d29dddc931f1b756ae2',
'info_dict': {
'id': '582589',
'ext': 'flv',
'categories': [],
'timestamp': 1427906183,
'season_number': 0,
'duration': 125.559,
'thumbnail': 'http://images2.9c9media.com/image_asset/2019_3_28_35f5afc3-10f6-4d92-b194-8b9a86f55c6a_png_1920x1080.jpg',
'series': 'CTV News Stox',
'description': 'CTV original footage of the rise and fall of the Berlin Wall.',
'title': 'Berlin Wall',
'season_id': '63817',
'season': 'Season 0',
'tags': [],
'upload_date': '20150401',
},
}, {
'url': 'https://ottawa.ctvnews.ca/features/regional-contact/regional-contact-archive?binId=1.1164587#3023759',
'md5': 'a14c0603557decc6531260791c23cc5e',
'info_dict': {
'id': '3023759',
'ext': 'flv',
'season_number': 2024,
'timestamp': 1731798000,
'season': '2024',
'episode': 'Episode 125',
'description': 'CTV News Ottawa at Six',
'duration': 2712.076,
'episode_number': 125,
'upload_date': '20241116',
'title': 'CTV News Ottawa at Six for Saturday, November 16, 2024',
'thumbnail': 'http://images2.9c9media.com/image_asset/2019_3_28_35f5afc3-10f6-4d92-b194-8b9a86f55c6a_png_1920x1080.jpg',
'categories': [],
'tags': [],
'series': 'CTV News Ottawa at Six',
'season_id': '92667',
},
}, {
'url': 'http://www.ctvnews.ca/1.810401',
'only_matching': True,
@@ -42,29 +148,35 @@ class CTVNewsIE(InfoExtractor):
'only_matching': True,
}]
def _ninecninemedia_url_result(self, clip_id):
return self.url_result(f'9c9media:ctvnews_web:{clip_id}', NineCNineMediaIE, clip_id)
def _real_extract(self, url):
page_id = self._match_id(url)
def ninecninemedia_url_result(clip_id):
return {
'_type': 'url_transparent',
'id': clip_id,
'url': f'9c9media:ctvnews_web:{clip_id}',
'ie_key': 'NineCNineMedia',
}
if mobj := re.fullmatch(self._VIDEO_ID_RE, urllib.parse.urlparse(url).fragment):
page_id = mobj.group('id')
if page_id.isdigit():
return ninecninemedia_url_result(page_id)
else:
webpage = self._download_webpage(f'http://www.ctvnews.ca/{page_id}', page_id, query={
'ot': 'example.AjaxPageLayout.ot',
'maxItemsPerPage': 1000000,
})
entries = [ninecninemedia_url_result(clip_id) for clip_id in orderedSet(
re.findall(r'clip\.id\s*=\s*(\d+);', webpage))]
if not entries:
webpage = self._download_webpage(url, page_id)
if 'getAuthStates("' in webpage:
entries = [ninecninemedia_url_result(clip_id) for clip_id in
self._search_regex(r'getAuthStates\("([\d+,]+)"', webpage, 'clip ids').split(',')]
return self.playlist_result(entries, page_id)
if re.fullmatch(self._VIDEO_ID_RE, page_id):
return self._ninecninemedia_url_result(page_id)
webpage = self._download_webpage(f'https://www.ctvnews.ca/{page_id}', page_id, query={
'ot': 'example.AjaxPageLayout.ot',
'maxItemsPerPage': 1000000,
})
entries = [self._ninecninemedia_url_result(clip_id)
for clip_id in orderedSet(re.findall(r'clip\.id\s*=\s*(\d+);', webpage))]
if not entries:
webpage = self._download_webpage(url, page_id)
if 'getAuthStates("' in webpage:
entries = [self._ninecninemedia_url_result(clip_id) for clip_id in
self._search_regex(r'getAuthStates\("([\d+,]+)"', webpage, 'clip ids').split(',')]
else:
entries = [
self._ninecninemedia_url_result(clip_id) for clip_id in
traverse_obj(webpage, (
{find_element(tag='jasper-player-container', html=True)},
{extract_attributes}, 'axis-ids', {json.loads}, ..., 'axisId', {str}))
]
return self.playlist_result(entries, page_id)

View File

@@ -1,7 +1,4 @@
import time
from .common import InfoExtractor
from ..networking import HEADRequest
from ..utils import int_or_none
@@ -31,9 +28,6 @@ class CultureUnpluggedIE(InfoExtractor):
video_id = mobj.group('id')
display_id = mobj.group('display_id') or video_id
# request setClientTimezone.php to get PHPSESSID cookie which is need to get valid json data in the next request
self._request_webpage(HEADRequest(
'http://www.cultureunplugged.com/setClientTimezone.php?timeOffset=%d' % -(time.timezone / 3600)), display_id)
movie_data = self._download_json(
f'http://www.cultureunplugged.com/movie-data/cu-{video_id}.json', display_id)

View File

@@ -1,3 +1,4 @@
import functools
import hashlib
import re
import time
@@ -51,6 +52,15 @@ class DacastVODIE(DacastBaseIE):
'thumbnail': 'https://universe-files.dacast.com/26137208-5858-65c1-5e9a-9d6b6bd2b6c2',
},
'params': {'skip_download': 'm3u8'},
}, { # /uspaes/ in hls_url
'url': 'https://iframe.dacast.com/vod/f9823fc6-faba-b98f-0d00-4a7b50a58c5b/348c5c84-b6af-4859-bb9d-1d01009c795b',
'info_dict': {
'id': '348c5c84-b6af-4859-bb9d-1d01009c795b',
'ext': 'mp4',
'title': 'pl1-edyta-rubas-211124.mp4',
'uploader_id': 'f9823fc6-faba-b98f-0d00-4a7b50a58c5b',
'thumbnail': 'https://universe-files.dacast.com/4d0bd042-a536-752d-fc34-ad2fa44bbcbb.png',
},
}]
_WEBPAGE_TESTS = [{
'url': 'https://www.dacast.com/support/knowledgebase/how-can-i-embed-a-video-on-my-website/',
@@ -74,6 +84,15 @@ class DacastVODIE(DacastBaseIE):
'params': {'skip_download': 'm3u8'},
}]
@functools.cached_property
def _usp_signing_secret(self):
player_js = self._download_webpage(
'https://player.dacast.com/js/player.js', None, 'Downloading player JS')
# Rotates every so often, but hardcode a fallback in case of JS change/breakage before rotation
return self._search_regex(
r'\bUSP_SIGNING_SECRET\s*=\s*(["\'])(?P<secret>(?:(?!\1).)+)', player_js,
'usp signing secret', group='secret', fatal=False) or 'odnInCGqhvtyRTtIiddxtuRtawYYICZP'
def _real_extract(self, url):
user_id, video_id = self._match_valid_url(url).group('user_id', 'id')
query = {'contentId': f'{user_id}-vod-{video_id}', 'provider': 'universe'}
@@ -94,10 +113,10 @@ class DacastVODIE(DacastBaseIE):
if 'DRM_EXT' in hls_url:
self.report_drm(video_id)
elif '/uspaes/' in hls_url:
# From https://player.dacast.com/js/player.js
# Ref: https://player.dacast.com/js/player.js
ts = int(time.time())
signature = hashlib.sha1(
f'{10413792000 - ts}{ts}YfaKtquEEpDeusCKbvYszIEZnWmBcSvw').digest().hex()
f'{10413792000 - ts}{ts}{self._usp_signing_secret}'.encode()).digest().hex()
hls_aes['uri'] = f'https://keys.dacast.com/uspaes/{video_id}.key?s={signature}&ts={ts}'
for retry in self.RetryManager():

View File

@@ -10,11 +10,14 @@ from ..utils import (
OnDemandPagedList,
age_restricted,
clean_html,
extract_attributes,
int_or_none,
traverse_obj,
try_get,
unescapeHTML,
unsmuggle_url,
update_url,
url_or_none,
urlencode_postdata,
)
@@ -98,12 +101,20 @@ class DailymotionBaseInfoExtractor(InfoExtractor):
class DailymotionIE(DailymotionBaseInfoExtractor):
_VALID_URL = r'''(?ix)
https?://
(?:
dai\.ly/|
(?:
(?:(?:www|touch|geo)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:(?:embed|swf|\#)/)|player(?:/\w+)?\.html\?)?video|swf)|
(?:www\.)?lequipe\.fr/video
(?:(?:www|touch|geo)\.)?dailymotion\.[a-z]{2,3}|
(?:www\.)?lequipe\.fr
)/
(?:
swf/(?!video)|
(?:(?:crawler|embed|swf)/)?video/|
player(?:/[\da-z]+)?\.html\?(?:video|(?P<is_playlist>playlist))=
)
[/=](?P<id>[^/?_&]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))?
'''
)
(?P<id>[^/?_&#]+)(?:[\w-]*\?playlist=(?P<playlist_id>x[0-9a-z]+))?
'''
IE_NAME = 'dailymotion'
_EMBED_REGEX = [r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1']
_TESTS = [{
@@ -123,7 +134,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
'view_count': int,
'like_count': int,
'tags': ['hollywood', 'celeb', 'celebrity', 'movies', 'red carpet'],
'thumbnail': r're:https://(?:s[12]\.)dmcdn\.net/v/K456B1aXqIx58LKWQ/x1080',
'thumbnail': r're:https://(?:s[12]\.)dmcdn\.net/v/K456B1cmt4ZcZ9KiM/x1080',
},
}, {
'url': 'https://geo.dailymotion.com/player.html?video=x89eyek&mute=true',
@@ -142,7 +153,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
'view_count': int,
'like_count': int,
'tags': ['en_quete_d_esprit'],
'thumbnail': r're:https://(?:s[12]\.)dmcdn\.net/v/Tncwi1YNg_RUl7ueu/x1080',
'thumbnail': r're:https://(?:s[12]\.)dmcdn\.net/v/Tncwi1clTH6StrxMP/x1080',
},
}, {
'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames',
@@ -217,6 +228,86 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
}, {
'url': 'https://geo.dailymotion.com/player/xakln.html?video=x8mjju4&customConfig%5BcustomParams%5D=%2Ffr-fr%2Ftennis%2Fwimbledon-mens-singles%2Farticles-video',
'only_matching': True,
}, { # playlist-only
'url': 'https://geo.dailymotion.com/player/xf7zn.html?playlist=x7wdsj',
'only_matching': True,
}, {
'url': 'https://geo.dailymotion.com/player/xmyye.html?video=x93blhi',
'only_matching': True,
}, {
'url': 'https://www.dailymotion.com/crawler/video/x8u4owg',
'only_matching': True,
}, {
'url': 'https://www.dailymotion.com/embed/video/x8u4owg',
'only_matching': True,
}, {
'url': 'https://dai.ly/x94cnnk',
'only_matching': True,
}]
_WEBPAGE_TESTS = [{
# https://geo.dailymotion.com/player/xmyye.html?video=x93blhi
'url': 'https://www.financialounge.com/video/2024/08/01/borse-europee-in-rosso-dopo-la-fed-a-milano-volano-mediobanca-e-tim-edizione-del-1-agosto/',
'info_dict': {
'id': 'x93blhi',
'ext': 'mp4',
'title': 'OnAir - 01/08/24',
'description': '',
'duration': 217,
'timestamp': 1722505658,
'upload_date': '20240801',
'uploader': 'Financialounge',
'uploader_id': 'x2vtgmm',
'age_limit': 0,
'tags': [],
'view_count': int,
'like_count': int,
'thumbnail': r're:https://\w+.dmcdn.net/v/WnEY61cmvMxt2Fi6d/x1080',
},
}, {
# https://geo.dailymotion.com/player/xf7zn.html?playlist=x7wdsj
'url': 'https://www.cycleworld.com/blogs/ask-kevin/ducati-continues-to-evolve-with-v4/',
'info_dict': {
'id': 'x7wdsj',
},
'playlist_mincount': 50,
}, {
# https://www.dailymotion.com/crawler/video/x8u4owg
'url': 'https://www.leparisien.fr/environnement/video-le-veloto-la-voiture-a-pedales-qui-aimerait-se-faire-une-place-sur-les-routes-09-03-2024-KCYMCPM4WFHJXMSKBUI66UNFPU.php',
'info_dict': {
'id': 'x8u4owg',
'ext': 'mp4',
'like_count': int,
'uploader': 'Le Parisien',
'thumbnail': 'https://www.leparisien.fr/resizer/ho_GwveeYftNkLwg_cEta--5Bv4=/1200x675/cloudfront-eu-central-1.images.arcpublishing.com/leparisien/BFXJNEBN75EUNHGYJLORUC3TX4.jpg',
'upload_date': '20240309',
'view_count': int,
'timestamp': 1709997866,
'age_limit': 0,
'uploader_id': 'x32f7b',
'title': 'VIDÉO. Le «\xa0véloto\xa0», la voiture à pédales qui aimerait se faire une place sur les routes',
'duration': 428.0,
'description': 'À bord du « véloto », lalternative à la voiture pour la campagne',
'tags': ['biclou', 'vélo', 'véloto', 'campagne', 'voiture', 'environnement', 'véhicules intermédiaires'],
},
}, {
# https://geo.dailymotion.com/player/xry80.html?video=x8vu47w
'url': 'https://www.metatube.com/en/videos/546765/This-frogs-decorates-Christmas-tree/',
'info_dict': {
'id': 'x8vu47w',
'ext': 'mp4',
'like_count': int,
'uploader': 'Metatube',
'thumbnail': r're:https://\w+.dmcdn.net/v/W1G_S1coGSFTfkTeR/x1080',
'upload_date': '20240326',
'view_count': int,
'timestamp': 1711496732,
'age_limit': 0,
'uploader_id': 'x2xpy74',
'title': 'Está lindas ranitas ponen su arbolito',
'duration': 28,
'description': 'Que lindura',
'tags': [],
},
}]
_GEO_BYPASS = False
_COMMON_MEDIA_FIELDS = '''description
@@ -231,17 +322,36 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
yield from super()._extract_embed_urls(url, webpage)
for mobj in re.finditer(
r'(?s)DM\.player\([^,]+,\s*{.*?video[\'"]?\s*:\s*["\']?(?P<id>[0-9a-zA-Z]+).+?}\s*\);', webpage):
yield from 'https://www.dailymotion.com/embed/video/' + mobj.group('id')
yield 'https://www.dailymotion.com/embed/video/' + mobj.group('id')
for mobj in re.finditer(
r'(?s)<script [^>]*\bsrc=(["\'])(?:https?:)?//[\w-]+\.dailymotion\.com/player/(?:(?!\1).)+\1[^>]*>', webpage):
attrs = extract_attributes(mobj.group(0))
player_url = url_or_none(attrs.get('src'))
if not player_url:
continue
player_url = player_url.replace('.js', '.html')
if player_url.startswith('//'):
player_url = f'https:{player_url}'
if video_id := attrs.get('data-video'):
query_string = f'video={video_id}'
elif playlist_id := attrs.get('data-playlist'):
query_string = f'playlist={playlist_id}'
else:
continue
yield update_url(player_url, query=query_string)
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url)
video_id, playlist_id = self._match_valid_url(url).groups()
video_id, is_playlist, playlist_id = self._match_valid_url(url).group('id', 'is_playlist', 'playlist_id')
if playlist_id:
if self._yes_playlist(playlist_id, video_id):
return self.url_result(
'http://www.dailymotion.com/playlist/' + playlist_id,
'DailymotionPlaylist', playlist_id)
if is_playlist: # We matched the playlist query param as video_id
playlist_id = video_id
video_id = None
if self._yes_playlist(playlist_id, video_id):
return self.url_result(
f'http://www.dailymotion.com/playlist/{playlist_id}',
'DailymotionPlaylist', playlist_id)
password = self.get_param('videopassword')
media = self._call_api(
@@ -282,6 +392,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
title = metadata['title']
is_live = media.get('isOnAir')
formats = []
subtitles = {}
for quality, media_list in metadata['qualities'].items():
for m in media_list:
media_url = m.get('url')
@@ -289,8 +401,10 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
if not media_url or media_type == 'application/vnd.lumberjack.manifest':
continue
if media_type == 'application/x-mpegURL':
formats.extend(self._extract_m3u8_formats(
media_url, video_id, 'mp4', live=is_live, m3u8_id='hls', fatal=False))
fmt, subs = self._extract_m3u8_formats_and_subtitles(
media_url, video_id, 'mp4', live=is_live, m3u8_id='hls', fatal=False)
formats.extend(fmt)
self._merge_subtitles(subs, target=subtitles)
else:
f = {
'url': media_url,
@@ -310,20 +424,18 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
if not f.get('fps') and f['format_id'].endswith('@60'):
f['fps'] = 60
subtitles = {}
subtitles_data = try_get(metadata, lambda x: x['subtitles']['data'], dict) or {}
for subtitle_lang, subtitle in subtitles_data.items():
subtitles[subtitle_lang] = [{
'url': subtitle_url,
} for subtitle_url in subtitle.get('urls', [])]
thumbnails = []
for height, poster_url in metadata.get('posters', {}).items():
thumbnails.append({
'height': int_or_none(height),
'id': height,
'url': poster_url,
})
thumbnails = traverse_obj(metadata, (
('posters', 'thumbnails'), {dict.items}, lambda _, v: url_or_none(v[1]), {
'height': (0, {int_or_none}),
'id': (0, {str}),
'url': 1,
}))
owner = metadata.get('owner') or {}
stats = media.get('stats') or {}
@@ -447,7 +559,7 @@ class DailymotionSearchIE(DailymotionPlaylistBaseIE):
class DailymotionUserIE(DailymotionPlaylistBaseIE):
IE_NAME = 'dailymotion:user'
_VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist|search)/)(?:(?:old/)?user/)?(?P<id>[^/?#]+)'
_VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist|search|crawler)/)(?:(?:old/)?user/)?(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'https://www.dailymotion.com/user/nqtv',
'info_dict': {

View File

@@ -40,7 +40,7 @@ class DangalPlayBaseIE(InfoExtractor):
'id': ('content_id', {str}),
'title': ('display_title', {str}),
'episode': ('title', {str}),
'series': ('show_name', {str}, {lambda x: x or None}),
'series': ('show_name', {str}, filter),
'series_id': ('catalog_id', {str}),
'duration': ('duration', {int_or_none}),
'release_timestamp': ('release_date_uts', {int_or_none}),

View File

@@ -1,7 +1,10 @@
import time
from .common import InfoExtractor
from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
jwt_decode_hs256,
parse_codecs,
try_get,
url_or_none,
@@ -13,9 +16,6 @@ from ..utils.traversal import traverse_obj
class DigitalConcertHallIE(InfoExtractor):
IE_DESC = 'DigitalConcertHall extractor'
_VALID_URL = r'https?://(?:www\.)?digitalconcerthall\.com/(?P<language>[a-z]+)/(?P<type>film|concert|work)/(?P<id>[0-9]+)-?(?P<part>[0-9]+)?'
_OAUTH_URL = 'https://api.digitalconcerthall.com/v2/oauth2/token'
_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15'
_ACCESS_TOKEN = None
_NETRC_MACHINE = 'digitalconcerthall'
_TESTS = [{
'note': 'Playlist with only one video',
@@ -69,59 +69,157 @@ class DigitalConcertHallIE(InfoExtractor):
'params': {'skip_download': 'm3u8'},
'playlist_count': 1,
}]
_LOGIN_HINT = ('Use --username token --password ACCESS_TOKEN where ACCESS_TOKEN '
'is the "access_token_production" from your browser local storage')
_REFRESH_HINT = 'or else use a "refresh_token" with --username refresh --password REFRESH_TOKEN'
_OAUTH_URL = 'https://api.digitalconcerthall.com/v2/oauth2/token'
_CLIENT_ID = 'dch.webapp'
_CLIENT_SECRET = '2ySLN+2Fwb'
_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15'
_OAUTH_HEADERS = {
'Accept': 'application/json',
'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
'Origin': 'https://www.digitalconcerthall.com',
'Referer': 'https://www.digitalconcerthall.com/',
'User-Agent': _USER_AGENT,
}
_access_token = None
_access_token_expiry = 0
_refresh_token = None
def _perform_login(self, username, password):
login_token = self._download_json(
self._OAUTH_URL,
None, 'Obtaining token', errnote='Unable to obtain token', data=urlencode_postdata({
@property
def _access_token_is_expired(self):
return self._access_token_expiry - 30 <= int(time.time())
def _set_access_token(self, value):
self._access_token = value
self._access_token_expiry = traverse_obj(value, ({jwt_decode_hs256}, 'exp', {int})) or 0
def _cache_tokens(self, /):
self.cache.store(self._NETRC_MACHINE, 'tokens', {
'access_token': self._access_token,
'refresh_token': self._refresh_token,
})
def _fetch_new_tokens(self, invalidate=False):
if invalidate:
self.report_warning('Access token has been invalidated')
self._set_access_token(None)
if not self._access_token_is_expired:
return
if not self._refresh_token:
self._set_access_token(None)
self._cache_tokens()
raise ExtractorError(
'Access token has expired or been invalidated. '
'Get a new "access_token_production" value from your browser '
f'and try again, {self._REFRESH_HINT}', expected=True)
# If we only have a refresh token, we need a temporary "initial token" for the refresh flow
bearer_token = self._access_token or self._download_json(
self._OAUTH_URL, None, 'Obtaining initial token', 'Unable to obtain initial token',
data=urlencode_postdata({
'affiliate': 'none',
'grant_type': 'device',
'device_vendor': 'unknown',
# device_model 'Safari' gets split streams of 4K/HEVC video and lossless/FLAC audio
'device_model': 'unknown' if self._configuration_arg('prefer_combined_hls') else 'Safari',
'app_id': 'dch.webapp',
# device_model 'Safari' gets split streams of 4K/HEVC video and lossless/FLAC audio,
# but this is no longer effective since actual login is not possible anymore
'device_model': 'unknown',
'app_id': self._CLIENT_ID,
'app_distributor': 'berlinphil',
'app_version': '1.84.0',
'client_secret': '2ySLN+2Fwb',
}), headers={
'Accept': 'application/json',
'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
'User-Agent': self._USER_AGENT,
})['access_token']
'app_version': '1.95.0',
'client_secret': self._CLIENT_SECRET,
}), headers=self._OAUTH_HEADERS)['access_token']
try:
login_response = self._download_json(
self._OAUTH_URL,
None, note='Logging in', errnote='Unable to login', data=urlencode_postdata({
'grant_type': 'password',
'username': username,
'password': password,
response = self._download_json(
self._OAUTH_URL, None, 'Refreshing token', 'Unable to refresh token',
data=urlencode_postdata({
'grant_type': 'refresh_token',
'refresh_token': self._refresh_token,
'client_id': self._CLIENT_ID,
'client_secret': self._CLIENT_SECRET,
}), headers={
'Accept': 'application/json',
'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
'Referer': 'https://www.digitalconcerthall.com',
'Authorization': f'Bearer {login_token}',
'User-Agent': self._USER_AGENT,
**self._OAUTH_HEADERS,
'Authorization': f'Bearer {bearer_token}',
})
except ExtractorError as error:
if isinstance(error.cause, HTTPError) and error.cause.status == 401:
raise ExtractorError('Invalid username or password', expected=True)
except ExtractorError as e:
if isinstance(e.cause, HTTPError) and e.cause.status == 401:
self._set_access_token(None)
self._refresh_token = None
self._cache_tokens()
raise ExtractorError('Your tokens have been invalidated', expected=True)
raise
self._ACCESS_TOKEN = login_response['access_token']
self._set_access_token(response['access_token'])
if refresh_token := traverse_obj(response, ('refresh_token', {str})):
self.write_debug('New refresh token granted')
self._refresh_token = refresh_token
self._cache_tokens()
def _perform_login(self, username, password):
self.report_login()
if username == 'refresh':
self._refresh_token = password
self._fetch_new_tokens()
if username == 'token':
if not traverse_obj(password, {jwt_decode_hs256}):
raise ExtractorError(
f'The access token passed to yt-dlp is not valid. {self._LOGIN_HINT}', expected=True)
self._set_access_token(password)
self._cache_tokens()
if username in ('refresh', 'token'):
if self.get_param('cachedir') is not False:
token_type = 'access' if username == 'token' else 'refresh'
self.to_screen(f'Your {token_type} token has been cached to disk. To use the cached '
'token next time, pass --username cache along with any password')
return
if username != 'cache':
raise ExtractorError(
'Login with username and password is no longer supported '
f'for this site. {self._LOGIN_HINT}, {self._REFRESH_HINT}', expected=True)
# Try cached access_token
cached_tokens = self.cache.load(self._NETRC_MACHINE, 'tokens', default={})
self._set_access_token(cached_tokens.get('access_token'))
self._refresh_token = cached_tokens.get('refresh_token')
if not self._access_token_is_expired:
return
# Try cached refresh_token
self._fetch_new_tokens(invalidate=True)
def _real_initialize(self):
if not self._ACCESS_TOKEN:
self.raise_login_required(method='password')
if not self._access_token:
self.raise_login_required(
'All content on this site is only available for registered users. '
f'{self._LOGIN_HINT}, {self._REFRESH_HINT}', method=None)
def _entries(self, items, language, type_, **kwargs):
for item in items:
video_id = item['id']
stream_info = self._download_json(
self._proto_relative_url(item['_links']['streams']['href']), video_id, headers={
'Accept': 'application/json',
'Authorization': f'Bearer {self._ACCESS_TOKEN}',
'Accept-Language': language,
'User-Agent': self._USER_AGENT,
})
for should_retry in (True, False):
self._fetch_new_tokens(invalidate=not should_retry)
try:
stream_info = self._download_json(
self._proto_relative_url(item['_links']['streams']['href']), video_id, headers={
'Accept': 'application/json',
'Authorization': f'Bearer {self._access_token}',
'Accept-Language': language,
'User-Agent': self._USER_AGENT,
})
break
except ExtractorError as error:
if should_retry and isinstance(error.cause, HTTPError) and error.cause.status == 401:
continue
raise
formats = []
for m3u8_url in traverse_obj(stream_info, ('channel', ..., 'stream', ..., 'url', {url_or_none})):
@@ -157,7 +255,6 @@ class DigitalConcertHallIE(InfoExtractor):
'Accept': 'application/json',
'Accept-Language': language,
'User-Agent': self._USER_AGENT,
'Authorization': f'Bearer {self._ACCESS_TOKEN}',
})
videos = [vid_info] if type_ == 'film' else traverse_obj(vid_info, ('_embedded', ..., ...))

View File

@@ -48,32 +48,30 @@ class DropboxIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
fn = urllib.parse.unquote(url_basename(url))
title = os.path.splitext(fn)[0]
password = self.get_param('videopassword')
content_id = None
for part in self._yield_decoded_parts(webpage):
if '/sm/password' in part:
webpage = self._download_webpage(
update_url('https://www.dropbox.com/sm/password', query=part.partition('?')[2]), video_id)
content_id = self._search_regex(r'content_id=([\w.+=/-]+)', part, 'content ID')
break
if (self._og_search_title(webpage, default=None) == 'Dropbox - Password Required'
or 'Enter the password for this link' in webpage):
if password:
response = self._download_json(
'https://www.dropbox.com/sm/auth', video_id, 'POSTing video password',
headers={'content-type': 'application/x-www-form-urlencoded; charset=UTF-8'},
data=urlencode_postdata({
'is_xhr': 'true',
't': self._get_cookies('https://www.dropbox.com')['t'].value,
'content_id': self._search_regex(r'content_id=([\w.+=/-]+)["\']', webpage, 'content id'),
'password': password,
'url': url,
}))
if response.get('status') != 'authed':
raise ExtractorError('Invalid password', expected=True)
elif not self._get_cookies('https://dropbox.com').get('sm_auth'):
if content_id:
password = self.get_param('videopassword')
if not password:
raise ExtractorError('Password protected video, use --video-password <password>', expected=True)
response = self._download_json(
'https://www.dropbox.com/sm/auth', video_id, 'POSTing video password',
data=urlencode_postdata({
'is_xhr': 'true',
't': self._get_cookies('https://www.dropbox.com')['t'].value,
'content_id': content_id,
'password': password,
'url': update_url(url, scheme='', netloc=''),
}))
if response.get('status') != 'authed':
raise ExtractorError('Invalid password', expected=True)
webpage = self._download_webpage(url, video_id)
formats, subtitles = [], {}

View File

@@ -0,0 +1,51 @@
from .brightcove import BrightcoveNewIE
from .common import InfoExtractor
from ..utils import url_or_none
from ..utils.traversal import traverse_obj
class DrTalksIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?drtalks\.com/videos/(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://drtalks.com/videos/six-pillars-of-resilience-tools-for-managing-stress-and-flourishing/',
'info_dict': {
'id': '6366193757112',
'ext': 'mp4',
'uploader_id': '6314452011001',
'tags': ['resilience'],
'description': 'md5:9c6805aee237ee6de8052461855b9dda',
'timestamp': 1734546659,
'thumbnail': 'https://drtalks.com/wp-content/uploads/2024/12/Episode-82-Eva-Selhub-DrTalks-Thumbs.jpg',
'title': 'Six Pillars of Resilience: Tools for Managing Stress and Flourishing',
'duration': 2800.682,
'upload_date': '20241218',
},
}, {
'url': 'https://drtalks.com/videos/the-pcos-puzzle-mastering-metabolic-health-with-marcelle-pick/',
'info_dict': {
'id': '6364699891112',
'ext': 'mp4',
'title': 'The PCOS Puzzle: Mastering Metabolic Health with Marcelle Pick',
'description': 'md5:e87cbe00ca50135d5702787fc4043aaa',
'thumbnail': 'https://drtalks.com/wp-content/uploads/2024/11/Episode-34-Marcelle-Pick-OBGYN-NP-DrTalks.jpg',
'duration': 3515.2,
'tags': ['pcos'],
'upload_date': '20241114',
'timestamp': 1731592119,
'uploader_id': '6314452011001',
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
next_data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['data']['video']
return self.url_result(
next_data['videos']['brightcoveVideoLink'], BrightcoveNewIE, video_id,
url_transparent=True,
**traverse_obj(next_data, {
'title': ('title', {str}),
'description': ('videos', 'summury', {str}),
'thumbnail': ('featuredImage', 'node', 'sourceUrl', {url_or_none}),
}))

View File

@@ -5,15 +5,16 @@ from ..utils import (
get_element_text_and_html_by_tag,
int_or_none,
join_nonempty,
parse_qs,
str_or_none,
try_call,
unified_timestamp,
)
from ..utils.traversal import traverse_obj
from ..utils.traversal import traverse_obj, value
class DuoplayIE(InfoExtractor):
_VALID_URL = r'https?://duoplay\.ee/(?P<id>\d+)/[\w-]+/?(?:\?(?:[^#]+&)?ep=(?P<ep>\d+))?'
_VALID_URL = r'https?://duoplay\.ee/(?P<id>\d+)(?:[/?#]|$)'
_TESTS = [{
'note': 'Siberi võmm S02E12',
'url': 'https://duoplay.ee/4312/siberi-vomm?ep=24',
@@ -34,15 +35,16 @@ class DuoplayIE(InfoExtractor):
'episode_number': 12,
'episode_id': '24',
},
'skip': 'No video found',
}, {
'note': 'Empty title',
'url': 'https://duoplay.ee/17/uhikarotid?ep=14',
'md5': '6aca68be71112314738dd17cced7f8bf',
'md5': 'cba9f5dabf2582b224d80ac44fb80e47',
'info_dict': {
'id': '17_14',
'ext': 'mp4',
'title': 'Ühikarotid',
'thumbnail': r're:https://.+\.jpg(?:\?c=\d+)?$',
'title': 'Episode 14',
'thumbnail': r're:https?://.+\.jpg',
'description': 'md5:4719b418e058c209def41d48b601276e',
'upload_date': '20100916',
'timestamp': 1284661800,
@@ -52,6 +54,8 @@ class DuoplayIE(InfoExtractor):
'season_number': 2,
'episode_id': '14',
'release_year': 2010,
'episode': 'Episode 14',
'episode_number': 14,
},
}, {
'note': 'Movie without expiry',
@@ -68,10 +72,32 @@ class DuoplayIE(InfoExtractor):
'timestamp': 1671054000,
'release_year': 2018,
},
'skip': 'No video found',
}, {
'note': 'Episode url without show name',
'url': 'https://duoplay.ee/9644?ep=185',
'md5': '63f324b4fe2dbd8194dca16a6d52184a',
'info_dict': {
'id': '9644_185',
'ext': 'mp4',
'title': 'Episode 185',
'thumbnail': r're:https?://.+\.jpg',
'description': 'md5:ed25ba4e9e5d54bc291a4a0cdd241467',
'upload_date': '20241120',
'timestamp': 1732077000,
'episode': 'Episode 63',
'episode_id': '185',
'episode_number': 63,
'season': 'Season 2',
'season_number': 2,
'series': 'Telehommik',
'series_id': '9644',
},
}]
def _real_extract(self, url):
telecast_id, episode = self._match_valid_url(url).group('id', 'ep')
telecast_id = self._match_id(url)
episode = traverse_obj(parse_qs(url), ('ep', 0, {int_or_none}, {str_or_none}))
video_id = join_nonempty(telecast_id, episode, delim='_')
webpage = self._download_webpage(url, video_id)
video_player = try_call(lambda: extract_attributes(
@@ -79,25 +105,33 @@ class DuoplayIE(InfoExtractor):
if not video_player or not video_player.get('manifest-url'):
raise ExtractorError('No video found', expected=True)
manifest_url = video_player['manifest-url']
session_token = self._download_json(
'https://sts.postimees.ee/session/register', video_id, 'Registering session',
'Unable to register session', headers={
'Accept': 'application/json',
'X-Original-URI': manifest_url,
})['session']
episode_attr = self._parse_json(video_player.get(':episode') or '', video_id, fatal=False) or {}
return {
'id': video_id,
'formats': self._extract_m3u8_formats(video_player['manifest-url'], video_id, 'mp4'),
'formats': self._extract_m3u8_formats(manifest_url, video_id, 'mp4', query={'s': session_token}),
**traverse_obj(episode_attr, {
'title': 'title',
'description': 'synopsis',
'title': ('title', {str}),
'description': ('synopsis', {str}),
'thumbnail': ('images', 'original'),
'timestamp': ('airtime', {lambda x: unified_timestamp(x + ' +0200')}),
'cast': ('cast', {lambda x: x.split(', ')}),
'cast': ('cast', filter, {lambda x: x.split(', ')}),
'release_year': ('year', {int_or_none}),
}),
**(traverse_obj(episode_attr, {
'title': (None, ('subtitle', ('episode_nr', {lambda x: f'Episode {x}' if x else None}))),
'series': 'title',
'title': (None, (('subtitle', {str}, filter), {value(f'Episode {episode}' if episode else None)})),
'series': ('title', {str}),
'series_id': ('telecast_id', {str_or_none}),
'season_number': ('season_id', {int_or_none}),
'episode': 'subtitle',
'episode': ('subtitle', {str}, filter),
'episode_number': ('episode_nr', {int_or_none}),
'episode_id': ('episode_id', {str_or_none}),
}, get_all=False) if episode_attr.get('category') != 'movies' else {}),

View File

@@ -162,7 +162,7 @@ class DVTVIE(InfoExtractor):
items = re.findall(r'(?s)playlist\.push\(({.+?})\);', webpage)
if items:
return self.playlist_result(
[self._parse_video_metadata(i, video_id, timestamp) for i in items],
(self._parse_video_metadata(i, video_id, timestamp) for i in items),
video_id, self._html_search_meta('twitter:title', webpage))
item = self._search_regex(

View File

@@ -207,7 +207,7 @@ class ERRJupiterIE(InfoExtractor):
**traverse_obj(data, {
'title': ('heading', {str}),
'alt_title': ('subHeading', {str}),
'description': (('lead', 'body'), {clean_html}, {lambda x: x or None}),
'description': (('lead', 'body'), {clean_html}, filter),
'timestamp': ('created', {int_or_none}),
'modified_timestamp': ('updated', {int_or_none}),
'release_timestamp': (('scheduleStart', 'publicStart'), {int_or_none}),

View File

@@ -50,7 +50,7 @@ class FacebookIE(InfoExtractor):
[^/]+/videos/(?:[^/]+/)?|
[^/]+/posts/|
events/(?:[^/]+/)?|
groups/[^/]+/(?:permalink|posts)/|
groups/[^/]+/(?:permalink|posts)/(?:[\da-f]+/)?|
watchparty/
)|
facebook:
@@ -410,6 +410,9 @@ class FacebookIE(InfoExtractor):
'uploader': 'Comitato Liberi Pensatori',
'uploader_id': '100065709540881',
},
}, {
'url': 'https://www.facebook.com/groups/1513990329015294/posts/d41d8cd9/2013209885760000/?app=fbl',
'only_matching': True,
}]
_SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)'
_api_config = {
@@ -563,12 +566,13 @@ class FacebookIE(InfoExtractor):
return extract_video_data(try_get(
js_data, lambda x: x['jsmods']['instances'], list) or [])
def extract_dash_manifest(video, formats):
dash_manifest = traverse_obj(video, 'dash_manifest', 'playlist', expected_type=str)
def extract_dash_manifest(vid_data, formats, mpd_url=None):
dash_manifest = traverse_obj(
vid_data, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', 'manifest_xml', expected_type=str)
if dash_manifest:
formats.extend(self._parse_mpd_formats(
compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)),
mpd_url=video.get('dash_manifest_url')))
mpd_url=url_or_none(vid_data.get('dash_manifest_url')) or mpd_url))
def process_formats(info):
# Downloads with browser's User-Agent are rate limited. Working around
@@ -618,16 +622,20 @@ class FacebookIE(InfoExtractor):
video = video['creation_story']
video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner'))
video.update(reel_info)
formats = []
q = qualities(['sd', 'hd'])
# Legacy formats extraction
fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video
for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'),
('playable_url_dash', ''), ('browser_native_hd_url', 'hd'),
('browser_native_sd_url', 'sd')):
playable_url = video.get(key)
playable_url = fmt_data.get(key)
if not playable_url:
continue
if determine_ext(playable_url) == 'mpd':
formats.extend(self._extract_mpd_formats(playable_url, video_id))
formats.extend(self._extract_mpd_formats(playable_url, video_id, fatal=False))
else:
formats.append({
'format_id': format_id,
@@ -635,7 +643,29 @@ class FacebookIE(InfoExtractor):
'quality': q(format_id) - 3,
'url': playable_url,
})
extract_dash_manifest(video, formats)
extract_dash_manifest(fmt_data, formats)
# New videoDeliveryResponse formats extraction
fmt_data = traverse_obj(video, ('videoDeliveryResponseFragment', 'videoDeliveryResponseResult'))
mpd_urls = traverse_obj(fmt_data, ('dash_manifest_urls', ..., 'manifest_url', {url_or_none}))
dash_manifests = traverse_obj(fmt_data, ('dash_manifests', lambda _, v: v['manifest_xml']))
for idx, dash_manifest in enumerate(dash_manifests):
extract_dash_manifest(dash_manifest, formats, mpd_url=traverse_obj(mpd_urls, idx))
if not dash_manifests:
# Only extract from MPD URLs if the manifests are not already provided
for mpd_url in mpd_urls:
formats.extend(self._extract_mpd_formats(mpd_url, video_id, fatal=False))
for prog_fmt in traverse_obj(fmt_data, ('progressive_urls', lambda _, v: v['progressive_url'])):
format_id = traverse_obj(prog_fmt, ('metadata', 'quality', {str.lower}))
formats.append({
'format_id': format_id,
# sd, hd formats w/o resolution info should be deprioritized below DASH
'quality': q(format_id) - 3,
'url': prog_fmt['progressive_url'],
})
for m3u8_url in traverse_obj(fmt_data, ('hls_playlist_urls', ..., 'hls_playlist_url', {url_or_none})):
formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', fatal=False, m3u8_id='hls'))
if not formats:
# Do not append false positive entry w/o any formats
return

View File

@@ -193,9 +193,9 @@ class FunimationIE(FunimationBaseIE):
for lang, version, fmt in self._get_experiences(episode):
experience_id = str(fmt['experienceId'])
if (only_initial_experience and experience_id != initial_experience_id
or requested_languages and lang.lower() not in requested_languages
or requested_versions and version.lower() not in requested_versions):
if ((only_initial_experience and experience_id != initial_experience_id)
or (requested_languages and lang.lower() not in requested_languages)
or (requested_versions and version.lower() not in requested_versions)):
continue
thumbnails.append({'url': fmt.get('poster')})
duration = max(duration, fmt.get('duration', 0))

View File

@@ -3,7 +3,7 @@ from .nexx import NexxIE
class FunkIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.|origin\.)?funk\.net/(?:channel|playlist)/[^/]+/(?P<display_id>[0-9a-z-]+)-(?P<id>\d+)'
_VALID_URL = r'https?://(?:(?:www|origin|play)\.)?funk\.net/(?:channel|playlist)/[^/?#]+/(?P<display_id>[0-9a-z-]+)-(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.funk.net/channel/ba-793/die-lustigsten-instrumente-aus-dem-internet-teil-2-1155821',
'md5': '8610449476156f338761a75391b0017d',
@@ -27,6 +27,9 @@ class FunkIE(InfoExtractor):
}, {
'url': 'https://www.funk.net/playlist/neuesteVideos/kameras-auf-dem-fusion-festival-1618699',
'only_matching': True,
}, {
'url': 'https://play.funk.net/playlist/neuesteVideos/george-floyd-wenn-die-polizei-toetet-der-fall-2004391',
'only_matching': True,
}]
def _real_extract(self, url):

View File

@@ -0,0 +1,141 @@
import json
from .common import InfoExtractor
from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
clean_html,
int_or_none,
join_nonempty,
parse_iso8601,
str_or_none,
url_or_none,
)
from ..utils.traversal import traverse_obj
class GameDevTVDashboardIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?gamedev\.tv/dashboard/courses/(?P<course_id>\d+)(?:/(?P<lecture_id>\d+))?'
_NETRC_MACHINE = 'gamedevtv'
_TESTS = [{
'url': 'https://www.gamedev.tv/dashboard/courses/25',
'info_dict': {
'id': '25',
'title': 'Complete Blender Creator 3: Learn 3D Modelling for Beginners',
'tags': ['blender', 'course', 'all', 'box modelling', 'sculpting'],
'categories': ['Blender', '3D Art'],
'thumbnail': 'https://gamedev-files.b-cdn.net/courses/qisc9pmu1jdc.jpg',
'upload_date': '20220516',
'timestamp': 1652694420,
'modified_date': '20241027',
'modified_timestamp': 1730049658,
},
'playlist_count': 100,
}, {
'url': 'https://www.gamedev.tv/dashboard/courses/63/2279',
'info_dict': {
'id': 'df04f4d8-68a4-4756-a71b-9ca9446c3a01',
'ext': 'mp4',
'modified_timestamp': 1701695752,
'upload_date': '20230504',
'episode': 'MagicaVoxel Community Course Introduction',
'series_id': '63',
'title': 'MagicaVoxel Community Course Introduction',
'timestamp': 1683195397,
'modified_date': '20231204',
'categories': ['3D Art', 'MagicaVoxel'],
'season': 'MagicaVoxel Community Course',
'tags': ['MagicaVoxel', 'all', 'course'],
'series': 'MagicaVoxel 3D Art Mini Course',
'duration': 1405,
'episode_number': 1,
'season_number': 1,
'season_id': '219',
'description': 'md5:a378738c5bbec1c785d76c067652d650',
'display_id': '63-219-2279',
'alt_title': '1_CC_MVX MagicaVoxel Community Course Introduction.mp4',
'thumbnail': 'https://vz-23691c65-6fa.b-cdn.net/df04f4d8-68a4-4756-a71b-9ca9446c3a01/thumbnail.jpg',
},
}]
_API_HEADERS = {}
def _perform_login(self, username, password):
try:
response = self._download_json(
'https://api.gamedev.tv/api/students/login', None, 'Logging in',
headers={'Content-Type': 'application/json'},
data=json.dumps({
'email': username,
'password': password,
'cart_items': [],
}).encode())
except ExtractorError as e:
if isinstance(e.cause, HTTPError) and e.cause.status == 401:
raise ExtractorError('Invalid username/password', expected=True)
raise
self._API_HEADERS['Authorization'] = f'{response["token_type"]} {response["access_token"]}'
def _real_initialize(self):
if not self._API_HEADERS.get('Authorization'):
self.raise_login_required(
'This content is only available with purchase', method='password')
def _entries(self, data, course_id, course_info, selected_lecture):
for section in traverse_obj(data, ('sections', ..., {dict})):
section_info = traverse_obj(section, {
'season_id': ('id', {str_or_none}),
'season': ('title', {str}),
'season_number': ('order', {int_or_none}),
})
for lecture in traverse_obj(section, ('lectures', lambda _, v: url_or_none(v['video']['playListUrl']))):
if selected_lecture and str(lecture.get('id')) != selected_lecture:
continue
display_id = join_nonempty(course_id, section_info.get('season_id'), lecture.get('id'))
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
lecture['video']['playListUrl'], display_id, 'mp4', m3u8_id='hls')
yield {
**course_info,
**section_info,
'id': display_id, # fallback
'display_id': display_id,
'formats': formats,
'subtitles': subtitles,
'series': course_info.get('title'),
'series_id': course_id,
**traverse_obj(lecture, {
'id': ('video', 'guid', {str}),
'title': ('title', {str}),
'alt_title': ('video', 'title', {str}),
'description': ('description', {clean_html}),
'episode': ('title', {str}),
'episode_number': ('order', {int_or_none}),
'duration': ('video', 'duration_in_sec', {int_or_none}),
'timestamp': ('video', 'created_at', {parse_iso8601}),
'modified_timestamp': ('video', 'updated_at', {parse_iso8601}),
'thumbnail': ('video', 'thumbnailUrl', {url_or_none}),
}),
}
def _real_extract(self, url):
course_id, lecture_id = self._match_valid_url(url).group('course_id', 'lecture_id')
data = self._download_json(
f'https://api.gamedev.tv/api/courses/my/{course_id}', course_id,
headers=self._API_HEADERS)['data']
course_info = traverse_obj(data, {
'title': ('title', {str}),
'tags': ('tags', ..., 'name', {str}),
'categories': ('categories', ..., 'title', {str}),
'timestamp': ('created_at', {parse_iso8601}),
'modified_timestamp': ('updated_at', {parse_iso8601}),
'thumbnail': ('image', {url_or_none}),
})
entries = self._entries(data, course_id, course_info, lecture_id)
if lecture_id:
lecture = next(entries, None)
if not lecture:
raise ExtractorError('Lecture not found')
return lecture
return self.playlist_result(entries, course_id, **course_info)

View File

@@ -8,6 +8,8 @@ from .common import InfoExtractor
from .commonprotocols import RtmpIE
from .youtube import YoutubeIE
from ..compat import compat_etree_fromstring
from ..cookies import LenientSimpleCookie
from ..networking.exceptions import HTTPError
from ..networking.impersonate import ImpersonateTarget
from ..utils import (
KNOWN_EXTENSIONS,
@@ -2374,10 +2376,9 @@ class GenericIE(InfoExtractor):
else:
video_id = self._generic_id(url)
# Try to impersonate a web-browser by default if possible
# Skip impersonation if not available to omit the warning
impersonate = self._configuration_arg('impersonate', [''])
if 'false' in impersonate or not self._downloader._impersonate_target_available(ImpersonateTarget()):
# Do not impersonate by default; see https://github.com/yt-dlp/yt-dlp/issues/11335
impersonate = self._configuration_arg('impersonate', ['false'])
if 'false' in impersonate:
impersonate = None
# Some webservers may serve compressed content of rather big size (e.g. gzipped flac)
@@ -2388,10 +2389,29 @@ class GenericIE(InfoExtractor):
# to accept raw bytes and being able to download only a chunk.
# It may probably better to solve this by checking Content-Type for application/octet-stream
# after a HEAD request, but not sure if we can rely on this.
full_response = self._request_webpage(url, video_id, headers=filter_dict({
'Accept-Encoding': 'identity',
'Referer': smuggled_data.get('referer'),
}), impersonate=impersonate)
try:
full_response = self._request_webpage(url, video_id, headers=filter_dict({
'Accept-Encoding': 'identity',
'Referer': smuggled_data.get('referer'),
}), impersonate=impersonate)
except ExtractorError as e:
if not (isinstance(e.cause, HTTPError) and e.cause.status == 403
and e.cause.response.get_header('cf-mitigated') == 'challenge'
and e.cause.response.extensions.get('impersonate') is None):
raise
cf_cookie_domain = traverse_obj(
LenientSimpleCookie(e.cause.response.get_header('set-cookie')),
('__cf_bm', 'domain'))
if cf_cookie_domain:
self.write_debug(f'Clearing __cf_bm cookie for {cf_cookie_domain}')
self.cookiejar.clear(domain=cf_cookie_domain, path='/', name='__cf_bm')
msg = 'Got HTTP Error 403 caused by Cloudflare anti-bot challenge; '
if not self._downloader._impersonate_target_available(ImpersonateTarget()):
msg += ('see https://github.com/yt-dlp/yt-dlp#impersonation for '
'how to install the required impersonation dependency, and ')
raise ExtractorError(
f'{msg}try again with --extractor-args "generic:impersonate"', expected=True)
new_url = full_response.url
if new_url != extract_basic_auth(url)[0]:
self.report_following_redirect(new_url)

View File

@@ -5,56 +5,63 @@ import hashlib
import hmac
import json
import os
import re
import urllib.parse
from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
js_to_json,
remove_end,
traverse_obj,
unescapeHTML,
)
class GoPlayIE(InfoExtractor):
_VALID_URL = r'https?://(www\.)?goplay\.be/video/([^/]+/[^/]+/|)(?P<display_id>[^/#]+)'
_VALID_URL = r'https?://(www\.)?goplay\.be/video/([^/?#]+/[^/?#]+/|)(?P<id>[^/#]+)'
_NETRC_MACHINE = 'goplay'
_TESTS = [{
'url': 'https://www.goplay.be/video/de-container-cup/de-container-cup-s3/de-container-cup-s3-aflevering-2#autoplay',
'url': 'https://www.goplay.be/video/de-slimste-mens-ter-wereld/de-slimste-mens-ter-wereld-s22/de-slimste-mens-ter-wereld-s22-aflevering-1',
'info_dict': {
'id': '9c4214b8-e55d-4e4b-a446-f015f6c6f811',
'id': '2baa4560-87a0-421b-bffc-359914e3c387',
'ext': 'mp4',
'title': 'S3 - Aflevering 2',
'series': 'De Container Cup',
'season': 'Season 3',
'season_number': 3,
'episode': 'Episode 2',
'episode_number': 2,
'title': 'S22 - Aflevering 1',
'description': r're:In aflevering 1 nemen Daan Alferink, Tess Elst en Xander De Rycke .{66}',
'series': 'De Slimste Mens ter Wereld',
'episode': 'Episode 1',
'season_number': 22,
'episode_number': 1,
'season': 'Season 22',
},
'params': {'skip_download': True},
'skip': 'This video is only available for registered users',
}, {
'url': 'https://www.goplay.be/video/a-family-for-thr-holidays-s1-aflevering-1#autoplay',
'url': 'https://www.goplay.be/video/1917',
'info_dict': {
'id': '74e3ed07-748c-49e4-85a0-393a93337dbf',
'id': '40cac41d-8d29-4ef5-aa11-75047b9f0907',
'ext': 'mp4',
'title': 'A Family for the Holidays',
'title': '1917',
'description': r're:Op het hoogtepunt van de Eerste Wereldoorlog krijgen twee jonge .{94}',
},
'params': {'skip_download': True},
'skip': 'This video is only available for registered users',
}, {
'url': 'https://www.goplay.be/video/de-mol/de-mol-s11/de-mol-s11-aflevering-1#autoplay',
'info_dict': {
'id': '03eb8f2f-153e-41cb-9805-0d3a29dab656',
'id': 'ecb79672-92b9-4cd9-a0d7-e2f0250681ee',
'ext': 'mp4',
'title': 'S11 - Aflevering 1',
'description': r're:Tien kandidaten beginnen aan hun verovering van Amerika en ontmoeten .{102}',
'episode': 'Episode 1',
'series': 'De Mol',
'season_number': 11,
'episode_number': 1,
'season': 'Season 11',
},
'params': {
'skip_download': True,
},
'params': {'skip_download': True},
'skip': 'This video is only available for registered users',
}]
@@ -69,27 +76,42 @@ class GoPlayIE(InfoExtractor):
if not self._id_token:
raise self.raise_login_required(method='password')
def _real_extract(self, url):
url, display_id = self._match_valid_url(url).group(0, 'display_id')
webpage = self._download_webpage(url, display_id)
video_data_json = self._html_search_regex(r'<div\s+data-hero="([^"]+)"', webpage, 'video_data')
video_data = self._parse_json(unescapeHTML(video_data_json), display_id).get('data')
def _find_json(self, s):
return self._search_json(
r'\w+\s*:\s*', s, 'next js data', None, contains_pattern=r'\[(?s:.+)\]', default=None)
movie = video_data.get('movie')
if movie:
video_id = movie['videoUuid']
info_dict = {
'title': movie.get('title'),
}
else:
episode = traverse_obj(video_data, ('playlists', ..., 'episodes', lambda _, v: v['pageInfo']['url'] == url), get_all=False)
video_id = episode['videoUuid']
info_dict = {
'title': episode.get('episodeTitle'),
'series': traverse_obj(episode, ('program', 'title')),
'season_number': episode.get('seasonNumber'),
'episode_number': episode.get('episodeNumber'),
}
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
nextjs_data = traverse_obj(
re.findall(r'<script[^>]*>\s*self\.__next_f\.push\(\s*(\[.+?\])\s*\);?\s*</script>', webpage),
(..., {js_to_json}, {json.loads}, ..., {self._find_json}, ...))
meta = traverse_obj(nextjs_data, (
..., lambda _, v: v['meta']['path'] == urllib.parse.urlparse(url).path, 'meta', any))
video_id = meta['uuid']
info_dict = traverse_obj(meta, {
'title': ('title', {str}),
'description': ('description', {str.strip}),
})
if traverse_obj(meta, ('program', 'subtype')) != 'movie':
for season_data in traverse_obj(nextjs_data, (..., 'children', ..., 'playlists', ...)):
episode_data = traverse_obj(
season_data, ('videos', lambda _, v: v['videoId'] == video_id, any))
if not episode_data:
continue
episode_title = traverse_obj(
episode_data, 'contextualTitle', 'episodeTitle', expected_type=str)
info_dict.update({
'title': episode_title or info_dict.get('title'),
'series': remove_end(info_dict.get('title'), f' - {episode_title}'),
'season_number': traverse_obj(season_data, ('season', {int_or_none})),
'episode_number': traverse_obj(episode_data, ('episodeNumber', {int_or_none})),
})
break
api = self._download_json(
f'https://api.goplay.be/web/v1/videos/long-form/{video_id}',

View File

@@ -1,4 +1,3 @@
import functools
from .common import InfoExtractor
from ..utils import (
@@ -63,7 +62,7 @@ class IlPostIE(InfoExtractor):
'url': ('podcast_raw_url', {url_or_none}),
'thumbnail': ('image', {url_or_none}),
'timestamp': ('timestamp', {int_or_none}),
'duration': ('milliseconds', {functools.partial(float_or_none, scale=1000)}),
'duration': ('milliseconds', {float_or_none(scale=1000)}),
'availability': ('free', {lambda v: 'public' if v else 'subscriber_only'}),
}),
}

View File

@@ -37,7 +37,7 @@ class ImgurBaseIE(InfoExtractor):
class ImgurIE(ImgurBaseIE):
_VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!(?:a|gallery|t|topic|r)/)(?P<id>[a-zA-Z0-9]+)'
_VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!(?:a|gallery|t|topic|r)/)(?:[^/?#]+-)?(?P<id>[a-zA-Z0-9]+)'
_TESTS = [{
'url': 'https://imgur.com/A61SaA1',
@@ -54,6 +54,22 @@ class ImgurIE(ImgurBaseIE):
'like_count': int,
'thumbnail': 'https://i.imgur.com/A61SaA1h.jpg',
},
}, {
# Test with URL slug
'url': 'https://imgur.com/mrw-gifv-is-up-running-without-any-bugs-A61SaA1',
'info_dict': {
'id': 'A61SaA1',
'ext': 'mp4',
'title': 'MRW gifv is up and running without any bugs',
'timestamp': 1416446068,
'upload_date': '20141120',
'dislike_count': int,
'comment_count': int,
'release_timestamp': 1416446068,
'release_date': '20141120',
'like_count': int,
'thumbnail': 'https://i.imgur.com/A61SaA1h.jpg',
},
}, {
'url': 'https://i.imgur.com/A61SaA1.gifv',
'only_matching': True,
@@ -92,6 +108,7 @@ class ImgurIE(ImgurBaseIE):
'comment_count': int,
'release_timestamp': 1710491255,
'release_date': '20240315',
'thumbnail': 'https://i.imgur.com/zV03bd5h.jpg',
},
}]
@@ -208,7 +225,10 @@ class ImgurIE(ImgurBaseIE):
}), get_all=False),
'id': video_id,
'formats': formats,
'thumbnail': url_or_none(search('thumbnailUrl')),
'thumbnails': [{
'url': thumbnail_url,
'http_headers': {'Accept': '*/*'},
}] if (thumbnail_url := search(['thumbnailUrl', 'twitter:image', 'og:image'])) else None,
'http_headers': {'Accept': '*/*'},
}
@@ -252,17 +272,9 @@ class ImgurGalleryBaseIE(ImgurBaseIE):
class ImgurGalleryIE(ImgurGalleryBaseIE):
IE_NAME = 'imgur:gallery'
_VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:gallery|(?:t(?:opic)?|r)/[^/?#]+)/(?P<id>[a-zA-Z0-9]+)'
_VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:gallery|(?:t(?:opic)?|r)/[^/?#]+)/(?:[^/?#]+-)?(?P<id>[a-zA-Z0-9]+)'
_TESTS = [{
'url': 'http://imgur.com/gallery/Q95ko',
'info_dict': {
'id': 'Q95ko',
'title': 'Adding faces make every GIF better',
},
'playlist_count': 25,
'skip': 'Zoinks! You\'ve taken a wrong turn.',
}, {
# TODO: static images - replace with animated/video gallery
'url': 'http://imgur.com/topic/Aww/ll5Vk',
'only_matching': True,
@@ -280,7 +292,27 @@ class ImgurGalleryIE(ImgurGalleryBaseIE):
'release_timestamp': 1358554297,
'thumbnail': 'https://i.imgur.com/YcAQlkxh.jpg',
'release_date': '20130119',
'uploader_url': 'https://i.imgur.com/u3R4I2S_d.png?maxwidth=290&fidelity=grand',
'uploader_url': 'https://i.imgur.com/N5Flb2v_d.png?maxwidth=290&fidelity=grand',
'comment_count': int,
'dislike_count': int,
'like_count': int,
},
}, {
# Test with slug
'url': 'https://imgur.com/gallery/classic-steve-carell-gif-cracks-me-up-everytime-repost-downvotes-YcAQlkx',
'add_ies': ['Imgur'],
'info_dict': {
'id': 'YcAQlkx',
'ext': 'mp4',
'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....',
'timestamp': 1358554297,
'upload_date': '20130119',
'uploader_id': '1648642',
'uploader': 'wittyusernamehere',
'release_timestamp': 1358554297,
'release_date': '20130119',
'thumbnail': 'https://i.imgur.com/YcAQlkxh.jpg',
'uploader_url': 'https://i.imgur.com/N5Flb2v_d.png?maxwidth=290&fidelity=grand',
'comment_count': int,
'dislike_count': int,
'like_count': int,
@@ -317,6 +349,13 @@ class ImgurGalleryIE(ImgurGalleryBaseIE):
'title': 'Penguins !',
},
'playlist_count': 3,
}, {
'url': 'https://imgur.com/t/unmuted/penguins-penguins-6lAn9VQ',
'info_dict': {
'id': '6lAn9VQ',
'title': 'Penguins !',
},
'playlist_count': 3,
}, {
'url': 'https://imgur.com/t/unmuted/kx2uD3C',
'add_ies': ['Imgur'],
@@ -357,7 +396,7 @@ class ImgurGalleryIE(ImgurGalleryBaseIE):
class ImgurAlbumIE(ImgurGalleryBaseIE):
IE_NAME = 'imgur:album'
_VALID_URL = r'https?://(?:i\.)?imgur\.com/a/(?P<id>[a-zA-Z0-9]+)'
_VALID_URL = r'https?://(?:i\.)?imgur\.com/a/(?:[^/?#]+-)?(?P<id>[a-zA-Z0-9]+)'
_GALLERY = False
_TESTS = [{
# TODO: only static images - replace with animated/video gallery
@@ -372,6 +411,14 @@ class ImgurAlbumIE(ImgurGalleryBaseIE):
'title': 'enen-no-shouboutai',
},
'playlist_count': 2,
}, {
# Test with URL slug
'url': 'https://imgur.com/a/enen-no-shouboutai-iX265HX',
'info_dict': {
'id': 'iX265HX',
'title': 'enen-no-shouboutai',
},
'playlist_count': 2,
}, {
'url': 'https://imgur.com/a/8pih2Ed',
'info_dict': {

View File

@@ -254,7 +254,7 @@ class InstagramIOSIE(InfoExtractor):
class InstagramIE(InstagramBaseIE):
_VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com(?:/[^/]+)?/(?:p|tv|reels?(?!/audio/))/(?P<id>[^/?#&]+))'
_VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com(?:/(?!share/)[^/?#]+)?/(?:p|tv|reels?(?!/audio/))/(?P<id>[^/?#&]+))'
_EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1']
_TESTS = [{
'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',

View File

@@ -326,11 +326,11 @@ class JioCinemaIE(JioCinemaBaseIE):
# fallback metadata
'title': ('name', {str}),
'description': ('fullSynopsis', {str}),
'series': ('show', 'name', {str}, {lambda x: x or None}),
'series': ('show', 'name', {str}, filter),
'season': ('tournamentName', {str}, {lambda x: x if x != 'Season 0' else None}),
'season_number': ('episode', 'season', {int_or_none}, {lambda x: x or None}),
'season_number': ('episode', 'season', {int_or_none}, filter),
'episode': ('fullTitle', {str}),
'episode_number': ('episode', 'episodeNo', {int_or_none}, {lambda x: x or None}),
'episode_number': ('episode', 'episodeNo', {int_or_none}, filter),
'age_limit': ('ageNemonic', {parse_age_limit}),
'duration': ('totalDuration', {float_or_none}),
'thumbnail': ('images', {url_or_none}),
@@ -338,10 +338,10 @@ class JioCinemaIE(JioCinemaBaseIE):
**traverse_obj(metadata, ('result', 0, {
'title': ('fullTitle', {str}),
'description': ('fullSynopsis', {str}),
'series': ('showName', {str}, {lambda x: x or None}),
'season': ('seasonName', {str}, {lambda x: x or None}),
'series': ('showName', {str}, filter),
'season': ('seasonName', {str}, filter),
'season_number': ('season', {int_or_none}),
'season_id': ('seasonId', {str}, {lambda x: x or None}),
'season_id': ('seasonId', {str}, filter),
'episode': ('fullTitle', {str}),
'episode_number': ('episode', {int_or_none}),
'timestamp': ('uploadTime', {int_or_none}),

160
yt_dlp/extractor/kenh14.py Normal file
View File

@@ -0,0 +1,160 @@
from .common import InfoExtractor
from ..utils import (
clean_html,
extract_attributes,
get_element_by_class,
get_element_html_by_attribute,
get_elements_html_by_class,
int_or_none,
parse_duration,
parse_iso8601,
remove_start,
strip_or_none,
unescapeHTML,
update_url,
url_or_none,
)
from ..utils.traversal import traverse_obj
class Kenh14VideoIE(InfoExtractor):
_VALID_URL = r'https?://video\.kenh14\.vn/(?:video/)?[\w-]+-(?P<id>[0-9]+)\.chn'
_TESTS = [{
'url': 'https://video.kenh14.vn/video/mo-hop-iphone-14-pro-max-nguon-unbox-therapy-316173.chn',
'md5': '1ed67f9c3a1e74acf15db69590cf6210',
'info_dict': {
'id': '316173',
'ext': 'mp4',
'title': 'Video mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
'description': 'Video mở hộp iPhone 14 Pro MaxVideo mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
'tags': [],
'uploader': 'Unbox Therapy',
'upload_date': '20220517',
'view_count': int,
'duration': 722.86,
'timestamp': 1652764468,
},
}, {
'url': 'https://video.kenh14.vn/video-316174.chn',
'md5': '2b41877d2afaf4a3f487ceda8e5c7cbd',
'info_dict': {
'id': '316174',
'ext': 'mp4',
'title': 'Khoảnh khắc VĐV nằm gục khóc sau chiến thắng: 7 năm trời Việt Nam mới có HCV kiếm chém nữ, chỉ có 8 tháng để khổ luyện trước khi lên sàn đấu',
'description': 'md5:de86aa22e143e2b277bce8ec9c6f17dc',
'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
'tags': [],
'upload_date': '20220517',
'view_count': int,
'duration': 70.04,
'timestamp': 1652766021,
},
}, {
'url': 'https://video.kenh14.vn/0-344740.chn',
'md5': 'b843495d5e728142c8870c09b46df2a9',
'info_dict': {
'id': '344740',
'ext': 'mov',
'title': 'Kỳ Duyên đầy căng thẳng trong buổi ra quân đi Miss Universe, nghi thức tuyên thuệ lần đầu xuất hiện gây nhiều tranh cãi',
'description': 'md5:2a2dbb4a7397169fb21ee68f09160497',
'thumbnail': r're:^https?://kenh14cdn\.com/.*\.jpg$',
'tags': ['kỳ duyên', 'Kỳ Duyên tuyên thuệ', 'miss universe'],
'uploader': 'Quang Vũ',
'upload_date': '20241024',
'view_count': int,
'duration': 198.88,
'timestamp': 1729741590,
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
attrs = extract_attributes(get_element_html_by_attribute('type', 'VideoStream', webpage) or '')
direct_url = attrs['data-vid']
metadata = self._download_json(
'https://api.kinghub.vn/video/api/v1/detailVideoByGet?FileName={}'.format(
remove_start(direct_url, 'kenh14cdn.com/')), video_id, fatal=False)
formats = [{'url': f'https://{direct_url}', 'format_id': 'http', 'quality': 1}]
subtitles = {}
video_data = self._download_json(
f'https://{direct_url}.json', video_id, note='Downloading video data', fatal=False)
if hls_url := traverse_obj(video_data, ('hls', {url_or_none})):
fmts, subs = self._extract_m3u8_formats_and_subtitles(
hls_url, video_id, m3u8_id='hls', fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
if dash_url := traverse_obj(video_data, ('mpd', {url_or_none})):
fmts, subs = self._extract_mpd_formats_and_subtitles(
dash_url, video_id, mpd_id='dash', fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
return {
**traverse_obj(metadata, {
'duration': ('duration', {parse_duration}),
'uploader': ('author', {strip_or_none}),
'timestamp': ('uploadtime', {parse_iso8601(delimiter=' ')}),
'view_count': ('views', {int_or_none}),
}),
'id': video_id,
'title': (
traverse_obj(metadata, ('title', {strip_or_none}))
or clean_html(self._og_search_title(webpage))
or clean_html(get_element_by_class('vdbw-title', webpage))),
'formats': formats,
'subtitles': subtitles,
'description': (
clean_html(self._og_search_description(webpage))
or clean_html(get_element_by_class('vdbw-sapo', webpage))),
'thumbnail': (self._og_search_thumbnail(webpage) or attrs.get('data-thumb')),
'tags': traverse_obj(self._html_search_meta('keywords', webpage), (
{lambda x: x.split(';')}, ..., filter)),
}
class Kenh14PlaylistIE(InfoExtractor):
_VALID_URL = r'https?://video\.kenh14\.vn/playlist/[\w-]+-(?P<id>[0-9]+)\.chn'
_TESTS = [{
'url': 'https://video.kenh14.vn/playlist/tran-tinh-naked-love-mua-2-71.chn',
'info_dict': {
'id': '71',
'title': 'Trần Tình (Naked love) mùa 2',
'description': 'md5:e9522339304956dea931722dd72eddb2',
'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
},
'playlist_count': 9,
}, {
'url': 'https://video.kenh14.vn/playlist/0-72.chn',
'info_dict': {
'id': '72',
'title': 'Lau Lại Đầu Từ',
'description': 'Cùng xem xưa và nay có gì khác biệt nhé!',
'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
},
'playlist_count': 6,
}]
def _real_extract(self, url):
playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
category_detail = get_element_by_class('category-detail', webpage) or ''
embed_info = traverse_obj(
self._yield_json_ld(webpage, playlist_id),
(lambda _, v: v['name'] and v['alternateName'], any)) or {}
return self.playlist_from_matches(
get_elements_html_by_class('video-item', webpage), playlist_id,
(clean_html(get_element_by_class('name', category_detail)) or unescapeHTML(embed_info.get('name'))),
getter=lambda x: 'https://video.kenh14.vn/video/video-{}.chn'.format(extract_attributes(x)['data-id']),
ie=Kenh14VideoIE, playlist_description=(
clean_html(get_element_by_class('description', category_detail))
or unescapeHTML(embed_info.get('alternateName'))),
thumbnail=traverse_obj(
self._og_search_thumbnail(webpage),
({url_or_none}, {update_url(query=None)})))

View File

@@ -1,4 +1,3 @@
import functools
from .common import InfoExtractor
from ..networking import HEADRequest
@@ -137,7 +136,7 @@ class KickVODIE(KickBaseIE):
'uploader': ('livestream', 'channel', 'user', 'username', {str}),
'uploader_id': ('livestream', 'channel', 'user_id', {int}, {str_or_none}),
'timestamp': ('created_at', {parse_iso8601}),
'duration': ('livestream', 'duration', {functools.partial(float_or_none, scale=1000)}),
'duration': ('livestream', 'duration', {float_or_none(scale=1000)}),
'thumbnail': ('livestream', 'thumbnail', {url_or_none}),
'categories': ('livestream', 'categories', ..., 'name', {str}),
'view_count': ('views', {int_or_none}),

View File

@@ -119,7 +119,7 @@ class KikaIE(InfoExtractor):
'width': ('frameWidth', {int_or_none}),
'height': ('frameHeight', {int_or_none}),
# NB: filesize is 0 if unknown, bitrate is -1 if unknown
'filesize': ('fileSize', {int_or_none}, {lambda x: x or None}),
'filesize': ('fileSize', {int_or_none}, filter),
'abr': ('bitrateAudio', {int_or_none}, {lambda x: None if x == -1 else x}),
'vbr': ('bitrateVideo', {int_or_none}, {lambda x: None if x == -1 else x}),
}),

View File

@@ -32,7 +32,7 @@ class LaracastsBaseIE(InfoExtractor):
VimeoIE, url_transparent=True,
**traverse_obj(episode, {
'id': ('id', {int}, {str_or_none}),
'webpage_url': ('path', {lambda x: urljoin('https://laracasts.com', x)}),
'webpage_url': ('path', {urljoin('https://laracasts.com')}),
'title': ('title', {clean_html}),
'season_number': ('chapter', {int_or_none}),
'episode_number': ('position', {int_or_none}),
@@ -104,7 +104,7 @@ class LaracastsPlaylistIE(LaracastsBaseIE):
'description': ('body', {clean_html}),
'thumbnail': (('large_thumbnail', 'thumbnail'), {url_or_none}, any),
'duration': ('runTime', {parse_duration}),
'categories': ('taxonomy', 'name', {str}, {lambda x: x and [x]}),
'categories': ('taxonomy', 'name', {str}, all, filter),
'tags': ('topics', ..., 'name', {str}),
'modified_date': ('lastUpdated', {unified_strdate}),
}),

View File

@@ -66,7 +66,7 @@ class LBRYBaseIE(InfoExtractor):
'license': ('value', 'license', {str}),
'timestamp': ('timestamp', {int_or_none}),
'release_timestamp': ('value', 'release_time', {int_or_none}),
'tags': ('value', 'tags', ..., {lambda x: x or None}),
'tags': ('value', 'tags', ..., filter),
'duration': ('value', stream_type, 'duration', {int_or_none}),
'channel': ('signing_channel', 'value', 'title', {str}),
'channel_id': ('signing_channel', 'claim_id', {str}),
@@ -136,6 +136,7 @@ class LBRYBaseIE(InfoExtractor):
class LBRYIE(LBRYBaseIE):
IE_NAME = 'lbry'
IE_DESC = 'odysee.com'
_VALID_URL = LBRYBaseIE._BASE_URL_REGEX + rf'''
(?:\$/(?:download|embed)/)?
(?P<id>
@@ -364,6 +365,7 @@ class LBRYIE(LBRYBaseIE):
class LBRYChannelIE(LBRYBaseIE):
IE_NAME = 'lbry:channel'
IE_DESC = 'odysee.com channels'
_VALID_URL = LBRYBaseIE._BASE_URL_REGEX + rf'(?P<id>@{LBRYBaseIE._OPT_CLAIM_ID})/?(?:[?&]|$)'
_TESTS = [{
'url': 'https://lbry.tv/@LBRYFoundation:0',
@@ -391,6 +393,7 @@ class LBRYChannelIE(LBRYBaseIE):
class LBRYPlaylistIE(LBRYBaseIE):
IE_NAME = 'lbry:playlist'
IE_DESC = 'odysee.com playlists'
_VALID_URL = LBRYBaseIE._BASE_URL_REGEX + r'\$/(?:play)?list/(?P<id>[0-9a-f-]+)'
_TESTS = [{
'url': 'https://odysee.com/$/playlist/ffef782f27486f0ac138bde8777f72ebdd0548c2',

View File

@@ -6,13 +6,11 @@ from ..utils import (
ExtractorError,
clean_html,
extract_attributes,
get_element_by_class,
get_element_html_by_id,
join_nonempty,
parse_duration,
unified_timestamp,
)
from ..utils.traversal import traverse_obj
from ..utils.traversal import find_element, traverse_obj
class LearningOnScreenIE(InfoExtractor):
@@ -32,28 +30,24 @@ class LearningOnScreenIE(InfoExtractor):
def _real_initialize(self):
if not self._get_cookies('https://learningonscreen.ac.uk/').get('PHPSESSID-BOB-LIVE'):
self.raise_login_required(
'Use --cookies for authentication. See '
' https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp '
'for how to manually pass cookies', method=None)
self.raise_login_required(method='session_cookies')
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
details = traverse_obj(webpage, (
{functools.partial(get_element_html_by_id, 'programme-details')}, {
'title': ({functools.partial(re.search, r'<h2>([^<]+)</h2>')}, 1, {clean_html}),
{find_element(id='programme-details', html=True)}, {
'title': ({find_element(tag='h2')}, {clean_html}),
'timestamp': (
{functools.partial(get_element_by_class, 'broadcast-date')},
{find_element(cls='broadcast-date')},
{functools.partial(re.match, r'([^<]+)')}, 1, {unified_timestamp}),
'duration': (
{functools.partial(get_element_by_class, 'prog-running-time')},
{clean_html}, {parse_duration}),
{find_element(cls='prog-running-time')}, {clean_html}, {parse_duration}),
}))
title = details.pop('title', None) or traverse_obj(webpage, (
{functools.partial(get_element_html_by_id, 'add-to-existing-playlist')},
{find_element(id='add-to-existing-playlist', html=True)},
{extract_attributes}, 'data-record-title', {clean_html}))
entries = self._parse_html5_media_entries(

View File

@@ -6,12 +6,10 @@ from ..utils import (
extract_attributes,
get_element_by_class,
get_element_html_by_id,
get_element_text_and_html_by_tag,
parse_duration,
strip_or_none,
traverse_obj,
try_call,
)
from ..utils.traversal import find_element, traverse_obj
class ListenNotesIE(InfoExtractor):
@@ -22,14 +20,14 @@ class ListenNotesIE(InfoExtractor):
'info_dict': {
'id': 'KrDgvNb_u1n',
'ext': 'mp3',
'title': 'md5:32236591a921adf17bbdbf0441b6c0e9',
'description': 'md5:c581ed197eeddcee55a67cdb547c8cbd',
'duration': 2148.0,
'channel': 'Thriving on Overload',
'title': r're:Tim OReilly on noticing things other people .{113}',
'description': r're:(?s)We shape reality by what we notice and .{27459}',
'duration': 2215.0,
'channel': 'Amplifying Cognition',
'channel_id': 'ed84wITivxF',
'episode_id': 'e1312583fa7b4e24acfbb5131050be00',
'thumbnail': 'https://production.listennotes.com/podcasts/thriving-on-overload-ross-dawson-1wb_KospA3P-ed84wITivxF.300x300.jpg',
'channel_url': 'https://www.listennotes.com/podcasts/thriving-on-overload-ross-dawson-ed84wITivxF/',
'thumbnail': 'https://cdn-images-3.listennotes.com/podcasts/amplifying-cognition-ross-dawson-Iemft4Gdr0k-ed84wITivxF.300x300.jpg',
'channel_url': 'https://www.listennotes.com/podcasts/amplifying-cognition-ross-dawson-ed84wITivxF/',
'cast': ['Tim OReilly', 'Cookie Monster', 'Lao Tzu', 'Wallace Steven', 'Eric Raymond', 'Christine Peterson', 'John Maynard Keyne', 'Ross Dawson'],
},
}, {
@@ -39,13 +37,13 @@ class ListenNotesIE(InfoExtractor):
'id': 'lwEA3154JzG',
'ext': 'mp3',
'title': 'Episode 177: WireGuard with Jason Donenfeld',
'description': 'md5:24744f36456a3e95f83c1193a3458594',
'description': r're:(?s)Jason Donenfeld lead developer joins us this hour to discuss WireGuard, .{3169}',
'duration': 3861.0,
'channel': 'Ask Noah Show',
'channel_id': '4DQTzdS5-j7',
'episode_id': '8c8954b95e0b4859ad1eecec8bf6d3a4',
'channel_url': 'https://www.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-4DQTzdS5-j7/',
'thumbnail': 'https://production.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-cfbRUw9Gs3F-4DQTzdS5-j7.300x300.jpg',
'thumbnail': 'https://cdn-images-3.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-gD7vG150cxf-4DQTzdS5-j7.300x300.jpg',
'cast': ['noah showlink', 'noah show', 'noah dashboard', 'jason donenfeld'],
},
}]
@@ -70,7 +68,7 @@ class ListenNotesIE(InfoExtractor):
'id': audio_id,
'url': data['audio'],
'title': (data.get('data-title')
or try_call(lambda: get_element_text_and_html_by_tag('h1', webpage)[0])
or traverse_obj(webpage, ({find_element(tag='h1')}, {clean_html}))
or self._html_search_meta(('og:title', 'title', 'twitter:title'), webpage, 'title')),
'description': (self._clean_description(get_element_by_class('ln-text-p', webpage))
or strip_or_none(description)),

View File

@@ -1,30 +1,32 @@
import json
import uuid
from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
join_nonempty,
smuggle_url,
traverse_obj,
try_call,
unsmuggle_url,
urljoin,
)
class LiTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?litv\.tv/(?:vod|promo)/[^/]+/(?:content\.do)?\?.*?\b(?:content_)?id=(?P<id>[^&]+)'
_URL_TEMPLATE = 'https://www.litv.tv/vod/%s/content.do?content_id=%s'
_VALID_URL = r'https?://(?:www\.)?litv\.tv/(?:[^/?#]+/watch/|vod/[^/?#]+/content\.do\?content_id=)(?P<id>[\w-]+)'
_URL_TEMPLATE = 'https://www.litv.tv/%s/watch/%s'
_GEO_COUNTRIES = ['TW']
_TESTS = [{
'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1',
'url': 'https://www.litv.tv/drama/watch/VOD00041610',
'info_dict': {
'id': 'VOD00041606',
'title': '花千骨',
},
'playlist_count': 51, # 50 episodes + 1 trailer
}, {
'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1',
'url': 'https://www.litv.tv/drama/watch/VOD00041610',
'md5': 'b90ff1e9f1d8f5cfcd0a44c3e2b34c7a',
'info_dict': {
'id': 'VOD00041610',
@@ -32,16 +34,15 @@ class LiTVIE(InfoExtractor):
'title': '花千骨第1集',
'thumbnail': r're:https?://.*\.jpg$',
'description': '《花千骨》陸劇線上看。十六年前,平靜的村莊內,一名女嬰隨異相出生,途徑此地的蜀山掌門清虛道長算出此女命運非同一般,她體內散發的異香易招惹妖魔。一念慈悲下,他在村莊周邊設下結界阻擋妖魔入侵,讓其年滿十六後去蜀山,並賜名花千骨。',
'categories': ['奇幻', '愛情', '中國', '仙俠'],
'categories': ['奇幻', '愛情', '仙俠', '古裝'],
'episode': 'Episode 1',
'episode_number': 1,
},
'params': {
'noplaylist': True,
},
'skip': 'Georestricted to Taiwan',
}, {
'url': 'https://www.litv.tv/promo/miyuezhuan/?content_id=VOD00044841&',
'url': 'https://www.litv.tv/drama/watch/VOD00044841',
'md5': '88322ea132f848d6e3e18b32a832b918',
'info_dict': {
'id': 'VOD00044841',
@@ -55,94 +56,62 @@ class LiTVIE(InfoExtractor):
def _extract_playlist(self, playlist_data, content_type):
all_episodes = [
self.url_result(smuggle_url(
self._URL_TEMPLATE % (content_type, episode['contentId']),
self._URL_TEMPLATE % (content_type, episode['content_id']),
{'force_noplaylist': True})) # To prevent infinite recursion
for episode in traverse_obj(playlist_data, ('seasons', ..., 'episode', lambda _, v: v['contentId']))]
for episode in traverse_obj(playlist_data, ('seasons', ..., 'episodes', lambda _, v: v['content_id']))]
return self.playlist_result(all_episodes, playlist_data['contentId'], playlist_data.get('title'))
return self.playlist_result(all_episodes, playlist_data['content_id'], playlist_data.get('title'))
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
vod_data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']
if self._search_regex(
r'(?i)<meta\s[^>]*http-equiv="refresh"\s[^>]*content="[0-9]+;\s*url=https://www\.litv\.tv/"',
webpage, 'meta refresh redirect', default=False, group=0):
raise ExtractorError('No such content found', expected=True)
program_info = traverse_obj(vod_data, ('programInformation', {dict})) or {}
playlist_data = traverse_obj(vod_data, ('seriesTree'))
if playlist_data and self._yes_playlist(program_info.get('series_id'), video_id, smuggled_data):
return self._extract_playlist(playlist_data, program_info.get('content_type'))
program_info = self._parse_json(self._search_regex(
r'var\s+programInfo\s*=\s*([^;]+)', webpage, 'VOD data', default='{}'),
video_id)
asset_id = traverse_obj(program_info, ('assets', 0, 'asset_id', {str}))
if asset_id: # This is a VOD
media_type = 'vod'
else: # This is a live stream
asset_id = program_info['content_id']
media_type = program_info['content_type']
puid = try_call(lambda: self._get_cookies('https://www.litv.tv/')['PUID'].value)
if puid:
endpoint = 'get-urls'
else:
puid = str(uuid.uuid4())
endpoint = 'get-urls-no-auth'
video_data = self._download_json(
f'https://www.litv.tv/api/{endpoint}', video_id,
data=json.dumps({'AssetId': asset_id, 'MediaType': media_type, 'puid': puid}).encode(),
headers={'Content-Type': 'application/json'})
# In browsers `getProgramInfo` request is always issued. Usually this
# endpoint gives the same result as the data embedded in the webpage.
# If, for some reason, there are no embedded data, we do an extra request.
if 'assetId' not in program_info:
program_info = self._download_json(
'https://www.litv.tv/vod/ajax/getProgramInfo', video_id,
query={'contentId': video_id},
headers={'Accept': 'application/json'})
series_id = program_info['seriesId']
if self._yes_playlist(series_id, video_id, smuggled_data):
playlist_data = self._download_json(
'https://www.litv.tv/vod/ajax/getSeriesTree', video_id,
query={'seriesId': series_id}, headers={'Accept': 'application/json'})
return self._extract_playlist(playlist_data, program_info['contentType'])
video_data = self._parse_json(self._search_regex(
r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);',
webpage, 'video data', default='{}'), video_id)
if not video_data:
payload = {'assetId': program_info['assetId']}
puid = try_call(lambda: self._get_cookies('https://www.litv.tv/')['PUID'].value)
if puid:
payload.update({
'type': 'auth',
'puid': puid,
})
endpoint = 'getUrl'
else:
payload.update({
'watchDevices': program_info['watchDevices'],
'contentType': program_info['contentType'],
})
endpoint = 'getMainUrlNoAuth'
video_data = self._download_json(
f'https://www.litv.tv/vod/ajax/{endpoint}', video_id,
data=json.dumps(payload).encode(),
headers={'Content-Type': 'application/json'})
if not video_data.get('fullpath'):
error_msg = video_data.get('errorMessage')
if error_msg == 'vod.error.outsideregionerror':
if error := traverse_obj(video_data, ('error', {dict})):
error_msg = traverse_obj(error, ('message', {str}))
if error_msg and 'OutsideRegionError' in error_msg:
self.raise_geo_restricted('This video is available in Taiwan only')
if error_msg:
elif error_msg:
raise ExtractorError(f'{self.IE_NAME} said: {error_msg}', expected=True)
raise ExtractorError(f'Unexpected result from {self.IE_NAME}')
raise ExtractorError(f'Unexpected error from {self.IE_NAME}')
formats = self._extract_m3u8_formats(
video_data['fullpath'], video_id, ext='mp4',
entry_protocol='m3u8_native', m3u8_id='hls')
video_data['result']['AssetURLs'][0], video_id, ext='mp4', m3u8_id='hls')
for a_format in formats:
# LiTV HLS segments doesn't like compressions
a_format.setdefault('http_headers', {})['Accept-Encoding'] = 'identity'
title = program_info['title'] + program_info.get('secondaryMark', '')
description = program_info.get('description')
thumbnail = program_info.get('imageFile')
categories = [item['name'] for item in program_info.get('category', [])]
episode = int_or_none(program_info.get('episode'))
return {
'id': video_id,
'formats': formats,
'title': title,
'description': description,
'thumbnail': thumbnail,
'categories': categories,
'episode_number': episode,
'title': join_nonempty('title', 'secondary_mark', delim='', from_dict=program_info),
**traverse_obj(program_info, {
'description': ('description', {str}),
'thumbnail': ('picture', {urljoin('https://p-cdnstatic.svc.litv.tv/')}),
'categories': ('genres', ..., 'name', {str}),
'episode_number': ('episode', {int_or_none}),
}),
}

View File

@@ -114,7 +114,7 @@ class LSMLREmbedIE(InfoExtractor):
def _real_extract(self, url):
query = parse_qs(url)
video_id = traverse_obj(query, (
('show', 'id'), 0, {int_or_none}, {lambda x: x or None}, {str_or_none}), get_all=False)
('show', 'id'), 0, {int_or_none}, filter, {str_or_none}), get_all=False)
webpage = self._download_webpage(url, video_id)
player_data, media_data = self._search_regex(

View File

@@ -57,6 +57,6 @@ class MagentaMusikIE(InfoExtractor):
'duration': ('runtimeInSeconds', {int_or_none}),
'location': ('countriesOfProduction', {list}, {lambda x: join_nonempty(*x, delim=', ')}),
'release_year': ('yearOfProduction', {int_or_none}),
'categories': ('mainGenre', {str}, {lambda x: x and [x]}),
'categories': ('mainGenre', {str}, all, filter),
})),
}

View File

@@ -17,7 +17,7 @@ class MediaStreamBaseIE(InfoExtractor):
_BASE_URL_RE = r'https?://mdstrm\.com/(?:embed|live-stream)'
def _extract_mediastream_urls(self, webpage):
yield from traverse_obj(list(self._yield_json_ld(webpage, None, fatal=False)), (
yield from traverse_obj(list(self._yield_json_ld(webpage, None, default={})), (
lambda _, v: v['@type'] == 'VideoObject', ('embedUrl', 'contentUrl'),
{lambda x: x if re.match(rf'{self._BASE_URL_RE}/\w+', x) else None}))

View File

@@ -26,6 +26,7 @@ class MicrosoftEmbedIE(InfoExtractor):
'timestamp': 1631658316,
'upload_date': '20210914',
},
'expected_warnings': ['Failed to parse XML: syntax error: line 1, column 0'],
}]
_API_URL = 'https://prod-video-cms-rt-microsoft-com.akamaized.net/vhs/api/videos/'
@@ -36,11 +37,11 @@ class MicrosoftEmbedIE(InfoExtractor):
formats = []
for source_type, source in metadata['streams'].items():
if source_type == 'smooth_Streaming':
formats.extend(self._extract_ism_formats(source['url'], video_id, 'mss'))
formats.extend(self._extract_ism_formats(source['url'], video_id, 'mss', fatal=False))
elif source_type == 'apple_HTTP_Live_Streaming':
formats.extend(self._extract_m3u8_formats(source['url'], video_id, 'mp4'))
formats.extend(self._extract_m3u8_formats(source['url'], video_id, 'mp4', fatal=False))
elif source_type == 'mPEG_DASH':
formats.extend(self._extract_mpd_formats(source['url'], video_id))
formats.extend(self._extract_mpd_formats(source['url'], video_id, fatal=False))
else:
formats.append({
'format_id': source_type,

View File

@@ -1,291 +0,0 @@
import functools
import json
import uuid
from .common import InfoExtractor
from ..utils import (
ExtractorError,
OnDemandPagedList,
determine_ext,
dict_get,
float_or_none,
traverse_obj,
)
class MildomBaseIE(InfoExtractor):
_GUEST_ID = None
def _call_api(self, url, video_id, query=None, note='Downloading JSON metadata', body=None):
if not self._GUEST_ID:
self._GUEST_ID = f'pc-gp-{uuid.uuid4()}'
content = self._download_json(
url, video_id, note=note, data=json.dumps(body).encode() if body else None,
headers={'Content-Type': 'application/json'} if body else {},
query={
'__guest_id': self._GUEST_ID,
'__platform': 'web',
**(query or {}),
})
if content['code'] != 0:
raise ExtractorError(
f'Mildom says: {content["message"]} (code {content["code"]})',
expected=True)
return content['body']
class MildomIE(MildomBaseIE):
IE_NAME = 'mildom'
IE_DESC = 'Record ongoing live by specific user in Mildom'
_VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/(?P<id>\d+)'
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(f'https://www.mildom.com/{video_id}', video_id)
enterstudio = self._call_api(
'https://cloudac.mildom.com/nonolive/gappserv/live/enterstudio', video_id,
note='Downloading live metadata', query={'user_id': video_id})
result_video_id = enterstudio.get('log_id', video_id)
servers = self._call_api(
'https://cloudac.mildom.com/nonolive/gappserv/live/liveserver', result_video_id,
note='Downloading live server list', query={
'user_id': video_id,
'live_server_type': 'hls',
})
playback_token = self._call_api(
'https://cloudac.mildom.com/nonolive/gappserv/live/token', result_video_id,
note='Obtaining live playback token', body={'host_id': video_id, 'type': 'hls'})
playback_token = traverse_obj(playback_token, ('data', ..., 'token'), get_all=False)
if not playback_token:
raise ExtractorError('Failed to obtain live playback token')
formats = self._extract_m3u8_formats(
f'{servers["stream_server"]}/{video_id}_master.m3u8?{playback_token}',
result_video_id, 'mp4', headers={
'Referer': 'https://www.mildom.com/',
'Origin': 'https://www.mildom.com',
})
for fmt in formats:
fmt.setdefault('http_headers', {})['Referer'] = 'https://www.mildom.com/'
return {
'id': result_video_id,
'title': self._html_search_meta('twitter:description', webpage, default=None) or traverse_obj(enterstudio, 'anchor_intro'),
'description': traverse_obj(enterstudio, 'intro', 'live_intro', expected_type=str),
'timestamp': float_or_none(enterstudio.get('live_start_ms'), scale=1000),
'uploader': self._html_search_meta('twitter:title', webpage, default=None) or traverse_obj(enterstudio, 'loginname'),
'uploader_id': video_id,
'formats': formats,
'is_live': True,
}
class MildomVodIE(MildomBaseIE):
IE_NAME = 'mildom:vod'
IE_DESC = 'VOD in Mildom'
_VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/playback/(?P<user_id>\d+)/(?P<id>(?P=user_id)-[a-zA-Z0-9]+-?[0-9]*)'
_TESTS = [{
'url': 'https://www.mildom.com/playback/10882672/10882672-1597662269',
'info_dict': {
'id': '10882672-1597662269',
'ext': 'mp4',
'title': '始めてのミルダム配信じゃぃ!',
'thumbnail': r're:^https?://.*\.(png|jpg)$',
'upload_date': '20200817',
'duration': 4138.37,
'description': 'ゲームをしたくて!',
'timestamp': 1597662269.0,
'uploader_id': '10882672',
'uploader': 'kson組長(けいそん)',
},
}, {
'url': 'https://www.mildom.com/playback/10882672/10882672-1597758589870-477',
'info_dict': {
'id': '10882672-1597758589870-477',
'ext': 'mp4',
'title': '【kson】感染メイズ麻酔銃で無双する',
'thumbnail': r're:^https?://.*\.(png|jpg)$',
'timestamp': 1597759093.0,
'uploader': 'kson組長(けいそん)',
'duration': 4302.58,
'uploader_id': '10882672',
'description': 'このステージ絶対乗り越えたい',
'upload_date': '20200818',
},
}, {
'url': 'https://www.mildom.com/playback/10882672/10882672-buha9td2lrn97fk2jme0',
'info_dict': {
'id': '10882672-buha9td2lrn97fk2jme0',
'ext': 'mp4',
'title': '【kson組長】CART RACER!!!',
'thumbnail': r're:^https?://.*\.(png|jpg)$',
'uploader_id': '10882672',
'uploader': 'kson組長(けいそん)',
'upload_date': '20201104',
'timestamp': 1604494797.0,
'duration': 4657.25,
'description': 'WTF',
},
}]
def _real_extract(self, url):
user_id, video_id = self._match_valid_url(url).group('user_id', 'id')
webpage = self._download_webpage(f'https://www.mildom.com/playback/{user_id}/{video_id}', video_id)
autoplay = self._call_api(
'https://cloudac.mildom.com/nonolive/videocontent/playback/getPlaybackDetail', video_id,
note='Downloading playback metadata', query={
'v_id': video_id,
})['playback']
formats = [{
'url': autoplay['audio_url'],
'format_id': 'audio',
'protocol': 'm3u8_native',
'vcodec': 'none',
'acodec': 'aac',
'ext': 'm4a',
}]
for fmt in autoplay['video_link']:
formats.append({
'format_id': 'video-{}'.format(fmt['name']),
'url': fmt['url'],
'protocol': 'm3u8_native',
'width': fmt['level'] * autoplay['video_width'] // autoplay['video_height'],
'height': fmt['level'],
'vcodec': 'h264',
'acodec': 'aac',
'ext': 'mp4',
})
return {
'id': video_id,
'title': self._html_search_meta(('og:description', 'description'), webpage, default=None) or autoplay.get('title'),
'description': traverse_obj(autoplay, 'video_intro'),
'timestamp': float_or_none(autoplay.get('publish_time'), scale=1000),
'duration': float_or_none(autoplay.get('video_length'), scale=1000),
'thumbnail': dict_get(autoplay, ('upload_pic', 'video_pic')),
'uploader': traverse_obj(autoplay, ('author_info', 'login_name')),
'uploader_id': user_id,
'formats': formats,
}
class MildomClipIE(MildomBaseIE):
IE_NAME = 'mildom:clip'
IE_DESC = 'Clip in Mildom'
_VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/clip/(?P<id>(?P<user_id>\d+)-[a-zA-Z0-9]+)'
_TESTS = [{
'url': 'https://www.mildom.com/clip/10042245-63921673e7b147ebb0806d42b5ba5ce9',
'info_dict': {
'id': '10042245-63921673e7b147ebb0806d42b5ba5ce9',
'title': '全然違ったよ',
'timestamp': 1619181890,
'duration': 59,
'thumbnail': r're:https?://.+',
'uploader': 'ざきんぽ',
'uploader_id': '10042245',
},
}, {
'url': 'https://www.mildom.com/clip/10111524-ebf4036e5aa8411c99fb3a1ae0902864',
'info_dict': {
'id': '10111524-ebf4036e5aa8411c99fb3a1ae0902864',
'title': 'かっこいい',
'timestamp': 1621094003,
'duration': 59,
'thumbnail': r're:https?://.+',
'uploader': '(ルーキー',
'uploader_id': '10111524',
},
}, {
'url': 'https://www.mildom.com/clip/10660174-2c539e6e277c4aaeb4b1fbe8d22cb902',
'info_dict': {
'id': '10660174-2c539e6e277c4aaeb4b1fbe8d22cb902',
'title': '',
'timestamp': 1614769431,
'duration': 31,
'thumbnail': r're:https?://.+',
'uploader': 'ドルゴルスレンギーン=ダグワドルジ',
'uploader_id': '10660174',
},
}]
def _real_extract(self, url):
user_id, video_id = self._match_valid_url(url).group('user_id', 'id')
webpage = self._download_webpage(f'https://www.mildom.com/clip/{video_id}', video_id)
clip_detail = self._call_api(
'https://cloudac-cf-jp.mildom.com/nonolive/videocontent/clip/detail', video_id,
note='Downloading playback metadata', query={
'clip_id': video_id,
})
return {
'id': video_id,
'title': self._html_search_meta(
('og:description', 'description'), webpage, default=None) or clip_detail.get('title'),
'timestamp': float_or_none(clip_detail.get('create_time')),
'duration': float_or_none(clip_detail.get('length')),
'thumbnail': clip_detail.get('cover'),
'uploader': traverse_obj(clip_detail, ('user_info', 'loginname')),
'uploader_id': user_id,
'url': clip_detail['url'],
'ext': determine_ext(clip_detail.get('url'), 'mp4'),
}
class MildomUserVodIE(MildomBaseIE):
IE_NAME = 'mildom:user:vod'
IE_DESC = 'Download all VODs from specific user in Mildom'
_VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/profile/(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.mildom.com/profile/10093333',
'info_dict': {
'id': '10093333',
'title': 'Uploads from ねこばたけ',
},
'playlist_mincount': 732,
}, {
'url': 'https://www.mildom.com/profile/10882672',
'info_dict': {
'id': '10882672',
'title': 'Uploads from kson組長(けいそん)',
},
'playlist_mincount': 201,
}]
def _fetch_page(self, user_id, page):
page += 1
reply = self._call_api(
'https://cloudac.mildom.com/nonolive/videocontent/profile/playbackList',
user_id, note=f'Downloading page {page}', query={
'user_id': user_id,
'page': page,
'limit': '30',
})
if not reply:
return
for x in reply:
v_id = x.get('v_id')
if not v_id:
continue
yield self.url_result(f'https://www.mildom.com/playback/{user_id}/{v_id}')
def _real_extract(self, url):
user_id = self._match_id(url)
self.to_screen(f'This will download all VODs belonging to user. To download ongoing live video, use "https://www.mildom.com/{user_id}" instead')
profile = self._call_api(
'https://cloudac.mildom.com/nonolive/gappserv/user/profileV2', user_id,
query={'user_id': user_id}, note='Downloading user profile')['user_info']
return self.playlist_result(
OnDemandPagedList(functools.partial(self._fetch_page, user_id), 30),
user_id, f'Uploads from {profile["loginname"]}')

View File

@@ -1,14 +1,13 @@
from .telecinco import TelecincoIE
from .telecinco import TelecincoBaseIE
from ..utils import (
int_or_none,
parse_iso8601,
)
class MiTeleIE(TelecincoIE): # XXX: Do not subclass from concrete IE
class MiTeleIE(TelecincoBaseIE):
IE_DESC = 'mitele.es'
_VALID_URL = r'https?://(?:www\.)?mitele\.es/(?:[^/]+/)+(?P<id>[^/]+)/player'
_TESTS = [{
'url': 'http://www.mitele.es/programas-tv/diario-de/57b0dfb9c715da65618b4afa/player',
'info_dict': {
@@ -27,6 +26,7 @@ class MiTeleIE(TelecincoIE): # XXX: Do not subclass from concrete IE
'timestamp': 1471209401,
'upload_date': '20160814',
},
'skip': 'HTTP Error 404 Not Found',
}, {
# no explicit title
'url': 'http://www.mitele.es/programas-tv/cuarto-milenio/57b0de3dc915da14058b4876/player',
@@ -49,6 +49,26 @@ class MiTeleIE(TelecincoIE): # XXX: Do not subclass from concrete IE
'params': {
'skip_download': True,
},
'skip': 'HTTP Error 404 Not Found',
}, {
'url': 'https://www.mitele.es/programas-tv/horizonte/temporada-5/programa-171-40_013480051/player/',
'info_dict': {
'id': '7adbe22e-cd41-4787-afa4-36f3da7c2c6f',
'ext': 'mp4',
'title': 'Horizonte Temporada 5 Programa 171',
'description': 'md5:97f1fb712c5ac27e5693a8b3c5c0c6e3',
'episode': 'Las Zonas de Bajas Emisiones, a debate',
'episode_number': 171,
'season': 'Season 5',
'season_number': 5,
'series': 'Horizonte',
'duration': 7012,
'upload_date': '20240927',
'timestamp': 1727416450,
'thumbnail': 'https://album.mediaset.es/eimg/2024/09/27/horizonte-171_9f02.jpg',
'age_limit': 12,
},
'params': {'geo_bypass_country': 'ES'},
}, {
'url': 'http://www.mitele.es/series-online/la-que-se-avecina/57aac5c1c915da951a8b45ed/player',
'only_matching': True,
@@ -60,9 +80,9 @@ class MiTeleIE(TelecincoIE): # XXX: Do not subclass from concrete IE
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
pre_player = self._parse_json(self._search_regex(
r'window\.\$REACTBASE_STATE\.prePlayer_mtweb\s*=\s*({.+})',
webpage, 'Pre Player'), display_id)['prePlayer']
pre_player = self._search_json(
r'window\.\$REACTBASE_STATE\.prePlayer_mtweb\s*=',
webpage, 'Pre Player', display_id)['prePlayer']
title = pre_player['title']
video_info = self._parse_content(pre_player['video'], url)
content = pre_player.get('content') or {}

View File

@@ -12,7 +12,7 @@ from ..utils.traversal import traverse_obj
class MixchIE(InfoExtractor):
IE_NAME = 'mixch'
_VALID_URL = r'https?://(?:www\.)?mixch\.tv/u/(?P<id>\d+)'
_VALID_URL = r'https?://mixch\.tv/u/(?P<id>\d+)'
_TESTS = [{
'url': 'https://mixch.tv/u/16943797/live',
@@ -66,7 +66,7 @@ class MixchIE(InfoExtractor):
note='Downloading comments', errnote='Failed to download comments'), (..., {
'author': ('name', {str}),
'author_id': ('user_id', {str_or_none}),
'id': ('message_id', {str}, {lambda x: x or None}),
'id': ('message_id', {str}, filter),
'text': ('body', {str}),
'timestamp': ('created', {int}),
}))
@@ -74,7 +74,7 @@ class MixchIE(InfoExtractor):
class MixchArchiveIE(InfoExtractor):
IE_NAME = 'mixch:archive'
_VALID_URL = r'https?://(?:www\.)?mixch\.tv/archive/(?P<id>\d+)'
_VALID_URL = r'https?://mixch\.tv/archive/(?P<id>\d+)'
_TESTS = [{
'url': 'https://mixch.tv/archive/421',
@@ -116,3 +116,56 @@ class MixchArchiveIE(InfoExtractor):
'formats': self._extract_m3u8_formats(info_json['archiveURL'], video_id),
'thumbnail': traverse_obj(info_json, ('thumbnailURL', {url_or_none})),
}
class MixchMovieIE(InfoExtractor):
IE_NAME = 'mixch:movie'
_VALID_URL = r'https?://mixch\.tv/m/(?P<id>\w+)'
_TESTS = [{
'url': 'https://mixch.tv/m/Ve8KNkJ5',
'info_dict': {
'id': 'Ve8KNkJ5',
'title': '夏☀️\nムービーへのポイントは本イベントに加算されないので配信にてお願い致します🙇🏻\u200d♀️\n#TGCCAMPUS #ミス東大 #ミス東大2024 ',
'ext': 'mp4',
'uploader': 'ミス東大No.5 松藤百香🍑💫',
'uploader_id': '12299174',
'channel_follower_count': int,
'view_count': int,
'like_count': int,
'comment_count': int,
'timestamp': 1724070828,
'uploader_url': 'https://mixch.tv/u/12299174',
'live_status': 'not_live',
'upload_date': '20240819',
},
}, {
'url': 'https://mixch.tv/m/61DzpIKE',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
data = self._download_json(
f'https://mixch.tv/api-web/movies/{video_id}', video_id)
return {
'id': video_id,
'formats': [{
'format_id': 'mp4',
'url': data['movie']['file'],
'ext': 'mp4',
}],
**traverse_obj(data, {
'title': ('movie', 'title', {str}),
'thumbnail': ('movie', 'thumbnailURL', {url_or_none}),
'uploader': ('ownerInfo', 'name', {str}),
'uploader_id': ('ownerInfo', 'id', {int}, {str_or_none}),
'channel_follower_count': ('ownerInfo', 'fan', {int_or_none}),
'view_count': ('ownerInfo', 'view', {int_or_none}),
'like_count': ('movie', 'favCount', {int_or_none}),
'comment_count': ('movie', 'commentCount', {int_or_none}),
'timestamp': ('movie', 'published', {int_or_none}),
'uploader_url': ('ownerInfo', 'id', {lambda x: x and f'https://mixch.tv/u/{x}'}, filter),
}),
'live_status': 'not_live',
}

View File

@@ -4,15 +4,11 @@ from .common import InfoExtractor
from ..utils import (
clean_html,
extract_attributes,
get_element_by_class,
get_element_html_by_class,
get_element_text_and_html_by_tag,
int_or_none,
strip_or_none,
traverse_obj,
try_call,
unified_strdate,
)
from ..utils.traversal import find_element, traverse_obj
class MonstercatIE(InfoExtractor):
@@ -26,19 +22,21 @@ class MonstercatIE(InfoExtractor):
'thumbnail': 'https://www.monstercat.com/release/742779548009/cover',
'release_date': '20230711',
'album': 'The Secret Language of Trees',
'album_artist': 'BT',
'album_artists': ['BT'],
},
}]
def _extract_tracks(self, table, album_meta):
for td in re.findall(r'<tr[^<]*>((?:(?!</tr>)[\w\W])+)', table): # regex by chatgpt due to lack of get_elements_by_tag
title = clean_html(try_call(
lambda: get_element_by_class('d-inline-flex flex-column', td).partition(' <span')[0]))
ids = extract_attributes(try_call(lambda: get_element_html_by_class('btn-play cursor-pointer mr-small', td)) or '')
title = traverse_obj(td, (
{find_element(cls='d-inline-flex flex-column')},
{lambda x: x.partition(' <span')}, 0, {clean_html}))
ids = traverse_obj(td, (
{find_element(cls='btn-play cursor-pointer mr-small', html=True)}, {extract_attributes})) or {}
track_id = ids.get('data-track-id')
release_id = ids.get('data-release-id')
track_number = int_or_none(try_call(lambda: get_element_by_class('py-xsmall', td)))
track_number = traverse_obj(td, ({find_element(cls='py-xsmall')}, {int_or_none}))
if not track_id or not release_id:
self.report_warning(f'Skipping track {track_number}, ID(s) not found')
self.write_debug(f'release_id={release_id!r} track_id={track_id!r}')
@@ -48,7 +46,7 @@ class MonstercatIE(InfoExtractor):
'title': title,
'track': title,
'track_number': track_number,
'artist': clean_html(try_call(lambda: get_element_by_class('d-block fs-xxsmall', td))),
'artists': traverse_obj(td, ({find_element(cls='d-block fs-xxsmall')}, {clean_html}, all)),
'url': f'https://www.monstercat.com/api/release/{release_id}/track-stream/{track_id}',
'id': track_id,
'ext': 'mp3',
@@ -57,20 +55,19 @@ class MonstercatIE(InfoExtractor):
def _real_extract(self, url):
url_id = self._match_id(url)
html = self._download_webpage(url, url_id)
# wrap all `get_elements` in `try_call`, HTMLParser has problems with site's html
tracklist_table = try_call(lambda: get_element_by_class('table table-small', html)) or ''
title = try_call(lambda: get_element_text_and_html_by_tag('h1', html)[0])
date = traverse_obj(html, ({lambda html: get_element_by_class('font-italic mb-medium d-tablet-none d-phone-block',
html).partition('Released ')}, 2, {strip_or_none}, {unified_strdate}))
# NB: HTMLParser may choke on this html; use {find_element} or try_call(lambda: get_element...)
tracklist_table = traverse_obj(html, {find_element(cls='table table-small')}) or ''
title = traverse_obj(html, ({find_element(tag='h1')}, {clean_html}))
album_meta = {
'title': title,
'album': title,
'thumbnail': f'https://www.monstercat.com/release/{url_id}/cover',
'album_artist': try_call(
lambda: get_element_by_class('h-normal text-uppercase mb-desktop-medium mb-smallish', html)),
'release_date': date,
'album_artists': traverse_obj(html, (
{find_element(cls='h-normal text-uppercase mb-desktop-medium mb-smallish')}, {clean_html}, all)),
'release_date': traverse_obj(html, (
{find_element(cls='font-italic mb-medium d-tablet-none d-phone-block')},
{lambda x: x.partition('Released ')}, 2, {strip_or_none}, {unified_strdate})),
}
return self.playlist_result(

View File

@@ -86,7 +86,7 @@ class NebulaBaseIE(InfoExtractor):
def _extract_video_metadata(self, episode):
channel_url = traverse_obj(
episode, (('channel_slug', 'class_slug'), {lambda x: urljoin('https://nebula.tv/', x)}), get_all=False)
episode, (('channel_slug', 'class_slug'), {urljoin('https://nebula.tv/')}), get_all=False)
return {
'id': episode['id'].partition(':')[2],
**traverse_obj(episode, {

View File

@@ -6,12 +6,10 @@ from ..utils import (
determine_ext,
extract_attributes,
get_element_by_class,
get_element_text_and_html_by_tag,
parse_duration,
traverse_obj,
try_call,
url_or_none,
)
from ..utils.traversal import find_element, traverse_obj
class NekoHackerIE(InfoExtractor):
@@ -35,7 +33,7 @@ class NekoHackerIE(InfoExtractor):
'acodec': 'mp3',
'release_date': '20221101',
'album': 'Nekoverse',
'artist': 'Neko Hacker',
'artists': ['Neko Hacker'],
'track': 'Spaceship',
'track_number': 1,
'duration': 195.0,
@@ -53,7 +51,7 @@ class NekoHackerIE(InfoExtractor):
'acodec': 'mp3',
'release_date': '20221101',
'album': 'Nekoverse',
'artist': 'Neko Hacker',
'artists': ['Neko Hacker'],
'track': 'City Runner',
'track_number': 2,
'duration': 148.0,
@@ -71,7 +69,7 @@ class NekoHackerIE(InfoExtractor):
'acodec': 'mp3',
'release_date': '20221101',
'album': 'Nekoverse',
'artist': 'Neko Hacker',
'artists': ['Neko Hacker'],
'track': 'Nature Talk',
'track_number': 3,
'duration': 174.0,
@@ -89,7 +87,7 @@ class NekoHackerIE(InfoExtractor):
'acodec': 'mp3',
'release_date': '20221101',
'album': 'Nekoverse',
'artist': 'Neko Hacker',
'artists': ['Neko Hacker'],
'track': 'Crystal World',
'track_number': 4,
'duration': 199.0,
@@ -115,7 +113,7 @@ class NekoHackerIE(InfoExtractor):
'acodec': 'mp3',
'release_date': '20210115',
'album': '進め!むじなカンパニー',
'artist': 'Neko Hacker',
'artists': ['Neko Hacker'],
'track': 'md5:1a5fcbc96ca3c3265b1c6f9f79f30fd0',
'track_number': 1,
},
@@ -132,7 +130,7 @@ class NekoHackerIE(InfoExtractor):
'acodec': 'mp3',
'release_date': '20210115',
'album': '進め!むじなカンパニー',
'artist': 'Neko Hacker',
'artists': ['Neko Hacker'],
'track': 'むじな de なじむ feat. 六科なじむ (CV: 日高里菜 )',
'track_number': 2,
},
@@ -149,7 +147,7 @@ class NekoHackerIE(InfoExtractor):
'acodec': 'mp3',
'release_date': '20210115',
'album': '進め!むじなカンパニー',
'artist': 'Neko Hacker',
'artists': ['Neko Hacker'],
'track': '進め!むじなカンパニー (instrumental)',
'track_number': 3,
},
@@ -166,7 +164,7 @@ class NekoHackerIE(InfoExtractor):
'acodec': 'mp3',
'release_date': '20210115',
'album': '進め!むじなカンパニー',
'artist': 'Neko Hacker',
'artists': ['Neko Hacker'],
'track': 'むじな de なじむ (instrumental)',
'track_number': 4,
},
@@ -181,14 +179,17 @@ class NekoHackerIE(InfoExtractor):
playlist = get_element_by_class('playlist', webpage)
if not playlist:
iframe = try_call(lambda: get_element_text_and_html_by_tag('iframe', webpage)[1]) or ''
iframe_src = url_or_none(extract_attributes(iframe).get('src'))
iframe_src = traverse_obj(webpage, (
{find_element(tag='iframe', html=True)}, {extract_attributes}, 'src', {url_or_none}))
if not iframe_src:
raise ExtractorError('No playlist or embed found in webpage')
elif re.match(r'https?://(?:\w+\.)?spotify\.com/', iframe_src):
raise ExtractorError('Spotify embeds are not supported', expected=True)
return self.url_result(url, 'Generic')
player_params = self._search_json(
r'var srp_player_params_[\da-f]+\s*=', webpage, 'player params', playlist_id, default={})
entries = []
for track_number, track in enumerate(re.findall(r'(<li[^>]+data-audiopath[^>]+>)', playlist), 1):
entry = traverse_obj(extract_attributes(track), {
@@ -200,12 +201,12 @@ class NekoHackerIE(InfoExtractor):
'album': 'data-albumtitle',
'duration': ('data-tracktime', {parse_duration}),
'release_date': ('data-releasedate', {lambda x: re.match(r'\d{8}', x.replace('.', ''))}, 0),
'thumbnail': ('data-albumart', {url_or_none}),
})
entries.append({
**entry,
'thumbnail': url_or_none(player_params.get('artwork')),
'track_number': track_number,
'artist': 'Neko Hacker',
'artists': ['Neko Hacker'],
'vcodec': 'none',
'acodec': 'mp3' if entry['ext'] == 'mp3' else None,
})

View File

@@ -36,10 +36,6 @@ class NetEaseMusicBaseIE(InfoExtractor):
_API_BASE = 'http://music.163.com/api/'
_GEO_BYPASS = False
@staticmethod
def _kilo_or_none(value):
return int_or_none(value, scale=1000)
def _create_eapi_cipher(self, api_path, query_body, cookies):
request_text = json.dumps({**query_body, 'header': cookies}, separators=(',', ':'))
@@ -101,7 +97,7 @@ class NetEaseMusicBaseIE(InfoExtractor):
'vcodec': 'none',
**traverse_obj(song, {
'ext': ('type', {str}),
'abr': ('br', {self._kilo_or_none}),
'abr': ('br', {int_or_none(scale=1000)}),
'filesize': ('size', {int_or_none}),
}),
})
@@ -282,9 +278,9 @@ class NetEaseMusicIE(NetEaseMusicBaseIE):
**lyric_data,
**traverse_obj(info, {
'title': ('name', {str}),
'timestamp': ('album', 'publishTime', {self._kilo_or_none}),
'timestamp': ('album', 'publishTime', {int_or_none(scale=1000)}),
'thumbnail': ('album', 'picUrl', {url_or_none}),
'duration': ('duration', {self._kilo_or_none}),
'duration': ('duration', {int_or_none(scale=1000)}),
'album': ('album', 'name', {str}),
'average_rating': ('score', {int_or_none}),
}),
@@ -440,7 +436,7 @@ class NetEaseMusicListIE(NetEaseMusicBaseIE):
'tags': ('tags', ..., {str}),
'uploader': ('creator', 'nickname', {str}),
'uploader_id': ('creator', 'userId', {str_or_none}),
'timestamp': ('updateTime', {self._kilo_or_none}),
'timestamp': ('updateTime', {int_or_none(scale=1000)}),
}))
if traverse_obj(info, ('playlist', 'specialType')) == 10:
metainfo['title'] = f'{metainfo.get("title")} {strftime_or_none(metainfo.get("timestamp"), "%Y-%m-%d")}'
@@ -517,10 +513,10 @@ class NetEaseMusicMvIE(NetEaseMusicBaseIE):
'creators': traverse_obj(info, ('artists', ..., 'name')) or [info.get('artistName')],
**traverse_obj(info, {
'title': ('name', {str}),
'description': (('desc', 'briefDesc'), {str}, {lambda x: x or None}),
'description': (('desc', 'briefDesc'), {str}, filter),
'upload_date': ('publishTime', {unified_strdate}),
'thumbnail': ('cover', {url_or_none}),
'duration': ('duration', {self._kilo_or_none}),
'duration': ('duration', {int_or_none(scale=1000)}),
'view_count': ('playCount', {int_or_none}),
'like_count': ('likeCount', {int_or_none}),
'comment_count': ('commentCount', {int_or_none}),
@@ -588,7 +584,7 @@ class NetEaseMusicProgramIE(NetEaseMusicBaseIE):
'description': ('description', {str}),
'creator': ('dj', 'brand', {str}),
'thumbnail': ('coverUrl', {url_or_none}),
'timestamp': ('createTime', {self._kilo_or_none}),
'timestamp': ('createTime', {int_or_none(scale=1000)}),
})
if not self._yes_playlist(
@@ -598,7 +594,7 @@ class NetEaseMusicProgramIE(NetEaseMusicBaseIE):
return {
'id': str(info['mainSong']['id']),
'formats': formats,
'duration': traverse_obj(info, ('mainSong', 'duration', {self._kilo_or_none})),
'duration': traverse_obj(info, ('mainSong', 'duration', {int_or_none(scale=1000)})),
**metainfo,
}

View File

@@ -371,7 +371,7 @@ class NexxIE(InfoExtractor):
# not all videos work via arc, e.g. nexx:741:1269984
if not video:
# Reverse engineered from JS code (see getDeviceID function)
device_id = f'{random.randint(1, 4)}:{int(time.time())}:{random.randint(1e4, 99999)}{random.randint(1, 9)}'
device_id = f'{random.randint(1, 4)}:{int(time.time())}:{random.randint(10000, 99999)}{random.randint(1, 9)}'
result = self._call_api(domain_id, 'session/init', video_id, data={
'nxp_devh': device_id,

View File

@@ -11,9 +11,12 @@ from ..utils import (
clean_html,
determine_ext,
get_element_by_class,
traverse_obj,
int_or_none,
make_archive_id,
url_or_none,
urlencode_postdata,
)
from ..utils.traversal import traverse_obj
class NFLBaseIE(InfoExtractor):
@@ -75,22 +78,15 @@ class NFLBaseIE(InfoExtractor):
'osVersion': '10.0',
}, separators=(',', ':')).encode()).decode(),
'networkType': 'other',
'nflClaimGroupsToAdd': [],
'nflClaimGroupsToRemove': [],
'peacockUUID': 'undefined',
}
_ACCOUNT_INFO = {}
_API_KEY = None
_API_KEY = '3_Qa8TkWpIB8ESCBT8tY2TukbVKgO5F6BJVc7N1oComdwFzI7H2L9NOWdm11i_BY9f'
_TOKEN = None
_TOKEN_EXPIRY = 0
def _get_account_info(self, url, slug):
if not self._API_KEY:
webpage = self._download_webpage(url, slug, fatal=False) or ''
self._API_KEY = self._search_regex(
r'window\.gigyaApiKey\s*=\s*["\'](\w+)["\'];', webpage, 'API key',
fatal=False) or '3_Qa8TkWpIB8ESCBT8tY2TukbVKgO5F6BJVc7N1oComdwFzI7H2L9NOWdm11i_BY9f'
def _get_account_info(self):
cookies = self._get_cookies('https://auth-id.nfl.com/')
login_token = traverse_obj(cookies, (
(f'glt_{self._API_KEY}', lambda k, _: k.startswith('glt_')), {lambda x: x.value}), get_all=False)
@@ -103,7 +99,7 @@ class NFLBaseIE(InfoExtractor):
'or else try using --cookies-from-browser instead', expected=True)
account = self._download_json(
'https://auth-id.nfl.com/accounts.getAccountInfo', slug,
'https://auth-id.nfl.com/accounts.getAccountInfo', None,
note='Downloading account info', data=urlencode_postdata({
'include': 'profile,data',
'lang': 'en',
@@ -111,7 +107,7 @@ class NFLBaseIE(InfoExtractor):
'sdk': 'js_latest',
'login_token': login_token,
'authMode': 'cookie',
'pageURL': url,
'pageURL': 'https://www.nfl.com/',
'sdkBuild': traverse_obj(cookies, (
'gig_canary_ver', {lambda x: x.value.partition('-')[0]}), default='15170'),
'format': 'json',
@@ -126,55 +122,78 @@ class NFLBaseIE(InfoExtractor):
if len(self._ACCOUNT_INFO) != 3:
raise ExtractorError('Failed to retrieve account info with provided cookies', expected=True)
def _get_auth_token(self, url, slug):
def _get_auth_token(self):
if self._TOKEN and self._TOKEN_EXPIRY > int(time.time() + 30):
return
if not self._ACCOUNT_INFO:
self._get_account_info(url, slug)
token = self._download_json(
'https://api.nfl.com/identity/v3/token%s' % (
'/refresh' if self._ACCOUNT_INFO.get('refreshToken') else ''),
slug, headers={'Content-Type': 'application/json'}, note='Downloading access token',
None, headers={'Content-Type': 'application/json'}, note='Downloading access token',
data=json.dumps({**self._CLIENT_DATA, **self._ACCOUNT_INFO}, separators=(',', ':')).encode())
self._TOKEN = token['accessToken']
self._TOKEN_EXPIRY = token['expiresIn']
self._ACCOUNT_INFO['refreshToken'] = token['refreshToken']
def _extract_video(self, mcp_id, is_live=False):
self._get_auth_token()
data = self._download_json(
f'https://api.nfl.com/play/v1/asset/{mcp_id}', mcp_id, headers={
'Authorization': f'Bearer {self._TOKEN}',
'Accept': 'application/json',
'Content-Type': 'application/json',
}, data=json.dumps({'init': True, 'live': is_live}, separators=(',', ':')).encode())
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
data['accessUrl'], mcp_id, 'mp4', m3u8_id='hls')
return {
'id': mcp_id,
'formats': formats,
'subtitles': subtitles,
'is_live': is_live,
'_old_archive_ids': [make_archive_id(AnvatoIE, mcp_id)],
**traverse_obj(data, ('metadata', {
'title': ('event', ('def_title', 'friendlyName'), {str}, any),
'description': ('event', 'def_description', {str}),
'duration': ('event', 'duration', {int_or_none}),
'thumbnails': ('thumbnails', ..., 'url', {'url': {url_or_none}}),
})),
}
def _parse_video_config(self, video_config, display_id):
video_config = self._parse_json(video_config, display_id)
is_live = traverse_obj(video_config, ('live', {bool})) or False
item = video_config['playlist'][0]
mcp_id = item.get('mcpID')
if mcp_id:
info = self.url_result(f'{self._ANVATO_PREFIX}{mcp_id}', AnvatoIE, mcp_id)
if mcp_id := item.get('mcpID'):
return self._extract_video(mcp_id, is_live=is_live)
info = {'id': item.get('id') or item['entityId']}
item_url = item['url']
ext = determine_ext(item_url)
if ext == 'm3u8':
info['formats'] = self._extract_m3u8_formats(item_url, info['id'], 'mp4')
else:
media_id = item.get('id') or item['entityId']
title = item.get('title')
item_url = item['url']
info = {'id': media_id}
ext = determine_ext(item_url)
if ext == 'm3u8':
info['formats'] = self._extract_m3u8_formats(item_url, media_id, 'mp4')
else:
info['url'] = item_url
if item.get('audio') is True:
info['vcodec'] = 'none'
is_live = video_config.get('live') is True
thumbnails = None
image_url = item.get(item.get('imageSrc')) or item.get(item.get('posterImage'))
if image_url:
thumbnails = [{
'url': image_url,
'ext': determine_ext(image_url, 'jpg'),
}]
info.update({
'title': title,
'is_live': is_live,
'description': clean_html(item.get('description')),
'thumbnails': thumbnails,
})
info['url'] = item_url
if item.get('audio') is True:
info['vcodec'] = 'none'
thumbnails = None
if image_url := traverse_obj(item, 'imageSrc', 'posterImage', expected_type=url_or_none):
thumbnails = [{
'url': image_url,
'ext': determine_ext(image_url, 'jpg'),
}]
info.update({
**traverse_obj(item, {
'title': ('title', {str}),
'description': ('description', {clean_html}),
}),
'is_live': is_live,
'thumbnails': thumbnails,
})
return info
@@ -188,24 +207,20 @@ class NFLIE(NFLBaseIE):
'ext': 'mp4',
'title': "Baker Mayfield's game-changing plays from 3-TD game Week 14",
'description': 'md5:85e05a3cc163f8c344340f220521136d',
'upload_date': '20201215',
'timestamp': 1608009755,
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'NFL',
'tags': 'count:6',
'thumbnail': r're:https?://.+\.jpg',
'duration': 157,
'categories': 'count:3',
'_old_archive_ids': ['anvato 899441'],
},
}, {
'url': 'https://www.chiefs.com/listen/patrick-mahomes-travis-kelce-react-to-win-over-dolphins-the-breakdown',
'md5': '6886b32c24b463038c760ceb55a34566',
'md5': '92a517f05bd3eb50fe50244bc621aec8',
'info_dict': {
'id': 'd87e8790-3e14-11eb-8ceb-ff05c2867f99',
'id': '8b7c3625-a461-4751-8db4-85f536f2bbd0',
'ext': 'mp3',
'title': 'Patrick Mahomes, Travis Kelce React to Win Over Dolphins | The Breakdown',
'description': 'md5:12ada8ee70e6762658c30e223e095075',
'thumbnail': 'https://static.clubs.nfl.com/image/private/t_editorial_landscape_12_desktop/v1571153441/chiefs/rfljejccnyhhkpkfq855',
},
'skip': 'HTTP Error 404: Not Found',
}, {
'url': 'https://www.buffalobills.com/video/buffalo-bills-military-recognition-week-14',
'only_matching': True,
@@ -236,13 +251,16 @@ class NFLArticleIE(NFLBaseIE):
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
entries = []
for video_config in re.findall(self._VIDEO_CONFIG_REGEX, webpage):
entries.append(self._parse_video_config(video_config, display_id))
def entries():
for video_config in re.findall(self._VIDEO_CONFIG_REGEX, webpage):
yield self._parse_video_config(video_config, display_id)
title = clean_html(get_element_by_class(
'nfl-c-article__title', webpage)) or self._html_search_meta(
['og:title', 'twitter:title'], webpage)
return self.playlist_result(entries, display_id, title)
return self.playlist_result(entries(), display_id, title)
class NFLPlusReplayIE(NFLBaseIE):
@@ -307,6 +325,9 @@ class NFLPlusReplayIE(NFLBaseIE):
'all_22': 'All-22',
}
def _real_initialize(self):
self._get_account_info()
def _real_extract(self, url):
slug, video_id = self._match_valid_url(url).group('slug', 'id')
requested_types = self._configuration_arg('type', ['all'])
@@ -315,7 +336,7 @@ class NFLPlusReplayIE(NFLBaseIE):
requested_types = traverse_obj(self._REPLAY_TYPES, (None, requested_types))
if not video_id:
self._get_auth_token(url, slug)
self._get_auth_token()
headers = {'Authorization': f'Bearer {self._TOKEN}'}
game_id = self._download_json(
f'https://api.nfl.com/football/v2/games/externalId/slug/{slug}', slug,
@@ -328,14 +349,13 @@ class NFLPlusReplayIE(NFLBaseIE):
'items', lambda _, v: v['subType'] == requested_types[0], 'mcpPlaybackId'), get_all=False)
if video_id:
return self.url_result(f'{self._ANVATO_PREFIX}{video_id}', AnvatoIE, video_id)
return self._extract_video(video_id)
def entries():
for replay in traverse_obj(
replays, ('items', lambda _, v: v['mcpPlaybackId'] and v['subType'] in requested_types),
):
video_id = replay['mcpPlaybackId']
yield self.url_result(f'{self._ANVATO_PREFIX}{video_id}', AnvatoIE, video_id)
yield self._extract_video(replay['mcpPlaybackId'])
return self.playlist_result(entries(), slug)
@@ -362,12 +382,15 @@ class NFLPlusEpisodeIE(NFLBaseIE):
'params': {'skip_download': 'm3u8'},
}]
def _real_initialize(self):
self._get_account_info()
def _real_extract(self, url):
slug = self._match_id(url)
self._get_auth_token(url, slug)
self._get_auth_token()
video_id = self._download_json(
f'https://api.nfl.com/content/v1/videos/episodes/{slug}', slug, headers={
'Authorization': f'Bearer {self._TOKEN}',
})['mcpPlaybackId']
return self.url_result(f'{self._ANVATO_PREFIX}{video_id}', AnvatoIE, video_id)
return self._extract_video(video_id)

View File

@@ -371,11 +371,11 @@ class NiconicoIE(InfoExtractor):
'acodec': 'aac',
'vcodec': 'h264',
**traverse_obj(audio_quality, ('metadata', {
'abr': ('bitrate', {functools.partial(float_or_none, scale=1000)}),
'abr': ('bitrate', {float_or_none(scale=1000)}),
'asr': ('samplingRate', {int_or_none}),
})),
**traverse_obj(video_quality, ('metadata', {
'vbr': ('bitrate', {functools.partial(float_or_none, scale=1000)}),
'vbr': ('bitrate', {float_or_none(scale=1000)}),
'height': ('resolution', 'height', {int_or_none}),
'width': ('resolution', 'width', {int_or_none}),
})),
@@ -428,7 +428,7 @@ class NiconicoIE(InfoExtractor):
**audio_fmt,
**traverse_obj(audios, (lambda _, v: audio_fmt['format_id'].startswith(v['id']), {
'format_id': ('id', {str}),
'abr': ('bitRate', {functools.partial(float_or_none, scale=1000)}),
'abr': ('bitRate', {float_or_none(scale=1000)}),
'asr': ('samplingRate', {int_or_none}),
}), get_all=False),
'acodec': 'aac',
@@ -869,7 +869,7 @@ class NicovideoTagURLIE(NicovideoSearchBaseIE):
class NiconicoUserIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/user/(?P<id>\d+)/?(?:$|[#?])'
_VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/user/(?P<id>\d+)(?:/video)?/?(?:$|[#?])'
_TEST = {
'url': 'https://www.nicovideo.jp/user/419948',
'info_dict': {
@@ -877,7 +877,7 @@ class NiconicoUserIE(InfoExtractor):
},
'playlist_mincount': 101,
}
_API_URL = 'https://nvapi.nicovideo.jp/v1/users/%s/videos?sortKey=registeredAt&sortOrder=desc&pageSize=%s&page=%s'
_API_URL = 'https://nvapi.nicovideo.jp/v2/users/%s/videos?sortKey=registeredAt&sortOrder=desc&pageSize=%s&page=%s'
_PAGE_SIZE = 100
_API_HEADERS = {
@@ -897,12 +897,13 @@ class NiconicoUserIE(InfoExtractor):
total_count = int_or_none(json_parsed['data'].get('totalCount'))
for entry in json_parsed['data']['items']:
count += 1
yield self.url_result('https://www.nicovideo.jp/watch/{}'.format(entry['id']))
yield self.url_result(
f'https://www.nicovideo.jp/watch/{entry["essential"]["id"]}', ie=NiconicoIE)
page_num += 1
def _real_extract(self, url):
list_id = self._match_id(url)
return self.playlist_result(self._entries(list_id), list_id, ie=NiconicoIE.ie_key())
return self.playlist_result(self._entries(list_id), list_id)
class NiconicoLiveIE(InfoExtractor):

View File

@@ -10,10 +10,10 @@ from ..utils import (
get_element_html_by_class,
get_elements_by_class,
int_or_none,
try_call,
unified_timestamp,
urlencode_postdata,
)
from ..utils.traversal import find_element, find_elements, traverse_obj
class NubilesPornIE(InfoExtractor):
@@ -70,9 +70,8 @@ class NubilesPornIE(InfoExtractor):
url, get_element_by_class('watch-page-video-wrapper', page), video_id)[0]
channel_id, channel_name = self._search_regex(
r'/video/website/(?P<id>\d+).+>(?P<name>\w+).com', get_element_html_by_class('site-link', page),
r'/video/website/(?P<id>\d+).+>(?P<name>\w+).com', get_element_html_by_class('site-link', page) or '',
'channel', fatal=False, group=('id', 'name')) or (None, None)
channel_name = re.sub(r'([^A-Z]+)([A-Z]+)', r'\1 \2', channel_name)
return {
'id': video_id,
@@ -82,14 +81,14 @@ class NubilesPornIE(InfoExtractor):
'thumbnail': media_entries.get('thumbnail'),
'description': clean_html(get_element_html_by_class('content-pane-description', page)),
'timestamp': unified_timestamp(get_element_by_class('date', page)),
'channel': channel_name,
'channel': re.sub(r'([^A-Z]+)([A-Z]+)', r'\1 \2', channel_name) if channel_name else None,
'channel_id': channel_id,
'channel_url': format_field(channel_id, None, 'https://members.nubiles-porn.com/video/website/%s'),
'like_count': int_or_none(get_element_by_id('likecount', page)),
'average_rating': float_or_none(get_element_by_class('score', page)),
'age_limit': 18,
'categories': try_call(lambda: list(map(clean_html, get_elements_by_class('btn', get_element_by_class('categories', page))))),
'tags': try_call(lambda: list(map(clean_html, get_elements_by_class('btn', get_elements_by_class('tags', page)[1])))),
'categories': traverse_obj(page, ({find_element(cls='categories')}, {find_elements(cls='btn')}, ..., {clean_html})),
'tags': traverse_obj(page, ({find_elements(cls='tags')}, 1, {find_elements(cls='btn')}, ..., {clean_html})),
'cast': get_elements_by_class('content-pane-performer', page),
'availability': 'needs_auth',
'series': channel_name,

View File

@@ -235,7 +235,7 @@ class NYTimesArticleIE(NYTimesBaseIE):
details = traverse_obj(block, {
'id': ('sourceId', {str}),
'uploader': ('bylines', ..., 'renderedRepresentation', {str}),
'duration': (None, (('duration', {lambda x: float_or_none(x, scale=1000)}), ('length', {int_or_none}))),
'duration': (None, (('duration', {float_or_none(scale=1000)}), ('length', {int_or_none}))),
'timestamp': ('firstPublished', {parse_iso8601}),
'series': ('podcastSeries', {str}),
}, get_all=False)
@@ -343,7 +343,7 @@ class NYTimesCookingIE(NYTimesBaseIE):
if media_ids:
media_ids.append(lead_video_id)
return self.playlist_result(
[self._extract_video(media_id) for media_id in media_ids], page_id, title, description)
map(self._extract_video, media_ids), page_id, title, description)
return {
**self._extract_video(lead_video_id),

View File

@@ -115,7 +115,7 @@ class OnDemandKoreaIE(InfoExtractor):
**traverse_obj(data, {
'thumbnail': ('episode', 'images', 'thumbnail', {url_or_none}),
'release_date': ('episode', 'release_date', {lambda x: x.replace('-', '')}, {unified_strdate}),
'duration': ('duration', {functools.partial(float_or_none, scale=1000)}),
'duration': ('duration', {float_or_none(scale=1000)}),
'age_limit': ('age_rating', 'name', {lambda x: x.replace('R', '')}, {parse_age_limit}),
'series': ('episode', {if_series(key='program')}, 'title'),
'series_id': ('episode', {if_series(key='program')}, 'id', {str_or_none}),

View File

@@ -1,5 +1,4 @@
import base64
import functools
import re
from .common import InfoExtractor
@@ -192,7 +191,7 @@ class ORFPodcastIE(InfoExtractor):
'ext': ('enclosures', 0, 'type', {mimetype2ext}),
'title': 'title',
'description': ('description', {clean_html}),
'duration': ('duration', {functools.partial(float_or_none, scale=1000)}),
'duration': ('duration', {float_or_none(scale=1000)}),
'series': ('podcast', 'title'),
})),
}
@@ -494,7 +493,7 @@ class ORFONIE(InfoExtractor):
return traverse_obj(api_json, {
'id': ('id', {int}, {str_or_none}),
'age_limit': ('age_classification', {parse_age_limit}),
'duration': ('exact_duration', {functools.partial(float_or_none, scale=1000)}),
'duration': ('exact_duration', {float_or_none(scale=1000)}),
'title': (('title', 'headline'), {str}),
'description': (('description', 'teaser_text'), {str}),
'media_type': ('video_type', {str}),

View File

@@ -1,5 +1,3 @@
import functools
from .common import InfoExtractor
from .youtube import YoutubeIE
from ..utils import (
@@ -83,7 +81,7 @@ class ParlerIE(InfoExtractor):
'timestamp': ('date_created', {unified_timestamp}),
'uploader': ('user', 'name', {strip_or_none}),
'uploader_id': ('user', 'username', {str}),
'uploader_url': ('user', 'username', {functools.partial(urljoin, 'https://parler.com/')}),
'uploader_url': ('user', 'username', {urljoin('https://parler.com/')}),
'view_count': ('views', {int_or_none}),
'comment_count': ('total_comments', {int_or_none}),
'repost_count': ('echos', {int_or_none}),

View File

@@ -16,10 +16,10 @@ from ..utils import (
parse_iso8601,
smuggle_url,
str_or_none,
traverse_obj,
url_or_none,
urljoin,
)
from ..utils.traversal import traverse_obj, value
class PatreonBaseIE(InfoExtractor):
@@ -252,6 +252,27 @@ class PatreonIE(PatreonBaseIE):
'thumbnail': r're:^https?://.+',
},
'skip': 'Patron-only content',
}, {
# Contains a comment reply in the 'included' section
'url': 'https://www.patreon.com/posts/114721679',
'info_dict': {
'id': '114721679',
'ext': 'mp4',
'upload_date': '20241025',
'uploader': 'Japanalysis',
'like_count': int,
'thumbnail': r're:^https?://.+',
'comment_count': int,
'title': 'Karasawa Part 2',
'description': 'Part 2 of this video https://www.youtube.com/watch?v=Azms2-VTASk',
'uploader_url': 'https://www.patreon.com/japanalysis',
'uploader_id': '80504268',
'channel_url': 'https://www.patreon.com/japanalysis',
'channel_follower_count': int,
'timestamp': 1729897015,
'channel_id': '9346307',
},
'params': {'getcomments': True},
}]
_RETURN_TYPE = 'video'
@@ -404,26 +425,24 @@ class PatreonIE(PatreonBaseIE):
f'posts/{post_id}/comments', post_id, query=params, note=f'Downloading comments page {page}')
cursor = None
for comment in traverse_obj(response, (('data', ('included', lambda _, v: v['type'] == 'comment')), ...)):
for comment in traverse_obj(response, (('data', 'included'), lambda _, v: v['type'] == 'comment' and v['id'])):
count += 1
comment_id = comment.get('id')
attributes = comment.get('attributes') or {}
if comment_id is None:
continue
author_id = traverse_obj(comment, ('relationships', 'commenter', 'data', 'id'))
author_info = traverse_obj(
response, ('included', lambda _, v: v['id'] == author_id and v['type'] == 'user', 'attributes'),
get_all=False, expected_type=dict, default={})
yield {
'id': comment_id,
'text': attributes.get('body'),
'timestamp': parse_iso8601(attributes.get('created')),
'parent': traverse_obj(comment, ('relationships', 'parent', 'data', 'id'), default='root'),
'author_is_uploader': attributes.get('is_by_creator'),
**traverse_obj(comment, {
'id': ('id', {str_or_none}),
'text': ('attributes', 'body', {str}),
'timestamp': ('attributes', 'created', {parse_iso8601}),
'parent': ('relationships', 'parent', 'data', ('id', {value('root')}), {str}, any),
'author_is_uploader': ('attributes', 'is_by_creator', {bool}),
}),
**traverse_obj(response, (
'included', lambda _, v: v['id'] == author_id and v['type'] == 'user', 'attributes', {
'author': ('full_name', {str}),
'author_thumbnail': ('image_url', {url_or_none}),
}), get_all=False),
'author_id': author_id,
'author': author_info.get('full_name'),
'author_thumbnail': author_info.get('image_url'),
}
if count < traverse_obj(response, ('meta', 'count')):
@@ -438,7 +457,7 @@ class PatreonCampaignIE(PatreonBaseIE):
_VALID_URL = r'''(?x)
https?://(?:www\.)?patreon\.com/(?:
(?:m|api/campaigns)/(?P<campaign_id>\d+)|
(?P<vanity>(?!creation[?/]|posts/|rss[?/])[\w-]+)
(?:c/)?(?P<vanity>(?!creation[?/]|posts/|rss[?/])[\w-]+)
)(?:/posts)?/?(?:$|[?#])'''
_TESTS = [{
'url': 'https://www.patreon.com/dissonancepod/',
@@ -490,6 +509,26 @@ class PatreonCampaignIE(PatreonBaseIE):
'thumbnail': r're:^https?://.*$',
},
'playlist_mincount': 201,
}, {
'url': 'https://www.patreon.com/c/OgSog',
'info_dict': {
'id': '8504388',
'title': 'OGSoG',
'description': r're:(?s)Hello and welcome to our Patreon page. We are Mari, Lasercorn, .+',
'channel': 'OGSoG',
'channel_id': '8504388',
'channel_url': 'https://www.patreon.com/OgSog',
'uploader_url': 'https://www.patreon.com/OgSog',
'uploader_id': '72323575',
'uploader': 'David Moss',
'thumbnail': r're:https?://.+/.+',
'channel_follower_count': int,
'age_limit': 0,
},
'playlist_mincount': 331,
}, {
'url': 'https://www.patreon.com/c/OgSog/posts',
'only_matching': True,
}, {
'url': 'https://www.patreon.com/dissonancepod/posts',
'only_matching': True,

122
yt_dlp/extractor/pialive.py Normal file
View File

@@ -0,0 +1,122 @@
from .common import InfoExtractor
from ..utils import (
ExtractorError,
clean_html,
extract_attributes,
get_element_by_class,
get_element_html_by_class,
multipart_encode,
str_or_none,
unified_timestamp,
url_or_none,
)
from ..utils.traversal import traverse_obj
class PiaLiveIE(InfoExtractor):
_VALID_URL = r'https?://player\.pia-live\.jp/stream/(?P<id>[\w-]+)'
_PLAYER_ROOT_URL = 'https://player.pia-live.jp/'
_PIA_LIVE_API_URL = 'https://api.pia-live.jp'
_API_KEY = 'kfds)FKFps-dms9e'
_TESTS = [{
'url': 'https://player.pia-live.jp/stream/4JagFBEIM14s_hK9aXHKf3k3F3bY5eoHFQxu68TC6krUDqGOwN4d61dCWQYOd6CTxl4hjya9dsfEZGsM4uGOUdax60lEI4twsXGXf7crmz8Gk__GhupTrWxA7RFRVt76',
'info_dict': {
'id': '88f3109a-f503-4d0f-a9f7-9f39ac745d84',
'display_id': '2431867_001',
'title': 'こながめでたい日2024の視聴ページ | PIA LIVE STREAM(ぴあライブストリーム)',
'live_status': 'was_live',
'comment_count': int,
},
'params': {
'getcomments': True,
'skip_download': True,
'ignore_no_formats_error': True,
},
'skip': 'The video is no longer available',
}, {
'url': 'https://player.pia-live.jp/stream/4JagFBEIM14s_hK9aXHKf3k3F3bY5eoHFQxu68TC6krJdu0GVBVbVy01IwpJ6J3qBEm3d9TCTt1d0eWpsZGj7DrOjVOmS7GAWGwyscMgiThopJvzgWC4H5b-7XQjAfRZ',
'info_dict': {
'id': '9ce8b8ba-f6d1-4d1f-83a0-18c3148ded93',
'display_id': '2431867_002',
'title': 'こながめでたい日2024の視聴ページ | PIA LIVE STREAM(ぴあライブストリーム)',
'live_status': 'was_live',
'comment_count': int,
},
'params': {
'getcomments': True,
'skip_download': True,
'ignore_no_formats_error': True,
},
'skip': 'The video is no longer available',
}]
def _extract_var(self, variable, html):
return self._search_regex(
rf'(?:var|const|let)\s+{variable}\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
html, f'variable {variable}', group='value')
def _real_extract(self, url):
video_key = self._match_id(url)
webpage = self._download_webpage(url, video_key)
program_code = self._extract_var('programCode', webpage)
article_code = self._extract_var('articleCode', webpage)
title = self._html_extract_title(webpage)
if get_element_html_by_class('play-end', webpage):
raise ExtractorError('The video is no longer available', expected=True, video_id=program_code)
if start_info := clean_html(get_element_by_class('play-waiting__date', webpage)):
date, time = self._search_regex(
r'(?P<date>\d{4}/\d{1,2}/\d{1,2})\([月火水木金土日]\)(?P<time>\d{2}:\d{2})',
start_info, 'start_info', fatal=False, group=('date', 'time'))
if date and time:
release_timestamp_str = f'{date} {time} +09:00'
release_timestamp = unified_timestamp(release_timestamp_str)
self.raise_no_formats(f'The video will be available after {release_timestamp_str}', expected=True)
return {
'id': program_code,
'title': title,
'live_status': 'is_upcoming',
'release_timestamp': release_timestamp,
}
payload, content_type = multipart_encode({
'play_url': video_key,
'api_key': self._API_KEY,
})
api_data_and_headers = {
'data': payload,
'headers': {'Content-Type': content_type, 'Referer': self._PLAYER_ROOT_URL},
}
player_tag_list = self._download_json(
f'{self._PIA_LIVE_API_URL}/perf/player-tag-list/{program_code}', program_code,
'Fetching player tag list', 'Unable to fetch player tag list', **api_data_and_headers)
return self.url_result(
extract_attributes(player_tag_list['data']['movie_one_tag'])['src'],
url_transparent=True, title=title, display_id=program_code,
__post_extractor=self.extract_comments(program_code, article_code, api_data_and_headers))
def _get_comments(self, program_code, article_code, api_data_and_headers):
chat_room_url = traverse_obj(self._download_json(
f'{self._PIA_LIVE_API_URL}/perf/chat-tag-list/{program_code}/{article_code}', program_code,
'Fetching chat info', 'Unable to fetch chat info', fatal=False, **api_data_and_headers),
('data', 'chat_one_tag', {extract_attributes}, 'src', {url_or_none}))
if not chat_room_url:
return
comment_page = self._download_webpage(
chat_room_url, program_code, 'Fetching comment page', 'Unable to fetch comment page',
fatal=False, headers={'Referer': self._PLAYER_ROOT_URL})
if not comment_page:
return
yield from traverse_obj(self._search_json(
r'var\s+_history\s*=', comment_page, 'comment list',
program_code, contains_pattern=r'\[(?s:.+)\]', fatal=False), (..., {
'timestamp': (0, {int}),
'author_is_uploader': (1, {lambda x: x == 2}),
'author': (2, {str}),
'text': (3, {str}),
'id': (4, {str_or_none}),
}))

View File

@@ -1,70 +0,0 @@
from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
parse_qs,
time_seconds,
traverse_obj,
)
class PIAULIZAPortalIE(InfoExtractor):
IE_DESC = 'ulizaportal.jp - PIA LIVE STREAM'
_VALID_URL = r'https?://(?:www\.)?ulizaportal\.jp/pages/(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})'
_TESTS = [{
'url': 'https://ulizaportal.jp/pages/005f18b7-e810-5618-cb82-0987c5755d44',
'info_dict': {
'id': '005f18b7-e810-5618-cb82-0987c5755d44',
'title': 'プレゼンテーションプレイヤーのサンプル',
'live_status': 'not_live',
},
'params': {
'skip_download': True,
'ignore_no_formats_error': True,
},
}, {
'url': 'https://ulizaportal.jp/pages/005e1b23-fe93-5780-19a0-98e917cc4b7d?expires=4102412400&signature=f422a993b683e1068f946caf406d211c17d1ef17da8bef3df4a519502155aa91&version=1',
'info_dict': {
'id': '005e1b23-fe93-5780-19a0-98e917cc4b7d',
'title': '【確認用】視聴サンプルページULIZA',
'live_status': 'not_live',
},
'params': {
'skip_download': True,
'ignore_no_formats_error': True,
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
expires = int_or_none(traverse_obj(parse_qs(url), ('expires', 0)))
if expires and expires <= time_seconds():
raise ExtractorError('The link is expired.', video_id=video_id, expected=True)
webpage = self._download_webpage(url, video_id)
player_data = self._download_webpage(
self._search_regex(
r'<script [^>]*\bsrc="(https://player-api\.p\.uliza\.jp/v1/players/[^"]+)"',
webpage, 'player data url'),
video_id, headers={'Referer': 'https://ulizaportal.jp/'},
note='Fetching player data', errnote='Unable to fetch player data')
formats = self._extract_m3u8_formats(
self._search_regex(
r'["\'](https://vms-api\.p\.uliza\.jp/v1/prog-index\.m3u8[^"\']+)', player_data,
'm3u8 url', default=None),
video_id, fatal=False)
m3u8_type = self._search_regex(
r'/hls/(dvr|video)/', traverse_obj(formats, (0, 'url')), 'm3u8 type', default=None)
return {
'id': video_id,
'title': self._html_extract_title(webpage),
'formats': formats,
'live_status': {
'video': 'is_live',
'dvr': 'was_live', # short-term archives
}.get(m3u8_type, 'not_live'), # VOD or long-term archives
}

View File

@@ -1,4 +1,5 @@
from .common import InfoExtractor
from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
traverse_obj,
@@ -110,8 +111,8 @@ class PixivSketchUserIE(PixivSketchBaseIE):
if not traverse_obj(data, 'is_broadcasting'):
try:
self._call_api(user_id, 'users/current.json', url, 'Investigating reason for request failure')
except ExtractorError as ex:
if ex.cause and ex.cause.code == 401:
except ExtractorError as e:
if isinstance(e.cause, HTTPError) and e.cause.status == 401:
self.raise_login_required(f'Please log in, or use direct link like https://sketch.pixiv.net/@{user_id}/1234567890', method='cookies')
raise ExtractorError('This user is offline', expected=True)

130
yt_dlp/extractor/plvideo.py Normal file
View File

@@ -0,0 +1,130 @@
from .common import InfoExtractor
from ..utils import (
float_or_none,
int_or_none,
parse_iso8601,
parse_resolution,
url_or_none,
)
from ..utils.traversal import traverse_obj
class PlVideoIE(InfoExtractor):
IE_DESC = 'Платформа'
_VALID_URL = r'https?://(?:www\.)?plvideo\.ru/(?:watch\?(?:[^#]+&)?v=|shorts/)(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://plvideo.ru/watch?v=Y5JzUzkcQTMK',
'md5': 'fe8e18aca892b3b31f3bf492169f8a26',
'info_dict': {
'id': 'Y5JzUzkcQTMK',
'ext': 'mp4',
'thumbnail': 'https://img.plvideo.ru/images/fp-2024-images/v/cover/37/dd/37dd00a4c96c77436ab737e85947abd7/original663a4a3bb713e5.33151959.jpg',
'title': 'Presidente de Cuba llega a Moscú en una visita de trabajo',
'channel': 'RT en Español',
'channel_id': 'ZH4EKqunVDvo',
'media_type': 'video',
'comment_count': int,
'tags': ['rusia', 'cuba', 'russia', 'miguel díaz-canel'],
'description': 'md5:a1a395d900d77a86542a91ee0826c115',
'released_timestamp': 1715096124,
'channel_is_verified': True,
'like_count': int,
'timestamp': 1715095911,
'duration': 44320,
'view_count': int,
'dislike_count': int,
'upload_date': '20240507',
'modified_date': '20240701',
'channel_follower_count': int,
'modified_timestamp': 1719824073,
},
}, {
'url': 'https://plvideo.ru/shorts/S3Uo9c-VLwFX',
'md5': '7d8fa2279406c69d2fd2a6fc548a9805',
'info_dict': {
'id': 'S3Uo9c-VLwFX',
'ext': 'mp4',
'channel': 'Romaatom',
'tags': 'count:22',
'dislike_count': int,
'upload_date': '20241130',
'description': 'md5:452e6de219bf2f32bb95806c51c3b364',
'duration': 58433,
'modified_date': '20241130',
'thumbnail': 'https://img.plvideo.ru/images/fp-2024-11-cover/S3Uo9c-VLwFX/f9318999-a941-482b-b700-2102a7049366.jpg',
'media_type': 'shorts',
'like_count': int,
'modified_timestamp': 1732961458,
'channel_is_verified': True,
'channel_id': 'erJyyTIbmUd1',
'timestamp': 1732961355,
'comment_count': int,
'title': 'Белоусов отменил приказы о кадровом резерве на гражданской службе',
'channel_follower_count': int,
'view_count': int,
'released_timestamp': 1732961458,
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
video_data = self._download_json(
f'https://api.g1.plvideo.ru/v1/videos/{video_id}?Aud=18', video_id)
is_live = False
formats = []
subtitles = {}
automatic_captions = {}
for quality, data in traverse_obj(video_data, ('item', 'profiles', {dict.items}, lambda _, v: url_or_none(v[1]['hls']))):
formats.append({
'format_id': quality,
'ext': 'mp4',
'protocol': 'm3u8_native',
**traverse_obj(data, {
'url': 'hls',
'fps': ('fps', {float_or_none}),
'aspect_ratio': ('aspectRatio', {float_or_none}),
}),
**parse_resolution(quality),
})
if livestream_url := traverse_obj(video_data, ('item', 'livestream', 'url', {url_or_none})):
is_live = True
formats.extend(self._extract_m3u8_formats(livestream_url, video_id, 'mp4', live=True))
for lang, url in traverse_obj(video_data, ('item', 'subtitles', {dict.items}, lambda _, v: url_or_none(v[1]))):
if lang.endswith('-auto'):
automatic_captions.setdefault(lang[:-5], []).append({
'url': url,
})
else:
subtitles.setdefault(lang, []).append({
'url': url,
})
return {
'id': video_id,
'formats': formats,
'subtitles': subtitles,
'automatic_captions': automatic_captions,
'is_live': is_live,
**traverse_obj(video_data, ('item', {
'id': ('id', {str}),
'title': ('title', {str}),
'description': ('description', {str}),
'thumbnail': ('cover', 'paths', 'original', 'src', {url_or_none}),
'duration': ('uploadFile', 'videoDuration', {int_or_none}),
'channel': ('channel', 'name', {str}),
'channel_id': ('channel', 'id', {str}),
'channel_follower_count': ('channel', 'stats', 'subscribers', {int_or_none}),
'channel_is_verified': ('channel', 'verified', {bool}),
'tags': ('tags', ..., {str}),
'timestamp': ('createdAt', {parse_iso8601}),
'released_timestamp': ('publishedAt', {parse_iso8601}),
'modified_timestamp': ('updatedAt', {parse_iso8601}),
'view_count': ('stats', 'viewTotalCount', {int_or_none}),
'like_count': ('stats', 'likeCount', {int_or_none}),
'dislike_count': ('stats', 'dislikeCount', {int_or_none}),
'comment_count': ('stats', 'commentCount', {int_or_none}),
'media_type': ('type', {str}),
})),
}

View File

@@ -1,136 +0,0 @@
from .common import InfoExtractor
from ..utils import (
ExtractorError,
extract_attributes,
int_or_none,
js_to_json,
merge_dicts,
)
class PokemonIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?pokemon\.com/[a-z]{2}(?:.*?play=(?P<id>[a-z0-9]{32})|/(?:[^/]+/)+(?P<display_id>[^/?#&]+))'
_TESTS = [{
'url': 'https://www.pokemon.com/us/pokemon-episodes/20_30-the-ol-raise-and-switch/',
'md5': '2fe8eaec69768b25ef898cda9c43062e',
'info_dict': {
'id': 'afe22e30f01c41f49d4f1d9eab5cd9a4',
'ext': 'mp4',
'title': 'The Ol Raise and Switch!',
'description': 'md5:7db77f7107f98ba88401d3adc80ff7af',
},
'add_id': ['LimelightMedia'],
}, {
# no data-video-title
'url': 'https://www.pokemon.com/fr/episodes-pokemon/films-pokemon/pokemon-lascension-de-darkrai-2008',
'info_dict': {
'id': 'dfbaf830d7e54e179837c50c0c6cc0e1',
'ext': 'mp4',
'title': "Pokémon : L'ascension de Darkrai",
'description': 'md5:d1dbc9e206070c3e14a06ff557659fb5',
},
'add_id': ['LimelightMedia'],
'params': {
'skip_download': True,
},
}, {
'url': 'http://www.pokemon.com/uk/pokemon-episodes/?play=2e8b5c761f1d4a9286165d7748c1ece2',
'only_matching': True,
}, {
'url': 'http://www.pokemon.com/fr/episodes-pokemon/18_09-un-hiver-inattendu/',
'only_matching': True,
}, {
'url': 'http://www.pokemon.com/de/pokemon-folgen/01_20-bye-bye-smettbo/',
'only_matching': True,
}]
def _real_extract(self, url):
video_id, display_id = self._match_valid_url(url).groups()
webpage = self._download_webpage(url, video_id or display_id)
video_data = extract_attributes(self._search_regex(
r'(<[^>]+data-video-id="{}"[^>]*>)'.format(video_id if video_id else '[a-z0-9]{32}'),
webpage, 'video data element'))
video_id = video_data['data-video-id']
title = video_data.get('data-video-title') or self._html_search_meta(
'pkm-title', webpage, ' title', default=None) or self._search_regex(
r'<h1[^>]+\bclass=["\']us-title[^>]+>([^<]+)', webpage, 'title')
return {
'_type': 'url_transparent',
'id': video_id,
'url': f'limelight:media:{video_id}',
'title': title,
'description': video_data.get('data-video-summary'),
'thumbnail': video_data.get('data-video-poster'),
'series': 'Pokémon',
'season_number': int_or_none(video_data.get('data-video-season')),
'episode': title,
'episode_number': int_or_none(video_data.get('data-video-episode')),
'ie_key': 'LimelightMedia',
}
class PokemonWatchIE(InfoExtractor):
_VALID_URL = r'https?://watch\.pokemon\.com/[a-z]{2}-[a-z]{2}/(?:#/)?player(?:\.html)?\?id=(?P<id>[a-z0-9]{32})'
_API_URL = 'https://www.pokemon.com/api/pokemontv/v2/channels/{0:}'
_TESTS = [{
'url': 'https://watch.pokemon.com/en-us/player.html?id=8309a40969894a8e8d5bc1311e9c5667',
'md5': '62833938a31e61ab49ada92f524c42ff',
'info_dict': {
'id': '8309a40969894a8e8d5bc1311e9c5667',
'ext': 'mp4',
'title': 'Lillier and the Staff!',
'description': 'md5:338841b8c21b283d24bdc9b568849f04',
},
}, {
'url': 'https://watch.pokemon.com/en-us/#/player?id=3fe7752ba09141f0b0f7756d1981c6b2',
'only_matching': True,
}, {
'url': 'https://watch.pokemon.com/de-de/player.html?id=b3c402e111a4459eb47e12160ab0ba07',
'only_matching': True,
}]
def _extract_media(self, channel_array, video_id):
for channel in channel_array:
for media in channel.get('media'):
if media.get('id') == video_id:
return media
return None
def _real_extract(self, url):
video_id = self._match_id(url)
info = {
'_type': 'url',
'id': video_id,
'url': f'limelight:media:{video_id}',
'ie_key': 'LimelightMedia',
}
# API call can be avoided entirely if we are listing formats
if self.get_param('listformats', False):
return info
webpage = self._download_webpage(url, video_id)
build_vars = self._parse_json(self._search_regex(
r'(?s)buildVars\s*=\s*({.*?})', webpage, 'build vars'),
video_id, transform_source=js_to_json)
region = build_vars.get('region')
channel_array = self._download_json(self._API_URL.format(region), video_id)
video_data = self._extract_media(channel_array, video_id)
if video_data is None:
raise ExtractorError(
f'Video {video_id} does not exist', expected=True)
info['_type'] = 'url_transparent'
images = video_data.get('images')
return merge_dicts(info, {
'title': video_data.get('title'),
'description': video_data.get('description'),
'thumbnail': images.get('medium') or images.get('small'),
'series': 'Pokémon',
'season_number': int_or_none(video_data.get('season')),
'episode': video_data.get('title'),
'episode_number': int_or_none(video_data.get('episode')),
})

View File

@@ -1,5 +1,5 @@
from .common import InfoExtractor
from ..compat import functools
from ..utils import (
int_or_none,
parse_duration,
@@ -104,7 +104,7 @@ class PornboxIE(InfoExtractor):
get_quality = qualities(['web', 'vga', 'hd', '1080p', '4k', '8k'])
metadata['formats'] = traverse_obj(stream_data, ('qualities', lambda _, v: v['src'], {
'url': 'src',
'vbr': ('bitrate', {functools.partial(int_or_none, scale=1000)}),
'vbr': ('bitrate', {int_or_none(scale=1000)}),
'format_id': ('quality', {str_or_none}),
'quality': ('quality', {get_quality}),
'width': ('size', {lambda x: int(x[:-1])}),

View File

@@ -198,6 +198,6 @@ class Pr0grammIE(InfoExtractor):
'dislike_count': ('down', {int}),
'timestamp': ('created', {int}),
'upload_date': ('created', {int}, {dt.date.fromtimestamp}, {lambda x: x.strftime('%Y%m%d')}),
'thumbnail': ('thumb', {lambda x: urljoin('https://thumb.pr0gramm.com', x)}),
'thumbnail': ('thumb', {urljoin('https://thumb.pr0gramm.com')}),
}),
}

View File

@@ -140,7 +140,7 @@ class QDanceIE(InfoExtractor):
'description': ('description', {str.strip}),
'display_id': ('slug', {str}),
'thumbnail': ('thumbnail', {url_or_none}),
'duration': ('durationInSeconds', {int_or_none}, {lambda x: x or None}),
'duration': ('durationInSeconds', {int_or_none}, filter),
'availability': ('subscription', 'level', {extract_availability}),
'is_live': ('type', {lambda x: x.lower() == 'live'}),
'artist': ('acts', ..., {str}),

View File

@@ -211,10 +211,10 @@ class QQMusicIE(QQMusicBaseIE):
'formats': formats,
**traverse_obj(info_data, {
'title': ('title', {str}),
'album': ('album', 'title', {str}, {lambda x: x or None}),
'album': ('album', 'title', {str}, filter),
'release_date': ('time_public', {lambda x: x.replace('-', '') or None}),
'creators': ('singer', ..., 'name', {str}),
'alt_title': ('subtitle', {str}, {lambda x: x or None}),
'alt_title': ('subtitle', {str}, filter),
'duration': ('interval', {int_or_none}),
}),
**traverse_obj(init_data, ('detail', {

View File

@@ -0,0 +1,105 @@
from .common import InfoExtractor
from ..utils import url_or_none
from ..utils.traversal import traverse_obj
class RadioRadicaleIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?radioradicale\.it/scheda/(?P<id>[0-9]+)'
_TESTS = [{
'url': 'https://www.radioradicale.it/scheda/471591',
'md5': 'eb0fbe43a601f1a361cbd00f3c45af4a',
'info_dict': {
'id': '471591',
'ext': 'mp4',
'title': 'md5:e8fbb8de57011a3255db0beca69af73d',
'description': 'md5:5e15a789a2fe4d67da8d1366996e89ef',
'location': 'Napoli',
'duration': 2852.0,
'timestamp': 1459987200,
'upload_date': '20160407',
'thumbnail': 'https://www.radioradicale.it/photo400/0/0/9/0/1/00901768.jpg',
},
}, {
'url': 'https://www.radioradicale.it/scheda/742783/parlamento-riunito-in-seduta-comune-11a-della-xix-legislatura',
'info_dict': {
'id': '742783',
'title': 'Parlamento riunito in seduta comune (11ª della XIX legislatura)',
'description': '-) Votazione per l\'elezione di un giudice della Corte Costituzionale (nono scrutinio)',
'location': 'CAMERA',
'duration': 5868.0,
'timestamp': 1730246400,
'upload_date': '20241030',
},
'playlist': [{
'md5': 'aa48de55dcc45478e4cd200f299aab7d',
'info_dict': {
'id': '742783-0',
'ext': 'mp4',
'title': 'Parlamento riunito in seduta comune (11ª della XIX legislatura)',
},
}, {
'md5': 'be915c189c70ad2920e5810f32260ff5',
'info_dict': {
'id': '742783-1',
'ext': 'mp4',
'title': 'Parlamento riunito in seduta comune (11ª della XIX legislatura)',
},
}, {
'md5': 'f0ee4047342baf8ed3128a8417ac5e0a',
'info_dict': {
'id': '742783-2',
'ext': 'mp4',
'title': 'Parlamento riunito in seduta comune (11ª della XIX legislatura)',
},
}],
}]
def _entries(self, videos_info, page_id):
for idx, video in enumerate(traverse_obj(
videos_info, ('playlist', lambda _, v: v['sources']))):
video_id = f'{page_id}-{idx}'
formats = []
subtitles = {}
for m3u8_url in traverse_obj(video, ('sources', ..., 'src', {url_or_none})):
fmts, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
for sub in traverse_obj(video, ('subtitles', ..., lambda _, v: url_or_none(v['src']))):
self._merge_subtitles({sub.get('srclang') or 'und': [{
'url': sub['src'],
'name': sub.get('label'),
}]}, target=subtitles)
yield {
'id': video_id,
'title': video.get('title'),
'formats': formats,
'subtitles': subtitles,
}
def _real_extract(self, url):
page_id = self._match_id(url)
webpage = self._download_webpage(url, page_id)
videos_info = self._search_json(
r'jQuery\.extend\(Drupal\.settings\s*,',
webpage, 'videos_info', page_id)['RRscheda']
entries = list(self._entries(videos_info, page_id))
common_info = {
'id': page_id,
'title': self._og_search_title(webpage),
'description': self._og_search_description(webpage),
'location': videos_info.get('luogo'),
**self._search_json_ld(webpage, page_id),
}
if len(entries) == 1:
return {
**entries[0],
**common_info,
}
return self.playlist_result(entries, multi_video=True, **common_info)

View File

@@ -259,6 +259,8 @@ class RedditIE(InfoExtractor):
f'https://www.reddit.com/{slug}/.json', video_id, expected_status=403)
except ExtractorError as e:
if isinstance(e.cause, json.JSONDecodeError):
if self._get_cookies('https://www.reddit.com/').get('reddit_session'):
raise ExtractorError('Your IP address is unable to access the Reddit API', expected=True)
self.raise_login_required('Account authentication is required')
raise

View File

@@ -1,4 +1,3 @@
import functools
from .common import InfoExtractor
from ..networking import HEADRequest
@@ -118,7 +117,7 @@ class RedCDNLivxIE(InfoExtractor):
time_scale = traverse_obj(ism_doc, ('@TimeScale', {int_or_none})) or 10000000
duration = traverse_obj(
ism_doc, ('@Duration', {functools.partial(float_or_none, scale=time_scale)})) or None
ism_doc, ('@Duration', {float_or_none(scale=time_scale)})) or None
live_status = None
if traverse_obj(ism_doc, '@IsLive') == 'TRUE':

View File

@@ -213,7 +213,7 @@ class RedGifsSearchIE(RedGifsBaseInfoExtractor):
class RedGifsUserIE(RedGifsBaseInfoExtractor):
IE_DESC = 'Redgifs user'
_VALID_URL = r'https?://(?:www\.)?redgifs\.com/users/(?P<username>[^/?#]+)(?:\?(?P<query>[^#]+))?'
_PAGE_SIZE = 30
_PAGE_SIZE = 80
_TESTS = [
{
'url': 'https://www.redgifs.com/users/lamsinka89',
@@ -222,7 +222,7 @@ class RedGifsUserIE(RedGifsBaseInfoExtractor):
'title': 'lamsinka89',
'description': 'RedGifs user lamsinka89, ordered by recent',
},
'playlist_mincount': 100,
'playlist_mincount': 391,
},
{
'url': 'https://www.redgifs.com/users/lamsinka89?page=3',
@@ -231,7 +231,7 @@ class RedGifsUserIE(RedGifsBaseInfoExtractor):
'title': 'lamsinka89',
'description': 'RedGifs user lamsinka89, ordered by recent',
},
'playlist_count': 30,
'playlist_count': 80,
},
{
'url': 'https://www.redgifs.com/users/lamsinka89?order=best&type=g',
@@ -240,7 +240,17 @@ class RedGifsUserIE(RedGifsBaseInfoExtractor):
'title': 'lamsinka89',
'description': 'RedGifs user lamsinka89, ordered by best',
},
'playlist_mincount': 100,
'playlist_mincount': 391,
},
{
'url': 'https://www.redgifs.com/users/ignored52',
'note': 'https://github.com/yt-dlp/yt-dlp/issues/7382',
'info_dict': {
'id': 'ignored52',
'title': 'ignored52',
'description': 'RedGifs user ignored52, ordered by recent',
},
'playlist_mincount': 121,
},
]

View File

@@ -187,4 +187,4 @@ class RTVSLOShowIE(InfoExtractor):
return self.playlist_from_matches(
re.findall(r'<a [^>]*\bhref="(/arhiv/[^"]+)"', webpage),
playlist_id, self._html_extract_title(webpage),
getter=lambda x: urljoin('https://365.rtvslo.si', x), ie=RTVSLOIE)
getter=urljoin('https://365.rtvslo.si'), ie=RTVSLOIE)

View File

@@ -2,15 +2,21 @@ import itertools
from .common import InfoExtractor
from ..utils import (
UnsupportedError,
bool_or_none,
determine_ext,
int_or_none,
js_to_json,
parse_qs,
traverse_obj,
str_or_none,
try_get,
unified_timestamp,
url_or_none,
)
from ..utils.traversal import (
subs_list_to_dict,
traverse_obj,
)
class RutubeBaseIE(InfoExtractor):
@@ -19,7 +25,7 @@ class RutubeBaseIE(InfoExtractor):
query = {}
query['format'] = 'json'
return self._download_json(
f'http://rutube.ru/api/video/{video_id}/',
f'https://rutube.ru/api/video/{video_id}/',
video_id, 'Downloading video JSON',
'Unable to download video JSON', query=query)
@@ -61,18 +67,21 @@ class RutubeBaseIE(InfoExtractor):
query = {}
query['format'] = 'json'
return self._download_json(
f'http://rutube.ru/api/play/options/{video_id}/',
f'https://rutube.ru/api/play/options/{video_id}/',
video_id, 'Downloading options JSON',
'Unable to download options JSON',
headers=self.geo_verification_headers(), query=query)
def _extract_formats(self, options, video_id):
def _extract_formats_and_subtitles(self, options, video_id):
formats = []
subtitles = {}
for format_id, format_url in options['video_balancer'].items():
ext = determine_ext(format_url)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
format_url, video_id, 'mp4', m3u8_id=format_id, fatal=False))
fmts, subs = self._extract_m3u8_formats_and_subtitles(
format_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
elif ext == 'f4m':
formats.extend(self._extract_f4m_formats(
format_url, video_id, f4m_id=format_id, fatal=False))
@@ -82,11 +91,19 @@ class RutubeBaseIE(InfoExtractor):
'format_id': format_id,
})
for hls_url in traverse_obj(options, ('live_streams', 'hls', ..., 'url', {url_or_none})):
formats.extend(self._extract_m3u8_formats(hls_url, video_id, ext='mp4', fatal=False))
return formats
fmts, subs = self._extract_m3u8_formats_and_subtitles(
hls_url, video_id, 'mp4', fatal=False, m3u8_id='hls')
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
self._merge_subtitles(traverse_obj(options, ('captions', ..., {
'id': 'code',
'url': 'file',
'name': ('langTitle', {str}),
}, all, {subs_list_to_dict(lang='ru')})), target=subtitles)
return formats, subtitles
def _download_and_extract_formats(self, video_id, query=None):
return self._extract_formats(
def _download_and_extract_formats_and_subtitles(self, video_id, query=None):
return self._extract_formats_and_subtitles(
self._download_api_options(video_id, query=query), video_id)
@@ -97,8 +114,8 @@ class RutubeIE(RutubeBaseIE):
_EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//rutube\.ru/(?:play/)?embed/[\da-z]{32}.*?)\1']
_TESTS = [{
'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/',
'md5': 'e33ac625efca66aba86cbec9851f2692',
'url': 'https://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/',
'md5': '3d73fdfe5bb81b9aef139e22ef3de26a',
'info_dict': {
'id': '3eac3b4561676c17df9132a9a1e62e3e',
'ext': 'mp4',
@@ -111,26 +128,25 @@ class RutubeIE(RutubeBaseIE):
'upload_date': '20131016',
'age_limit': 0,
'view_count': int,
'thumbnail': 'http://pic.rutubelist.ru/video/d2/a0/d2a0aec998494a396deafc7ba2c82add.jpg',
'thumbnail': 'https://pic.rutubelist.ru/video/d2/a0/d2a0aec998494a396deafc7ba2c82add.jpg',
'categories': ['Новости и СМИ'],
'chapters': [],
},
'expected_warnings': ['Unable to download f4m'],
}, {
'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661',
'url': 'https://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661',
'only_matching': True,
}, {
'url': 'http://rutube.ru/embed/a10e53b86e8f349080f718582ce4c661',
'url': 'https://rutube.ru/embed/a10e53b86e8f349080f718582ce4c661',
'only_matching': True,
}, {
'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/?pl_id=4252',
'url': 'https://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/?pl_id=4252',
'only_matching': True,
}, {
'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_type=source',
'only_matching': True,
}, {
'url': 'https://rutube.ru/video/private/884fb55f07a97ab673c7d654553e0f48/?p=x2QojCumHTS3rsKHWXN8Lg',
'md5': 'd106225f15d625538fe22971158e896f',
'md5': '4fce7b4fcc7b1bcaa3f45eb1e1ad0dd7',
'info_dict': {
'id': '884fb55f07a97ab673c7d654553e0f48',
'ext': 'mp4',
@@ -143,11 +159,10 @@ class RutubeIE(RutubeBaseIE):
'upload_date': '20221210',
'age_limit': 0,
'view_count': int,
'thumbnail': 'http://pic.rutubelist.ru/video/f2/d4/f2d42b54be0a6e69c1c22539e3152156.jpg',
'thumbnail': 'https://pic.rutubelist.ru/video/f2/d4/f2d42b54be0a6e69c1c22539e3152156.jpg',
'categories': ['Видеоигры'],
'chapters': [],
},
'expected_warnings': ['Unable to download f4m'],
}, {
'url': 'https://rutube.ru/video/c65b465ad0c98c89f3b25cb03dcc87c6/',
'info_dict': {
@@ -156,17 +171,16 @@ class RutubeIE(RutubeBaseIE):
'chapters': 'count:4',
'categories': ['Бизнес и предпринимательство'],
'description': 'md5:252feac1305257d8c1bab215cedde75d',
'thumbnail': 'http://pic.rutubelist.ru/video/71/8f/718f27425ea9706073eb80883dd3787b.png',
'thumbnail': 'https://pic.rutubelist.ru/video/71/8f/718f27425ea9706073eb80883dd3787b.png',
'duration': 782,
'age_limit': 0,
'uploader_id': '23491359',
'timestamp': 1677153329,
'view_count': int,
'upload_date': '20230223',
'title': 'Бизнес с нуля: найм сотрудников. Интервью с директором строительной компании',
'title': 'Бизнес с нуля: найм сотрудников. Интервью с директором строительной компании #1',
'uploader': 'Стас Быков',
},
'expected_warnings': ['Unable to download f4m'],
}, {
'url': 'https://rutube.ru/live/video/c58f502c7bb34a8fcdd976b221fca292/',
'info_dict': {
@@ -174,7 +188,7 @@ class RutubeIE(RutubeBaseIE):
'ext': 'mp4',
'categories': ['Телепередачи'],
'description': '',
'thumbnail': 'http://pic.rutubelist.ru/video/14/19/14190807c0c48b40361aca93ad0867c7.jpg',
'thumbnail': 'https://pic.rutubelist.ru/video/14/19/14190807c0c48b40361aca93ad0867c7.jpg',
'live_status': 'is_live',
'age_limit': 0,
'uploader_id': '23460655',
@@ -184,6 +198,24 @@ class RutubeIE(RutubeBaseIE):
'title': r're:Первый канал. Прямой эфир \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
'uploader': 'Первый канал',
},
}, {
'url': 'https://rutube.ru/play/embed/03a9cb54bac3376af4c5cb0f18444e01/',
'info_dict': {
'id': '03a9cb54bac3376af4c5cb0f18444e01',
'ext': 'mp4',
'age_limit': 0,
'description': '',
'title': 'Церемония начала торгов акциями ПАО «ЕвроТранс»',
'chapters': [],
'upload_date': '20240829',
'duration': 293,
'uploader': 'MOEX - Московская биржа',
'timestamp': 1724946628,
'thumbnail': 'https://pic.rutubelist.ru/video/2e/24/2e241fddb459baf0fa54acfca44874f4.jpg',
'view_count': int,
'uploader_id': '38420507',
'categories': ['Интервью'],
},
}, {
'url': 'https://rutube.ru/video/5ab908fccfac5bb43ef2b1e4182256b0/',
'only_matching': True,
@@ -192,40 +224,46 @@ class RutubeIE(RutubeBaseIE):
'only_matching': True,
}]
@classmethod
def suitable(cls, url):
return False if RutubePlaylistIE.suitable(url) else super().suitable(url)
def _real_extract(self, url):
video_id = self._match_id(url)
query = parse_qs(url)
info = self._download_and_extract_info(video_id, query)
info['formats'] = self._download_and_extract_formats(video_id, query)
return info
formats, subtitles = self._download_and_extract_formats_and_subtitles(video_id, query)
return {
**info,
'formats': formats,
'subtitles': subtitles,
}
class RutubeEmbedIE(RutubeBaseIE):
IE_NAME = 'rutube:embed'
IE_DESC = 'Rutube embedded videos'
_VALID_URL = r'https?://rutube\.ru/(?:video|play)/embed/(?P<id>[0-9]+)'
_VALID_URL = r'https?://rutube\.ru/(?:video|play)/embed/(?P<id>[0-9]+)(?:[?#/]|$)'
_TESTS = [{
'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=',
'url': 'https://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=',
'info_dict': {
'id': 'a10e53b86e8f349080f718582ce4c661',
'ext': 'mp4',
'timestamp': 1387830582,
'upload_date': '20131223',
'uploader_id': '297833',
'description': 'Видео группы ★http://vk.com/foxkidsreset★ музей Fox Kids и Jetix<br/><br/> восстановлено и сделано в шикоформате subziro89 http://vk.com/subziro89',
'uploader': 'subziro89 ILya',
'title': 'Мистический городок Эйри в Индиан 5 серия озвучка subziro89',
'age_limit': 0,
'duration': 1395,
'chapters': [],
'description': 'md5:a5acea57bbc3ccdc3cacd1f11a014b5b',
'view_count': int,
'thumbnail': 'https://pic.rutubelist.ru/video/d3/03/d3031f4670a6e6170d88fb3607948418.jpg',
'categories': ['Сериалы'],
},
'params': {
'skip_download': True,
},
}, {
'url': 'http://rutube.ru/play/embed/8083783',
'url': 'https://rutube.ru/play/embed/8083783',
'only_matching': True,
}, {
# private video
@@ -240,11 +278,12 @@ class RutubeEmbedIE(RutubeBaseIE):
query = parse_qs(url)
options = self._download_api_options(embed_id, query)
video_id = options['effective_video']
formats = self._extract_formats(options, video_id)
formats, subtitles = self._extract_formats_and_subtitles(options, video_id)
info = self._download_and_extract_info(video_id, query)
info.update({
'extractor_key': 'Rutube',
'formats': formats,
'subtitles': subtitles,
})
return info
@@ -295,14 +334,14 @@ class RutubeTagsIE(RutubePlaylistBaseIE):
IE_DESC = 'Rutube tags'
_VALID_URL = r'https?://rutube\.ru/tags/video/(?P<id>\d+)'
_TESTS = [{
'url': 'http://rutube.ru/tags/video/1800/',
'url': 'https://rutube.ru/tags/video/1800/',
'info_dict': {
'id': '1800',
},
'playlist_mincount': 68,
}]
_PAGE_TEMPLATE = 'http://rutube.ru/api/tags/video/%s/?page=%s&format=json'
_PAGE_TEMPLATE = 'https://rutube.ru/api/tags/video/%s/?page=%s&format=json'
class RutubeMovieIE(RutubePlaylistBaseIE):
@@ -310,8 +349,8 @@ class RutubeMovieIE(RutubePlaylistBaseIE):
IE_DESC = 'Rutube movies'
_VALID_URL = r'https?://rutube\.ru/metainfo/tv/(?P<id>\d+)'
_MOVIE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/?format=json'
_PAGE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/video?page=%s&format=json'
_MOVIE_TEMPLATE = 'https://rutube.ru/api/metainfo/tv/%s/?format=json'
_PAGE_TEMPLATE = 'https://rutube.ru/api/metainfo/tv/%s/video?page=%s&format=json'
def _real_extract(self, url):
movie_id = self._match_id(url)
@@ -327,62 +366,82 @@ class RutubePersonIE(RutubePlaylistBaseIE):
IE_DESC = 'Rutube person videos'
_VALID_URL = r'https?://rutube\.ru/video/person/(?P<id>\d+)'
_TESTS = [{
'url': 'http://rutube.ru/video/person/313878/',
'url': 'https://rutube.ru/video/person/313878/',
'info_dict': {
'id': '313878',
},
'playlist_mincount': 37,
'playlist_mincount': 36,
}]
_PAGE_TEMPLATE = 'http://rutube.ru/api/video/person/%s/?page=%s&format=json'
_PAGE_TEMPLATE = 'https://rutube.ru/api/video/person/%s/?page=%s&format=json'
class RutubePlaylistIE(RutubePlaylistBaseIE):
IE_NAME = 'rutube:playlist'
IE_DESC = 'Rutube playlists'
_VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/[\da-z]{32}/\?.*?\bpl_id=(?P<id>\d+)'
_VALID_URL = r'https?://rutube\.ru/plst/(?P<id>\d+)'
_TESTS = [{
'url': 'https://rutube.ru/video/cecd58ed7d531fc0f3d795d51cee9026/?pl_id=3097&pl_type=tag',
'url': 'https://rutube.ru/plst/308547/',
'info_dict': {
'id': '3097',
'id': '308547',
},
'playlist_count': 27,
}, {
'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_id=4252&pl_type=source',
'only_matching': True,
'playlist_mincount': 22,
}]
_PAGE_TEMPLATE = 'http://rutube.ru/api/playlist/%s/%s/?page=%s&format=json'
@classmethod
def suitable(cls, url):
from ..utils import int_or_none, parse_qs
if not super().suitable(url):
return False
params = parse_qs(url)
return params.get('pl_type', [None])[0] and int_or_none(params.get('pl_id', [None])[0])
def _next_page_url(self, page_num, playlist_id, item_kind):
return self._PAGE_TEMPLATE % (item_kind, playlist_id, page_num)
def _real_extract(self, url):
qs = parse_qs(url)
playlist_kind = qs['pl_type'][0]
playlist_id = qs['pl_id'][0]
return self._extract_playlist(playlist_id, item_kind=playlist_kind)
_PAGE_TEMPLATE = 'https://rutube.ru/api/playlist/custom/%s/videos?page=%s&format=json'
class RutubeChannelIE(RutubePlaylistBaseIE):
IE_NAME = 'rutube:channel'
IE_DESC = 'Rutube channel'
_VALID_URL = r'https?://rutube\.ru/channel/(?P<id>\d+)/videos'
_VALID_URL = r'https?://rutube\.ru/(?:channel/(?P<id>\d+)|u/(?P<slug>\w+))(?:/(?P<section>videos|shorts|playlists))?'
_TESTS = [{
'url': 'https://rutube.ru/channel/639184/videos/',
'info_dict': {
'id': '639184',
'id': '639184_videos',
},
'playlist_mincount': 133,
'playlist_mincount': 129,
}, {
'url': 'https://rutube.ru/channel/25902603/shorts/',
'info_dict': {
'id': '25902603_shorts',
},
'playlist_mincount': 277,
}, {
'url': 'https://rutube.ru/channel/25902603/',
'info_dict': {
'id': '25902603',
},
'playlist_mincount': 406,
}, {
'url': 'https://rutube.ru/u/rutube/videos/',
'info_dict': {
'id': '23704195_videos',
},
'playlist_mincount': 113,
}]
_PAGE_TEMPLATE = 'http://rutube.ru/api/video/person/%s/?page=%s&format=json'
_PAGE_TEMPLATE = 'https://rutube.ru/api/video/person/%s/?page=%s&format=json&origin__type=%s'
def _next_page_url(self, page_num, playlist_id, section):
origin_type = {
'videos': 'rtb,rst,ifrm,rspa',
'shorts': 'rshorts',
None: '',
}.get(section)
return self._PAGE_TEMPLATE % (playlist_id, page_num, origin_type)
def _real_extract(self, url):
playlist_id, slug, section = self._match_valid_url(url).group('id', 'slug', 'section')
if section == 'playlists':
raise UnsupportedError(url)
if slug:
webpage = self._download_webpage(url, slug)
redux_state = self._search_json(
r'window\.reduxState\s*=', webpage, 'redux state', slug, transform_source=js_to_json)
playlist_id = traverse_obj(redux_state, (
'api', 'queries', lambda k, _: k.startswith('channelIdBySlug'),
'data', 'channel_id', {int}, {str_or_none}, any))
playlist = self._extract_playlist(playlist_id, section=section)
if section:
playlist['id'] = f'{playlist_id}_{section}'
return playlist

View File

@@ -1,11 +1,9 @@
import base64
from .common import InfoExtractor
from ..aes import aes_cbc_decrypt, unpad_pkcs7
from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7
from ..utils import (
ExtractorError,
bytes_to_intlist,
intlist_to_bytes,
unified_strdate,
)
@@ -68,10 +66,10 @@ class ShemarooMeIE(InfoExtractor):
data_json = self._download_json('https://www.shemaroome.com/users/user_all_lists', video_id, data=data.encode())
if not data_json.get('status'):
raise ExtractorError('Premium videos cannot be downloaded yet.', expected=True)
url_data = bytes_to_intlist(base64.b64decode(data_json['new_play_url']))
key = bytes_to_intlist(base64.b64decode(data_json['key']))
iv = [0] * 16
m3u8_url = unpad_pkcs7(intlist_to_bytes(aes_cbc_decrypt(url_data, key, iv))).decode('ascii')
url_data = base64.b64decode(data_json['new_play_url'])
key = base64.b64decode(data_json['key'])
iv = bytes(16)
m3u8_url = unpad_pkcs7(aes_cbc_decrypt_bytes(url_data, key, iv)).decode('ascii')
headers = {'stream_key': data_json['stream_key']}
formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, fatal=False, headers=headers)
for fmt in formats:

View File

@@ -56,13 +56,13 @@ class SnapchatSpotlightIE(InfoExtractor):
**traverse_obj(video_data, ('videoMetadata', {
'title': ('name', {str}),
'description': ('description', {str}),
'timestamp': ('uploadDateMs', {lambda x: float_or_none(x, 1000)}),
'timestamp': ('uploadDateMs', {float_or_none(scale=1000)}),
'view_count': ('viewCount', {int_or_none}, {lambda x: None if x == -1 else x}),
'repost_count': ('shareCount', {int_or_none}),
'url': ('contentUrl', {url_or_none}),
'width': ('width', {int_or_none}),
'height': ('height', {int_or_none}),
'duration': ('durationMs', {lambda x: float_or_none(x, 1000)}),
'duration': ('durationMs', {float_or_none(scale=1000)}),
'thumbnail': ('thumbnailUrl', {url_or_none}),
'uploader': ('creator', 'personCreator', 'username', {str}),
'uploader_url': ('creator', 'personCreator', 'url', {url_or_none}),

View File

@@ -199,8 +199,9 @@ class SonyLIVSeriesIE(InfoExtractor):
},
}]
_API_BASE = 'https://apiv2.sonyliv.com/AGL'
_SORT_ORDERS = ('asc', 'desc')
def _entries(self, show_id):
def _entries(self, show_id, sort_order):
headers = {
'Accept': 'application/json, text/plain, */*',
'Referer': 'https://www.sonyliv.com',
@@ -215,6 +216,9 @@ class SonyLIVSeriesIE(InfoExtractor):
'from': '0',
'to': '49',
}), ('resultObj', 'containers', 0, 'containers', lambda _, v: int_or_none(v['id'])))
if sort_order == 'desc':
seasons = reversed(seasons)
for season in seasons:
season_id = str(season['id'])
note = traverse_obj(season, ('metadata', 'title', {str})) or 'season'
@@ -226,7 +230,7 @@ class SonyLIVSeriesIE(InfoExtractor):
'from': str(cursor),
'to': str(cursor + 99),
'orderBy': 'episodeNumber',
'sortOrder': 'asc',
'sortOrder': sort_order,
}), ('resultObj', 'containers', 0, 'containers', lambda _, v: int_or_none(v['id'])))
if not episodes:
break
@@ -237,4 +241,10 @@ class SonyLIVSeriesIE(InfoExtractor):
def _real_extract(self, url):
show_id = self._match_id(url)
return self.playlist_result(self._entries(show_id), playlist_id=show_id)
sort_order = self._configuration_arg('sort_order', [self._SORT_ORDERS[0]])[0]
if sort_order not in self._SORT_ORDERS:
raise ValueError(
f'Invalid sort order "{sort_order}". Allowed values are: {", ".join(self._SORT_ORDERS)}')
return self.playlist_result(self._entries(show_id, sort_order), playlist_id=show_id)

Some files were not shown because too many files have changed in this diff Show More