mirror of https://github.com/yt-dlp/yt-dlp.git synced 2026-01-09 16:31:17 +00:00

Merge branch 'master' into yt-live-from-start-range

bashonly
2024-04-16 11:01:17 -05:00
committed by GitHub
86 changed files with 3058 additions and 1337 deletions

View File

@@ -150,6 +150,7 @@ from .arte import (
)
from .arnes import ArnesIE
from .asobichannel import AsobiChannelIE, AsobiChannelTagURLIE
from .asobistage import AsobiStageIE
from .atresplayer import AtresPlayerIE
from .atscaleconf import AtScaleConfEventIE
from .atvat import ATVAtIE
@@ -590,6 +591,7 @@ from .facebook import (
FacebookReelIE,
FacebookAdsIE,
)
from .fathom import FathomIE
from .fancode import (
FancodeVodIE,
FancodeLiveIE
@@ -874,6 +876,7 @@ from .jeuxvideo import JeuxVideoIE
from .jiosaavn import (
JioSaavnSongIE,
JioSaavnAlbumIE,
JioSaavnPlaylistIE,
)
from .jove import JoveIE
from .joj import JojIE
@@ -989,6 +992,10 @@ from .lnkgo import (
LnkGoIE,
LnkIE,
)
from .loom import (
LoomIE,
LoomFolderIE,
)
from .lovehomeporn import LoveHomePornIE
from .lrt import (
LRTVODIE,
@@ -1750,6 +1757,7 @@ from .shahid import (
ShahidIE,
ShahidShowIE,
)
from .sharepoint import SharePointIE
from .sharevideos import ShareVideosEmbedIE
from .sibnet import SibnetEmbedIE
from .shemaroome import ShemarooMeIE
@@ -2283,6 +2291,7 @@ from .vrt import (
VrtNUIE,
KetnetIE,
DagelijkseKostIE,
Radio1BeIE,
)
from .vtm import VTMIE
from .medialaan import MedialaanIE

View File

@@ -1,25 +1,65 @@
import functools
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
OnDemandPagedList,
date_from_str,
UserNotLive,
determine_ext,
filter_dict,
int_or_none,
qualities,
traverse_obj,
unified_strdate,
orderedSet,
unified_timestamp,
update_url_query,
url_or_none,
urlencode_postdata,
xpath_text,
urljoin,
)
from ..utils.traversal import traverse_obj
class AfreecaTVIE(InfoExtractor):
class AfreecaTVBaseIE(InfoExtractor):
_NETRC_MACHINE = 'afreecatv'
def _perform_login(self, username, password):
login_form = {
'szWork': 'login',
'szType': 'json',
'szUid': username,
'szPassword': password,
'isSaveId': 'false',
'szScriptVar': 'oLoginRet',
'szAction': '',
}
response = self._download_json(
'https://login.afreecatv.com/app/LoginAction.php', None,
'Logging in', data=urlencode_postdata(login_form))
_ERRORS = {
-4: 'Your account has been suspended due to a violation of our terms and policies.',
-5: 'https://member.afreecatv.com/app/user_delete_progress.php',
-6: 'https://login.afreecatv.com/membership/changeMember.php',
-8: "Hello! AfreecaTV here.\nThe username you have entered belongs to \n an account that requires a legal guardian's consent. \nIf you wish to use our services without restriction, \nplease make sure to go through the necessary verification process.",
-9: 'https://member.afreecatv.com/app/pop_login_block.php',
-11: 'https://login.afreecatv.com/afreeca/second_login.php',
-12: 'https://member.afreecatv.com/app/user_security.php',
0: 'The username does not exist or you have entered the wrong password.',
-1: 'The username does not exist or you have entered the wrong password.',
-3: 'You have entered your username/password incorrectly.',
-7: 'You cannot use your Global AfreecaTV account to access Korean AfreecaTV.',
-10: 'Sorry for the inconvenience. \nYour account has been blocked due to an unauthorized access. \nPlease contact our Help Center for assistance.',
-32008: 'You have failed to log in. Please contact our Help Center.',
}
result = int_or_none(response.get('RESULT'))
if result != 1:
error = _ERRORS.get(result, 'You have failed to log in.')
raise ExtractorError(
'Unable to login: %s said: %s' % (self.IE_NAME, error),
expected=True)
class AfreecaTVIE(AfreecaTVBaseIE):
IE_NAME = 'afreecatv'
IE_DESC = 'afreecatv.com'
_VALID_URL = r'''(?x)
@@ -34,7 +74,6 @@ class AfreecaTVIE(InfoExtractor):
)
(?P<id>\d+)
'''
_NETRC_MACHINE = 'afreecatv'
_TESTS = [{
'url': 'http://live.afreecatv.com:8079/app/index.cgi?szType=read_ucc_bbs&szBjId=dailyapril&nStationNo=16711924&nBbsNo=18605867&nTitleNo=36164052&szSkin=',
'md5': 'f72c89fe7ecc14c1b5ce506c4996046e',
@@ -87,6 +126,7 @@ class AfreecaTVIE(InfoExtractor):
'uploader': '♥이슬이',
'uploader_id': 'dasl8121',
'upload_date': '20170411',
'timestamp': 1491929865,
'duration': 213,
},
'params': {
@@ -120,219 +160,102 @@ class AfreecaTVIE(InfoExtractor):
'uploader_id': 'rlantnghks',
'uploader': '페이즈으',
'duration': 10840,
'thumbnail': 'http://videoimg.afreecatv.com/php/SnapshotLoad.php?rowKey=20230108_9FF5BEE1_244432674_1_r',
'thumbnail': r're:https?://videoimg\.afreecatv\.com/.+',
'upload_date': '20230108',
'timestamp': 1673218805,
'title': '젠지 페이즈',
},
'params': {
'skip_download': True,
},
}, {
# adult content
'url': 'https://vod.afreecatv.com/player/70395877',
'only_matching': True,
}, {
# subscribers only
'url': 'https://vod.afreecatv.com/player/104647403',
'only_matching': True,
}, {
# private
'url': 'https://vod.afreecatv.com/player/81669846',
'only_matching': True,
}]
@staticmethod
def parse_video_key(key):
video_key = {}
m = re.match(r'^(?P<upload_date>\d{8})_\w+_(?P<part>\d+)$', key)
if m:
video_key['upload_date'] = m.group('upload_date')
video_key['part'] = int(m.group('part'))
return video_key
def _perform_login(self, username, password):
login_form = {
'szWork': 'login',
'szType': 'json',
'szUid': username,
'szPassword': password,
'isSaveId': 'false',
'szScriptVar': 'oLoginRet',
'szAction': '',
}
response = self._download_json(
'https://login.afreecatv.com/app/LoginAction.php', None,
'Logging in', data=urlencode_postdata(login_form))
_ERRORS = {
-4: 'Your account has been suspended due to a violation of our terms and policies.',
-5: 'https://member.afreecatv.com/app/user_delete_progress.php',
-6: 'https://login.afreecatv.com/membership/changeMember.php',
-8: "Hello! AfreecaTV here.\nThe username you have entered belongs to \n an account that requires a legal guardian's consent. \nIf you wish to use our services without restriction, \nplease make sure to go through the necessary verification process.",
-9: 'https://member.afreecatv.com/app/pop_login_block.php',
-11: 'https://login.afreecatv.com/afreeca/second_login.php',
-12: 'https://member.afreecatv.com/app/user_security.php',
0: 'The username does not exist or you have entered the wrong password.',
-1: 'The username does not exist or you have entered the wrong password.',
-3: 'You have entered your username/password incorrectly.',
-7: 'You cannot use your Global AfreecaTV account to access Korean AfreecaTV.',
-10: 'Sorry for the inconvenience. \nYour account has been blocked due to an unauthorized access. \nPlease contact our Help Center for assistance.',
-32008: 'You have failed to log in. Please contact our Help Center.',
}
result = int_or_none(response.get('RESULT'))
if result != 1:
error = _ERRORS.get(result, 'You have failed to log in.')
raise ExtractorError(
'Unable to login: %s said: %s' % (self.IE_NAME, error),
expected=True)
def _real_extract(self, url):
video_id = self._match_id(url)
partial_view = False
adult_view = False
for _ in range(2):
data = self._download_json(
'https://api.m.afreecatv.com/station/video/a/view',
video_id, headers={'Referer': url}, data=urlencode_postdata({
'nTitleNo': video_id,
'nApiLevel': 10,
}))['data']
if traverse_obj(data, ('code', {int})) == -6221:
raise ExtractorError('The VOD does not exist', expected=True)
query = {
data = self._download_json(
'https://api.m.afreecatv.com/station/video/a/view', video_id,
headers={'Referer': url}, data=urlencode_postdata({
'nTitleNo': video_id,
'nStationNo': data['station_no'],
'nBbsNo': data['bbs_no'],
}
if partial_view:
query['partialView'] = 'SKIP_ADULT'
if adult_view:
query['adultView'] = 'ADULT_VIEW'
video_xml = self._download_xml(
'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php',
video_id, 'Downloading video info XML%s'
% (' (skipping adult)' if partial_view else ''),
video_id, headers={
'Referer': url,
}, query=query)
'nApiLevel': 10,
}))['data']
flag = xpath_text(video_xml, './track/flag', 'flag', default=None)
if flag and flag == 'SUCCEED':
break
if flag == 'PARTIAL_ADULT':
self.report_warning(
'In accordance with local laws and regulations, underage users are restricted from watching adult content. '
'Only content suitable for all ages will be downloaded. '
'Provide account credentials if you wish to download restricted content.')
partial_view = True
continue
elif flag == 'ADULT':
if not adult_view:
adult_view = True
continue
error = 'Only users older than 19 are able to watch this video. Provide account credentials to download this content.'
else:
error = flag
raise ExtractorError(
'%s said: %s' % (self.IE_NAME, error), expected=True)
else:
raise ExtractorError('Unable to download video info')
error_code = traverse_obj(data, ('code', {int}))
if error_code == -6221:
raise ExtractorError('The VOD does not exist', expected=True)
elif error_code == -6205:
raise ExtractorError('This VOD is private', expected=True)
video_element = video_xml.findall('./track/video')[-1]
if video_element is None or video_element.text is None:
raise ExtractorError(
'Video %s does not exist' % video_id, expected=True)
video_url = video_element.text.strip()
title = xpath_text(video_xml, './track/title', 'title', fatal=True)
uploader = xpath_text(video_xml, './track/nickname', 'uploader')
uploader_id = xpath_text(video_xml, './track/bj_id', 'uploader id')
duration = int_or_none(xpath_text(
video_xml, './track/duration', 'duration'))
thumbnail = xpath_text(video_xml, './track/titleImage', 'thumbnail')
common_entry = {
'uploader': uploader,
'uploader_id': uploader_id,
'thumbnail': thumbnail,
}
info = common_entry.copy()
info.update({
'id': video_id,
'title': title,
'duration': duration,
common_info = traverse_obj(data, {
'title': ('title', {str}),
'uploader': ('writer_nick', {str}),
'uploader_id': ('bj_id', {str}),
'duration': ('total_file_duration', {functools.partial(int_or_none, scale=1000)}),
'thumbnail': ('thumb', {url_or_none}),
})
if not video_url:
entries = []
file_elements = video_element.findall('./file')
one = len(file_elements) == 1
for file_num, file_element in enumerate(file_elements, start=1):
file_url = url_or_none(file_element.text)
if not file_url:
continue
key = file_element.get('key', '')
upload_date = unified_strdate(self._search_regex(
r'^(\d{8})_', key, 'upload date', default=None))
if upload_date is not None:
# sometimes the upload date isn't included in the file name
# instead, another random ID is, which may parse as a valid
# date but be wildly out of a reasonable range
parsed_date = date_from_str(upload_date)
if parsed_date.year < 2000 or parsed_date.year >= 2100:
upload_date = None
file_duration = int_or_none(file_element.get('duration'))
format_id = key if key else '%s_%s' % (video_id, file_num)
if determine_ext(file_url) == 'm3u8':
formats = self._extract_m3u8_formats(
file_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls',
note='Downloading part %d m3u8 information' % file_num)
else:
formats = [{
'url': file_url,
'format_id': 'http',
}]
if not formats and not self.get_param('ignore_no_formats'):
continue
file_info = common_entry.copy()
file_info.update({
'id': format_id,
'title': title if one else '%s (part %d)' % (title, file_num),
'upload_date': upload_date,
'duration': file_duration,
'formats': formats,
entries = []
for file_num, file_element in enumerate(
traverse_obj(data, ('files', lambda _, v: url_or_none(v['file']))), start=1):
file_url = file_element['file']
if determine_ext(file_url) == 'm3u8':
formats = self._extract_m3u8_formats(
file_url, video_id, 'mp4', m3u8_id='hls',
note=f'Downloading part {file_num} m3u8 information')
else:
formats = [{
'url': file_url,
'format_id': 'http',
}]
entries.append({
**common_info,
'id': file_element.get('file_info_key') or f'{video_id}_{file_num}',
'title': f'{common_info.get("title") or "Untitled"} (part {file_num})',
'formats': formats,
**traverse_obj(file_element, {
'duration': ('duration', {functools.partial(int_or_none, scale=1000)}),
'timestamp': ('file_start', {unified_timestamp}),
})
entries.append(file_info)
entries_info = info.copy()
entries_info.update({
'_type': 'multi_video',
'entries': entries,
})
return entries_info
info = {
'id': video_id,
'title': title,
'uploader': uploader,
'uploader_id': uploader_id,
'duration': duration,
'thumbnail': thumbnail,
}
if determine_ext(video_url) == 'm3u8':
info['formats'] = self._extract_m3u8_formats(
video_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls')
else:
app, playpath = video_url.split('mp4:')
info.update({
'url': app,
'ext': 'flv',
'play_path': 'mp4:' + playpath,
'rtmp_live': True, # downloading won't end without this
})
return info
if traverse_obj(data, ('adult_status', {str})) == 'notLogin':
if not entries:
self.raise_login_required(
'Only users older than 19 are able to watch this video', method='password')
self.report_warning(
'In accordance with local laws and regulations, underage users are '
'restricted from watching adult content. Only content suitable for all '
f'ages will be downloaded. {self._login_hint("password")}')
if not entries and traverse_obj(data, ('sub_upload_type', {str})):
self.raise_login_required('This VOD is for subscribers only', method='password')
if len(entries) == 1:
return {
**entries[0],
'title': common_info.get('title'),
}
common_info['timestamp'] = traverse_obj(entries, (..., 'timestamp'), get_all=False)
return self.playlist_result(entries, video_id, multi_video=True, **common_info)
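
For illustration (not part of the diff): the `{functools.partial(int_or_none, scale=1000)}` pattern used throughout this commit converts the API's millisecond fields to seconds during traversal. A minimal standalone sketch, with an illustrative value:

    import functools

    from yt_dlp.utils import int_or_none
    from yt_dlp.utils.traversal import traverse_obj

    data = {'total_file_duration': 213000}  # milliseconds, as the API returns them
    info = traverse_obj(data, {
        'duration': ('total_file_duration', {functools.partial(int_or_none, scale=1000)}),
    })
    assert info == {'duration': 213}  # int_or_none divides by `scale`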
class AfreecaTVLiveIE(AfreecaTVIE): # XXX: Do not subclass from concrete IE
class AfreecaTVLiveIE(AfreecaTVBaseIE):
IE_NAME = 'afreecatv:live'
IE_DESC = 'afreecatv.com livestreams'
_VALID_URL = r'https?://play\.afreeca(?:tv)?\.com/(?P<id>[^/]+)(?:/(?P<bno>\d+))?'
_TESTS = [{
'url': 'https://play.afreecatv.com/pyh3646/237852185',
@@ -347,77 +270,97 @@ class AfreecaTVLiveIE(AfreecaTVIE): # XXX: Do not subclass from concrete IE
},
'skip': 'Livestream has ended',
}, {
'url': 'http://play.afreeca.com/pyh3646/237852185',
'url': 'https://play.afreecatv.com/pyh3646/237852185',
'only_matching': True,
}, {
'url': 'http://play.afreeca.com/pyh3646',
'url': 'https://play.afreecatv.com/pyh3646',
'only_matching': True,
}]
_LIVE_API_URL = 'https://live.afreecatv.com/afreeca/player_live_api.php'
_WORKING_CDNS = [
'gcp_cdn', # live-global-cdn-v02.afreecatv.com
'gs_cdn_pc_app', # pc-app.stream.afreecatv.com
'gs_cdn_mobile_web', # mobile-web.stream.afreecatv.com
'gs_cdn_pc_web', # pc-web.stream.afreecatv.com
]
_BAD_CDNS = [
'gs_cdn', # chromecast.afreeca.gscdn.com (cannot resolve)
'gs_cdn_chromecast', # chromecast.stream.afreecatv.com (HTTP Error 400)
'azure_cdn', # live-global-cdn-v01.afreecatv.com (cannot resolve)
'aws_cf', # live-global-cdn-v03.afreecatv.com (cannot resolve)
'kt_cdn', # kt.stream.afreecatv.com (HTTP Error 400)
]
_QUALITIES = ('sd', 'hd', 'hd2k', 'original')
def _extract_formats(self, channel_info, broadcast_no, aid):
stream_base_url = channel_info.get('RMD') or 'https://livestream-manager.afreecatv.com'
# If user has not passed CDN IDs, try API-provided CDN ID followed by other working CDN IDs
default_cdn_ids = orderedSet([
*traverse_obj(channel_info, ('CDN', {str}, all, lambda _, v: v not in self._BAD_CDNS)),
*self._WORKING_CDNS,
])
cdn_ids = self._configuration_arg('cdn', default_cdn_ids)
for attempt, cdn_id in enumerate(cdn_ids, start=1):
m3u8_url = traverse_obj(self._download_json(
urljoin(stream_base_url, 'broad_stream_assign.html'), broadcast_no,
f'Downloading {cdn_id} stream info', f'Unable to download {cdn_id} stream info',
fatal=False, query={
'return_type': cdn_id,
'broad_key': f'{broadcast_no}-common-master-hls',
}), ('view_url', {url_or_none}))
try:
return self._extract_m3u8_formats(
m3u8_url, broadcast_no, 'mp4', m3u8_id='hls', query={'aid': aid},
headers={'Referer': 'https://play.afreecatv.com/'})
except ExtractorError as e:
if attempt == len(cdn_ids):
raise
self.report_warning(
f'{e.cause or e.msg}. Retrying... (attempt {attempt} of {len(cdn_ids)})')
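
For illustration (not part of the diff): the CDN preference above can be overridden from the command line via extractor arguments. Assuming the extractor-args key derived from this IE's name is `afreecatvlive` (which is what `_configuration_arg` implies), usage would look like:

    # hypothetical invocation; <channel_id> is a placeholder
    yt-dlp --extractor-args "afreecatvlive:cdn=gs_cdn_pc_app" "https://play.afreecatv.com/<channel_id>"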
def _real_extract(self, url):
broadcaster_id, broadcast_no = self._match_valid_url(url).group('id', 'bno')
password = self.get_param('videopassword')
channel_info = traverse_obj(self._download_json(
self._LIVE_API_URL, broadcaster_id, data=urlencode_postdata({'bid': broadcaster_id})),
('CHANNEL', {dict})) or {}
info = self._download_json(self._LIVE_API_URL, broadcaster_id, fatal=False,
data=urlencode_postdata({'bid': broadcaster_id})) or {}
channel_info = info.get('CHANNEL') or {}
broadcaster_id = channel_info.get('BJID') or broadcaster_id
broadcast_no = channel_info.get('BNO') or broadcast_no
password_protected = channel_info.get('BPWD')
if not broadcast_no:
raise ExtractorError(f'Unable to extract broadcast number ({broadcaster_id} may not be live)', expected=True)
if password_protected == 'Y' and password is None:
raise UserNotLive(video_id=broadcaster_id)
password = self.get_param('videopassword')
if channel_info.get('BPWD') == 'Y' and password is None:
raise ExtractorError(
'This livestream is protected by a password, use the --video-password option',
expected=True)
formats = []
quality_key = qualities(self._QUALITIES)
for quality_str in self._QUALITIES:
params = {
token_info = traverse_obj(self._download_json(
self._LIVE_API_URL, broadcast_no, 'Downloading access token for stream',
'Unable to download access token for stream', data=urlencode_postdata(filter_dict({
'bno': broadcast_no,
'stream_type': 'common',
'type': 'aid',
'quality': quality_str,
}
if password is not None:
params['pwd'] = password
aid_response = self._download_json(
self._LIVE_API_URL, broadcast_no, fatal=False,
data=urlencode_postdata(params),
note=f'Downloading access token for {quality_str} stream',
errnote=f'Unable to download access token for {quality_str} stream')
aid = traverse_obj(aid_response, ('CHANNEL', 'AID'))
if not aid:
continue
'quality': 'master',
'pwd': password,
}))), ('CHANNEL', {dict})) or {}
aid = token_info.get('AID')
if not aid:
result = token_info.get('RESULT')
if result == 0:
raise ExtractorError('This livestream has ended', expected=True)
elif result == -6:
self.raise_login_required('This livestream is for subscribers only', method='password')
raise ExtractorError('Unable to extract access token')
stream_base_url = channel_info.get('RMD') or 'https://livestream-manager.afreecatv.com'
stream_info = self._download_json(
f'{stream_base_url}/broad_stream_assign.html', broadcast_no, fatal=False,
query={
'return_type': channel_info.get('CDN', 'gcp_cdn'),
'broad_key': f'{broadcast_no}-common-{quality_str}-hls',
},
note=f'Downloading metadata for {quality_str} stream',
errnote=f'Unable to download metadata for {quality_str} stream') or {}
formats = self._extract_formats(channel_info, broadcast_no, aid)
if stream_info.get('view_url'):
formats.append({
'format_id': quality_str,
'url': update_url_query(stream_info['view_url'], {'aid': aid}),
'ext': 'mp4',
'protocol': 'm3u8',
'quality': quality_key(quality_str),
})
station_info = self._download_json(
station_info = traverse_obj(self._download_json(
'https://st.afreecatv.com/api/get_station_status.php', broadcast_no,
query={'szBjId': broadcaster_id}, fatal=False,
note='Downloading channel metadata', errnote='Unable to download channel metadata') or {}
'Downloading channel metadata', 'Unable to download channel metadata',
query={'szBjId': broadcaster_id}, fatal=False), {dict}) or {}
return {
'id': broadcast_no,
@@ -427,6 +370,7 @@ class AfreecaTVLiveIE(AfreecaTVIE): # XXX: Do not subclass from concrete IE
'timestamp': unified_timestamp(station_info.get('broad_start')),
'formats': formats,
'is_live': True,
'http_headers': {'Referer': url},
}
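
For illustration (not part of the diff): `filter_dict` in the token request above drops the `pwd` key when no `--video-password` was supplied, since yt-dlp's `filter_dict` removes None-valued entries by default. A minimal sketch with an illustrative broadcast number:

    from yt_dlp.utils import filter_dict, urlencode_postdata

    payload = filter_dict({
        'bno': 237852185,  # illustrative
        'stream_type': 'common',
        'type': 'aid',
        'quality': 'master',
        'pwd': None,  # no --video-password given
    })
    assert 'pwd' not in payload  # None values are filtered out
    data = urlencode_postdata(payload)  # urlencoded bytes for the POST body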

View File

@@ -1,5 +1,5 @@
import functools
import re
from functools import partial
from .common import InfoExtractor
from ..utils import (
@@ -349,7 +349,7 @@ class ARDBetaMediathekIE(InfoExtractor):
r'(?P<title>.*)',
]
return traverse_obj(patterns, (..., {partial(re.match, string=title)}, {
return traverse_obj(patterns, (..., {functools.partial(re.match, string=title)}, {
'season_number': ('season_number', {int_or_none}),
'episode_number': ('episode_number', {int_or_none}),
'episode': ((

View File

@@ -0,0 +1,154 @@
import functools
from .common import InfoExtractor
from ..utils import str_or_none, url_or_none
from ..utils.traversal import traverse_obj
class AsobiStageIE(InfoExtractor):
IE_DESC = 'ASOBISTAGE (アソビステージ)'
_VALID_URL = r'https?://asobistage\.asobistore\.jp/event/(?P<id>(?P<event>\w+)/(?P<type>archive|player)/(?P<slug>\w+))(?:[?#]|$)'
_TESTS = [{
'url': 'https://asobistage.asobistore.jp/event/315passionhour_2022summer/archive/frame',
'info_dict': {
'id': '315passionhour_2022summer/archive/frame',
'title': '315プロダクションプレゼンツ 315パッションアワー!!!',
'thumbnail': r're:^https?://[\w.-]+/\w+/\w+',
},
'playlist_count': 1,
'playlist': [{
'info_dict': {
'id': 'edff52f2',
'ext': 'mp4',
'title': '315passion_FRAME_only',
'thumbnail': r're:^https?://[\w.-]+/\w+/\w+',
},
}],
}, {
'url': 'https://asobistage.asobistore.jp/event/idolmaster_idolworld2023_goods/archive/live',
'info_dict': {
'id': 'idolmaster_idolworld2023_goods/archive/live',
'title': 'md5:378510b6e830129d505885908bd6c576',
'thumbnail': r're:^https?://[\w.-]+/\w+/\w+',
},
'playlist_count': 1,
'playlist': [{
'info_dict': {
'id': '3aef7110',
'ext': 'mp4',
'title': 'asobistore_station_1020_serverREC',
'thumbnail': r're:^https?://[\w.-]+/\w+/\w+',
},
}],
}, {
'url': 'https://asobistage.asobistore.jp/event/sidem_fclive_bpct/archive/premium_hc',
'playlist_count': 4,
'info_dict': {
'id': 'sidem_fclive_bpct/archive/premium_hc',
'title': '315 Production presents FNTASTIC COMBINATION LIVE BRAINPOWER!!/CONNECTIME!!!!',
'thumbnail': r're:^https?://[\w.-]+/\w+/\w+',
},
}, {
'url': 'https://asobistage.asobistore.jp/event/ijigenfes_utagassen/player/day1',
'only_matching': True,
}]
_API_HOST = 'https://asobistage-api.asobistore.jp'
_HEADERS = {}
_is_logged_in = False
@functools.cached_property
def _owned_tickets(self):
owned_tickets = set()
if not self._is_logged_in:
return owned_tickets
for path, name in [
('api/v1/purchase_history/list', 'ticket purchase history'),
('api/v1/serialcode/list', 'redemption history'),
]:
response = self._download_json(
f'{self._API_HOST}/{path}', None, f'Downloading {name}',
f'Unable to download {name}', expected_status=400)
if traverse_obj(response, ('payload', 'error_message'), 'error') == 'notlogin':
self._is_logged_in = False
break
owned_tickets.update(
traverse_obj(response, ('payload', 'value', ..., 'digital_product_id', {str_or_none})))
return owned_tickets
def _get_available_channel_id(self, channel):
channel_id = traverse_obj(channel, ('chennel_vspf_id', {str}))
if not channel_id:
return None
# if rights_type_id == 6, then 'No conditions (no login required - non-members are OK)'
if traverse_obj(channel, ('viewrights', lambda _, v: v['rights_type_id'] == 6)):
return channel_id
available_tickets = traverse_obj(channel, (
'viewrights', ..., ('tickets', 'serialcodes'), ..., 'digital_product_id', {str_or_none}))
if not self._owned_tickets.intersection(available_tickets):
self.report_warning(
f'You are not a ticketholder for "{channel.get("channel_name") or channel_id}"')
return None
return channel_id
def _real_initialize(self):
if self._get_cookies(self._API_HOST):
self._is_logged_in = True
token = self._download_json(
f'{self._API_HOST}/api/v1/vspf/token', None, 'Getting token', 'Unable to get token')
self._HEADERS['Authorization'] = f'Bearer {token}'
def _real_extract(self, url):
video_id, event, type_, slug = self._match_valid_url(url).group('id', 'event', 'type', 'slug')
video_type = {'archive': 'archives', 'player': 'broadcasts'}[type_]
webpage = self._download_webpage(url, video_id)
event_data = traverse_obj(
self._search_nextjs_data(webpage, video_id, default='{}'),
('props', 'pageProps', 'eventCMSData', {
'title': ('event_name', {str}),
'thumbnail': ('event_thumbnail_image', {url_or_none}),
}))
available_channels = traverse_obj(self._download_json(
f'https://asobistage.asobistore.jp/cdn/v101/events/{event}/{video_type}.json',
video_id, 'Getting channel list', 'Unable to get channel list'), (
video_type, lambda _, v: v['broadcast_slug'] == slug,
'channels', lambda _, v: v['chennel_vspf_id'] != '00000'))
entries = []
for channel_id in traverse_obj(available_channels, (..., {self._get_available_channel_id})):
if video_type == 'archives':
channel_json = self._download_json(
f'https://survapi.channel.or.jp/proxy/v1/contents/{channel_id}/get_by_cuid', channel_id,
'Getting archive channel info', 'Unable to get archive channel info', fatal=False,
headers=self._HEADERS)
channel_data = traverse_obj(channel_json, ('ex_content', {
'm3u8_url': 'streaming_url',
'title': 'title',
'thumbnail': ('thumbnail', 'url'),
}))
else: # video_type == 'broadcasts'
channel_json = self._download_json(
f'https://survapi.channel.or.jp/ex/events/{channel_id}', channel_id,
'Getting live channel info', 'Unable to get live channel info', fatal=False,
headers=self._HEADERS, query={'embed': 'channel'})
channel_data = traverse_obj(channel_json, ('data', {
'm3u8_url': ('Channel', 'Custom_live_url'),
'title': 'Name',
'thumbnail': 'Poster_url',
}))
entries.append({
'id': channel_id,
'title': channel_data.get('title'),
'formats': self._extract_m3u8_formats(channel_data.get('m3u8_url'), channel_id, fatal=False),
'is_live': video_type == 'broadcasts',
'thumbnail': url_or_none(channel_data.get('thumbnail')),
})
if not self._is_logged_in and not entries:
self.raise_login_required()
return self.playlist_result(entries, video_id, **event_data)
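
For illustration (not part of the diff): the access check in `_get_available_channel_id` reduces to "free if any viewright has `rights_type_id == 6`, otherwise the viewer must own at least one of the channel's digital product ids". A simplified sketch with made-up ids (the real code also checks `serialcodes` alongside `tickets`):

    owned_tickets = {'1001', '1002'}  # from purchase/redemption history
    channel = {
        'chennel_vspf_id': 'abcd1234',  # sic: field name as the API spells it
        'viewrights': [
            {'rights_type_id': 1, 'tickets': [{'digital_product_id': 1002}]},
        ],
    }
    is_free = any(r['rights_type_id'] == 6 for r in channel['viewrights'])
    required = {str(t['digital_product_id'])
                for r in channel['viewrights'] for t in r.get('tickets', [])}
    viewable = is_free or bool(owned_tickets & required)
    assert viewable  # owns ticket 1002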

View File

@@ -1,4 +1,4 @@
import datetime
import datetime as dt
from .common import InfoExtractor
from ..utils import (
@@ -71,9 +71,9 @@ class ATVAtIE(InfoExtractor):
content_ids = [{'id': id, 'subclip_start': content['start'], 'subclip_end': content['end']}
for id, content in enumerate(contentResource)]
time_of_request = datetime.datetime.now()
not_before = time_of_request - datetime.timedelta(minutes=5)
expire = time_of_request + datetime.timedelta(minutes=5)
time_of_request = dt.datetime.now()
not_before = time_of_request - dt.timedelta(minutes=5)
expire = time_of_request + dt.timedelta(minutes=5)
payload = {
'content_ids': {
content_id: content_ids,

View File

@@ -1,4 +1,4 @@
import datetime
import datetime as dt
import hashlib
import hmac
@@ -12,7 +12,7 @@ class AWSIE(InfoExtractor): # XXX: Conventionally, base classes should end with
def _aws_execute_api(self, aws_dict, video_id, query=None):
query = query or {}
amz_date = datetime.datetime.now(datetime.timezone.utc).strftime('%Y%m%dT%H%M%SZ')
amz_date = dt.datetime.now(dt.timezone.utc).strftime('%Y%m%dT%H%M%SZ')
date = amz_date[:8]
headers = {
'Accept': 'application/json',

View File

@@ -1,4 +1,4 @@
from functools import partial
import functools
from .common import InfoExtractor
from ..utils import (
@@ -50,7 +50,7 @@ class BibelTVBaseIE(InfoExtractor):
**traverse_obj(data, {
'title': 'title',
'description': 'description',
'duration': ('duration', {partial(int_or_none, scale=1000)}),
'duration': ('duration', {functools.partial(int_or_none, scale=1000)}),
'timestamp': ('schedulingStart', {parse_iso8601}),
'season_number': 'seasonNumber',
'episode_number': 'episodeNumber',

View File

@@ -3,6 +3,7 @@ import urllib.parse
from .common import InfoExtractor
from ..utils import (
ExtractorError,
parse_iso8601,
update_url_query,
url_or_none,
@@ -11,8 +12,8 @@ from ..utils.traversal import traverse_obj
class BoxIE(InfoExtractor):
_VALID_URL = r'https?://(?:[^.]+\.)?app\.box\.com/s/(?P<shared_name>[^/?#]+)/file/(?P<id>\d+)'
_TEST = {
_VALID_URL = r'https?://(?:[^.]+\.)?app\.box\.com/s/(?P<shared_name>[^/?#]+)(?:/file/(?P<id>\d+))?'
_TESTS = [{
'url': 'https://mlssoccer.app.box.com/s/0evd2o3e08l60lr4ygukepvnkord1o1x/file/510727257538',
'md5': '1f81b2fd3960f38a40a3b8823e5fcd43',
'info_dict': {
@@ -25,14 +26,36 @@ class BoxIE(InfoExtractor):
'uploader_id': '235196876',
},
'params': {'skip_download': 'dash fragment too small'},
}
}, {
'url': 'https://utexas.app.box.com/s/2x6vanv85fdl8j2eqlcxmv0gp1wvps6e',
'info_dict': {
'id': '787379022466',
'ext': 'mp4',
'title': 'Webinar recording: Take the Leap!.mp4',
'uploader': 'Patricia Mosele',
'timestamp': 1615824864,
'upload_date': '20210315',
'uploader_id': '239068974',
},
'params': {'skip_download': 'dash fragment too small'},
}]
def _real_extract(self, url):
shared_name, file_id = self._match_valid_url(url).groups()
webpage = self._download_webpage(url, file_id)
request_token = self._parse_json(self._search_regex(
r'Box\.config\s*=\s*({.+?});', webpage,
'Box config'), file_id)['requestToken']
webpage = self._download_webpage(url, file_id or shared_name)
if not file_id:
post_stream_data = self._search_json(
r'Box\.postStreamData\s*=', webpage, 'Box post-stream data', shared_name)
shared_item = traverse_obj(
post_stream_data, ('/app-api/enduserapp/shared-item', {dict})) or {}
if shared_item.get('itemType') != 'file':
raise ExtractorError('The requested resource is not a file', expected=True)
file_id = str(shared_item['itemID'])
request_token = self._search_json(
r'Box\.config\s*=', webpage, 'Box config', file_id)['requestToken']
access_token = self._download_json(
'https://app.box.com/app-api/enduserapp/elements/tokens', file_id,
'Downloading token JSON metadata',
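
For illustration (not part of the diff): the traversal above implies `Box.postStreamData` embeds at least the following structure for shared links without an explicit file id (values loosely mirror the new test; illustrative only):

    post_stream_data = {
        '/app-api/enduserapp/shared-item': {
            'itemType': 'file',      # anything else raises 'not a file'
            'itemID': 787379022466,  # stringified into file_id
        },
    }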

View File

@@ -1,5 +1,5 @@
import functools
import re
from functools import partial
from .common import InfoExtractor
from ..networking.exceptions import HTTPError
@@ -115,9 +115,9 @@ class BundestagIE(InfoExtractor):
note='Downloading metadata overlay', fatal=False,
), {
'title': (
{partial(get_element_text_and_html_by_tag, 'h3')}, 0,
{partial(re.sub, r'<span[^>]*>[^<]+</span>', '')}, {clean_html}),
'description': ({partial(get_element_text_and_html_by_tag, 'p')}, 0, {clean_html}),
{functools.partial(get_element_text_and_html_by_tag, 'h3')}, 0,
{functools.partial(re.sub, r'<span[^>]*>[^<]+</span>', '')}, {clean_html}),
'description': ({functools.partial(get_element_text_and_html_by_tag, 'p')}, 0, {clean_html}),
}))
return result

View File

@@ -151,7 +151,7 @@ class CBCIE(InfoExtractor):
class CBCPlayerIE(InfoExtractor):
IE_NAME = 'cbc.ca:player'
_VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P<id>\d+)'
_VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P<id>(?:\d\.)?\d+)'
_TESTS = [{
'url': 'http://www.cbc.ca/player/play/2683190193',
'md5': '64d25f841ddf4ddb28a235338af32e2c',
@@ -165,9 +165,52 @@ class CBCPlayerIE(InfoExtractor):
'uploader': 'CBCC-NEW',
},
'skip': 'Geo-restricted to Canada and no longer available',
}, {
'url': 'http://www.cbc.ca/i/caffeine/syndicate/?mediaId=2657631896',
'md5': 'e5e708c34ae6fca156aafe17c43e8b75',
'info_dict': {
'id': '2657631896',
'ext': 'mp3',
'title': 'CBC Montreal is organizing its first ever community hackathon!',
'description': 'md5:dd3b692f0a139b0369943150bd1c46a9',
'timestamp': 1425704400,
'upload_date': '20150307',
'uploader': 'CBCC-NEW',
'thumbnail': 'http://thumbnails.cbc.ca/maven_legacy/thumbnails/sonali-karnick-220.jpg',
'chapters': [],
'duration': 494.811,
'categories': ['AudioMobile/All in a Weekend Montreal'],
'tags': 'count:8',
'location': 'Quebec',
'series': 'All in a Weekend Montreal',
'season': 'Season 2015',
'season_number': 2015,
'media_type': 'Excerpt',
},
}, {
'url': 'http://www.cbc.ca/i/caffeine/syndicate/?mediaId=2164402062',
'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6',
'info_dict': {
'id': '2164402062',
'ext': 'mp4',
'title': 'Cancer survivor four times over',
'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.',
'timestamp': 1320410746,
'upload_date': '20111104',
'uploader': 'CBCC-NEW',
'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/277/67/cancer_852x480_2164412612.jpg',
'chapters': [],
'duration': 186.867,
'series': 'CBC News: Windsor at 6:00',
'categories': ['News/Canada/Windsor'],
'location': 'Windsor',
'tags': ['cancer'],
'creators': ['Allison Johnson'],
'media_type': 'Excerpt',
},
}, {
# Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/
'url': 'http://www.cbc.ca/player/play/2657631896',
'url': 'https://www.cbc.ca/player/play/1.2985700',
'md5': 'e5e708c34ae6fca156aafe17c43e8b75',
'info_dict': {
'id': '2657631896',
@@ -189,7 +232,7 @@ class CBCPlayerIE(InfoExtractor):
'media_type': 'Excerpt',
},
}, {
'url': 'http://www.cbc.ca/player/play/2164402062',
'url': 'https://www.cbc.ca/player/play/1.1711287',
'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6',
'info_dict': {
'id': '2164402062',
@@ -206,38 +249,53 @@ class CBCPlayerIE(InfoExtractor):
'categories': ['News/Canada/Windsor'],
'location': 'Windsor',
'tags': ['cancer'],
'creator': 'Allison Johnson',
'creators': ['Allison Johnson'],
'media_type': 'Excerpt',
},
}, {
# Has subtitles
# These broadcasts expire after ~1 month, can find new test URL here:
# https://www.cbc.ca/player/news/TV%20Shows/The%20National/Latest%20Broadcast
'url': 'http://www.cbc.ca/player/play/2284799043667',
'md5': '9b49f0839e88b6ec0b01d840cf3d42b5',
'url': 'https://www.cbc.ca/player/play/1.7159484',
'md5': '6ed6cd0fc2ef568d2297ba68a763d455',
'info_dict': {
'id': '2284799043667',
'id': '2324213316001',
'ext': 'mp4',
'title': 'The National | Hockey coach charged, Green grants, Safer drugs',
'description': 'md5:84ef46321c94bcf7d0159bb565d26bfa',
'timestamp': 1700272800,
'duration': 2718.833,
'title': 'The National | School boards sue social media giants',
'description': 'md5:4b4db69322fa32186c3ce426da07402c',
'timestamp': 1711681200,
'duration': 2743.400,
'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]},
'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/907/171/thumbnail.jpeg',
'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/607/559/thumbnail.jpeg',
'uploader': 'CBCC-NEW',
'chapters': 'count:5',
'upload_date': '20231118',
'upload_date': '20240329',
'categories': 'count:4',
'series': 'The National - Full Show',
'tags': 'count:1',
'creator': 'News',
'creators': ['News'],
'location': 'Canada',
'media_type': 'Full Program',
},
}, {
'url': 'cbcplayer:1.7159484',
'only_matching': True,
}, {
'url': 'cbcplayer:2164402062',
'only_matching': True,
}, {
'url': 'http://www.cbc.ca/player/play/2657631896',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
if '.' in video_id:
webpage = self._download_webpage(f'https://www.cbc.ca/player/play/{video_id}', video_id)
video_id = self._search_json(
r'window\.__INITIAL_STATE__\s*=', webpage,
'initial state', video_id)['video']['currentClip']['mediaId']
return {
'_type': 'url_transparent',
'ie_key': 'ThePlatform',

View File

@@ -1,6 +1,6 @@
import base64
import codecs
import datetime
import datetime as dt
import hashlib
import hmac
import json
@@ -134,7 +134,7 @@ class CDAIE(InfoExtractor):
self._API_HEADERS['User-Agent'] = f'pl.cda 1.0 (version {app_version}; Android {android_version}; {phone_model})'
cached_bearer = self.cache.load(self._BEARER_CACHE, username) or {}
if cached_bearer.get('valid_until', 0) > datetime.datetime.now().timestamp() + 5:
if cached_bearer.get('valid_until', 0) > dt.datetime.now().timestamp() + 5:
self._API_HEADERS['Authorization'] = f'Bearer {cached_bearer["token"]}'
return
@@ -154,7 +154,7 @@ class CDAIE(InfoExtractor):
})
self.cache.store(self._BEARER_CACHE, username, {
'token': token_res['access_token'],
'valid_until': token_res['expires_in'] + datetime.datetime.now().timestamp(),
'valid_until': token_res['expires_in'] + dt.datetime.now().timestamp(),
})
self._API_HEADERS['Authorization'] = f'Bearer {token_res["access_token"]}'

View File

@@ -37,6 +37,7 @@ from ..networking.exceptions import (
IncompleteRead,
network_exceptions,
)
from ..networking.impersonate import ImpersonateTarget
from ..utils import (
IDENTITY,
JSON_LD_RE,
@@ -170,12 +171,12 @@ class InfoExtractor:
Automatically calculated from width and height
* dynamic_range The dynamic range of the video. One of:
"SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV"
* tbr Average bitrate of audio and video in KBit/s
* abr Average audio bitrate in KBit/s
* tbr Average bitrate of audio and video in kbps (1000 bits/sec)
* abr Average audio bitrate in kbps (1000 bits/sec)
* acodec Name of the audio codec in use
* asr Audio sampling rate in Hertz
* audio_channels Number of audio channels
* vbr Average video bitrate in KBit/s
* vbr Average video bitrate in kbps (1000 bits/sec)
* fps Frame rate
* vcodec Name of the video codec in use
* container Name of the container format
@@ -246,7 +247,8 @@ class InfoExtractor:
* downloader_options A dictionary of downloader options
(For internal use only)
* http_chunk_size Chunk size for HTTP downloads
* ffmpeg_args Extra arguments for ffmpeg downloader
* ffmpeg_args Extra arguments for ffmpeg downloader (input)
* ffmpeg_args_out Extra arguments for ffmpeg downloader (output)
* is_dash_periods Whether the format is a result of merging
multiple DASH periods.
RTMP formats can also have the additional fields: page_url,
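
For illustration (not part of the diff): a format dict an extractor might return using the new split input/output ffmpeg arguments (URL and flags are examples only):

    fmt = {
        'url': 'https://example.com/stream.m3u8',  # placeholder URL
        'ext': 'mp4',
        'downloader_options': {
            'ffmpeg_args': ['-re'],      # extra input-side arguments
            'ffmpeg_args_out': ['-dn'],  # extra output-side arguments
        },
    }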
@@ -817,7 +819,7 @@ class InfoExtractor:
else:
return err.status in variadic(expected_status)
def _create_request(self, url_or_request, data=None, headers=None, query=None):
def _create_request(self, url_or_request, data=None, headers=None, query=None, extensions=None):
if isinstance(url_or_request, urllib.request.Request):
self._downloader.deprecation_warning(
'Passing a urllib.request.Request to _create_request() is deprecated. '
@@ -826,10 +828,11 @@ class InfoExtractor:
elif not isinstance(url_or_request, Request):
url_or_request = Request(url_or_request)
url_or_request.update(data=data, headers=headers, query=query)
url_or_request.update(data=data, headers=headers, query=query, extensions=extensions)
return url_or_request
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None,
headers=None, query=None, expected_status=None, impersonate=None, require_impersonation=False):
"""
Return the response handle.
@@ -860,8 +863,31 @@ class InfoExtractor:
headers = (headers or {}).copy()
headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
extensions = {}
if impersonate in (True, ''):
impersonate = ImpersonateTarget()
requested_targets = [
t if isinstance(t, ImpersonateTarget) else ImpersonateTarget.from_str(t)
for t in variadic(impersonate)
] if impersonate else []
available_target = next(filter(self._downloader._impersonate_target_available, requested_targets), None)
if available_target:
extensions['impersonate'] = available_target
elif requested_targets:
message = 'The extractor is attempting impersonation, but '
message += (
'no impersonate target is available' if not str(impersonate)
else f'none of these impersonate targets are available: "{", ".join(map(str, requested_targets))}"')
info_msg = ('see https://github.com/yt-dlp/yt-dlp#impersonation '
'for information on installing the required dependencies')
if require_impersonation:
raise ExtractorError(f'{message}; {info_msg}', expected=True)
self.report_warning(f'{message}; if you encounter errors, then {info_msg}', only_once=True)
try:
return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query, extensions))
except network_exceptions as err:
if isinstance(err, HTTPError):
if self.__can_accept_status_code(err, expected_status):
@@ -880,13 +906,14 @@ class InfoExtractor:
return False
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
encoding=None, data=None, headers={}, query={}, expected_status=None):
encoding=None, data=None, headers={}, query={}, expected_status=None,
impersonate=None, require_impersonation=False):
"""
Return a tuple (page content as string, URL handle).
Arguments:
url_or_request -- plain text URL as a string or
a urllib.request.Request object
a yt_dlp.networking.Request object
video_id -- Video/playlist/item identifier (string)
Keyword arguments:
@@ -911,13 +938,22 @@ class InfoExtractor:
returning True if it should be accepted
Note that this argument does not affect success status codes (2xx)
which are always accepted.
impersonate -- the impersonate target. Can be any of the following entities:
- an instance of yt_dlp.networking.impersonate.ImpersonateTarget
- a string in the format of CLIENT[:OS]
- a list or a tuple of CLIENT[:OS] strings or ImpersonateTarget instances
- a boolean value; True means any impersonate target is sufficient
require_impersonation -- flag to toggle whether the request should raise an error
if impersonation is not possible (bool, default: False)
"""
# Strip hashes from the URL (#1038)
if isinstance(url_or_request, str):
url_or_request = url_or_request.partition('#')[0]
urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data,
headers=headers, query=query, expected_status=expected_status,
impersonate=impersonate, require_impersonation=require_impersonation)
if urlh is False:
assert not fatal
return False
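
For illustration (not part of the diff): how an extractor might use the new keywords. `SomeSiteIE` and the chosen target are hypothetical; the accepted forms are the ones listed in the docstring above:

    from yt_dlp.networking.impersonate import ImpersonateTarget

    class SomeSiteIE(InfoExtractor):  # hypothetical extractor
        def _real_extract(self, url):
            video_id = self._match_id(url)
            # impersonate accepts an ImpersonateTarget, a 'CLIENT[:OS]' string,
            # a list/tuple of either, or True (any available target)
            webpage = self._download_webpage(
                url, video_id, impersonate=ImpersonateTarget('chrome'),
                require_impersonation=False)  # True would make failure fatal
            ...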
@@ -1046,17 +1082,20 @@ class InfoExtractor:
return getattr(ie, parser)(content, *args, **kwargs)
def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None,
impersonate=None, require_impersonation=False):
res = self._download_webpage_handle(
url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
data=data, headers=headers, query=query, expected_status=expected_status)
data=data, headers=headers, query=query, expected_status=expected_status,
impersonate=impersonate, require_impersonation=require_impersonation)
if res is False:
return res
content, urlh = res
return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh
def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None,
impersonate=None, require_impersonation=False):
if self.get_param('load_pages'):
url_or_request = self._create_request(url_or_request, data, headers, query)
filename = self._request_dump_filename(url_or_request.url, video_id)
@@ -1079,6 +1118,8 @@ class InfoExtractor:
'headers': headers,
'query': query,
'expected_status': expected_status,
'impersonate': impersonate,
'require_impersonation': require_impersonation,
}
if parser is None:
kwargs.pop('transform_source')

View File

@@ -1,4 +1,5 @@
import base64
import uuid
from .common import InfoExtractor
from ..networking.exceptions import HTTPError
@@ -7,12 +8,11 @@ from ..utils import (
float_or_none,
format_field,
int_or_none,
join_nonempty,
jwt_decode_hs256,
parse_age_limit,
parse_count,
parse_iso8601,
qualities,
remove_start,
time_seconds,
traverse_obj,
url_or_none,
@@ -27,6 +27,7 @@ class CrunchyrollBaseIE(InfoExtractor):
_AUTH_HEADERS = None
_API_ENDPOINT = None
_BASIC_AUTH = None
_IS_PREMIUM = None
_CLIENT_ID = ('cr_web', 'noaihdevm_6iyg0a8l0q')
_LOCALE_LOOKUP = {
'ar': 'ar-SA',
@@ -84,11 +85,16 @@ class CrunchyrollBaseIE(InfoExtractor):
self.write_debug(f'Using cxApiParam={cx_api_param}')
CrunchyrollBaseIE._BASIC_AUTH = 'Basic ' + base64.b64encode(f'{cx_api_param}:'.encode()).decode()
grant_type = 'etp_rt_cookie' if self.is_logged_in else 'client_id'
auth_headers = {'Authorization': CrunchyrollBaseIE._BASIC_AUTH}
if self.is_logged_in:
grant_type = 'etp_rt_cookie'
else:
grant_type = 'client_id'
auth_headers['ETP-Anonymous-ID'] = uuid.uuid4()
try:
auth_response = self._download_json(
f'{self._BASE_URL}/auth/v1/token', None, note=f'Authenticating with grant_type={grant_type}',
headers={'Authorization': CrunchyrollBaseIE._BASIC_AUTH}, data=f'grant_type={grant_type}'.encode())
headers=auth_headers, data=f'grant_type={grant_type}'.encode())
except ExtractorError as error:
if isinstance(error.cause, HTTPError) and error.cause.status == 403:
raise ExtractorError(
@@ -97,6 +103,7 @@ class CrunchyrollBaseIE(InfoExtractor):
'and your browser\'s User-Agent (with --user-agent)', expected=True)
raise
CrunchyrollBaseIE._IS_PREMIUM = 'cr_premium' in traverse_obj(auth_response, ('access_token', {jwt_decode_hs256}, 'benefits', ...))
CrunchyrollBaseIE._AUTH_HEADERS = {'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token']}
CrunchyrollBaseIE._AUTH_REFRESH = time_seconds(seconds=traverse_obj(auth_response, ('expires_in', {float_or_none}), default=300) - 10)
@@ -135,62 +142,72 @@ class CrunchyrollBaseIE(InfoExtractor):
raise ExtractorError(f'Unexpected response when downloading {note} JSON')
return result
def _extract_formats(self, stream_response, display_id=None):
requested_formats = self._configuration_arg('format') or ['vo_adaptive_hls']
available_formats = {}
for stream_type, streams in traverse_obj(
stream_response, (('streams', ('data', 0)), {dict.items}, ...)):
if stream_type not in requested_formats:
def _extract_chapters(self, internal_id):
# if no skip events are available, a 403 xml error is returned
skip_events = self._download_json(
f'https://static.crunchyroll.com/skip-events/production/{internal_id}.json',
internal_id, note='Downloading chapter info', fatal=False, errnote=False)
if not skip_events:
return None
chapters = []
for event in ('recap', 'intro', 'credits', 'preview'):
start = traverse_obj(skip_events, (event, 'start', {float_or_none}))
end = traverse_obj(skip_events, (event, 'end', {float_or_none}))
# some chapters have no start and/or end time; they will just be ignored
if start is None or end is None:
continue
for stream in traverse_obj(streams, lambda _, v: v['url']):
hardsub_lang = stream.get('hardsub_locale') or ''
format_id = join_nonempty(stream_type, format_field(stream, 'hardsub_locale', 'hardsub-%s'))
available_formats[hardsub_lang] = (stream_type, format_id, hardsub_lang, stream['url'])
chapters.append({'title': event.capitalize(), 'start_time': start, 'end_time': end})
return chapters
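
For illustration (not part of the diff): a skip-events payload consistent with the loop above would look like this (timings made up):

    skip_events = {
        'intro': {'start': 90.0, 'end': 180.0},
        'credits': {'start': 1320.5, 'end': 1410.0},
        # 'recap'/'preview' may be absent; events missing either endpoint
        # are skipped by the start/end check above
    }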
def _extract_stream(self, identifier, display_id=None):
if not display_id:
display_id = identifier
self._update_auth()
stream_response = self._download_json(
f'https://cr-play-service.prd.crunchyrollsvc.com/v1/{identifier}/console/switch/play',
display_id, note='Downloading stream info', headers=CrunchyrollBaseIE._AUTH_HEADERS)
available_formats = {'': ('', '', stream_response['url'])}
for hardsub_lang, stream in traverse_obj(stream_response, ('hardSubs', {dict.items}, lambda _, v: v[1]['url'])):
available_formats[hardsub_lang] = (f'hardsub-{hardsub_lang}', hardsub_lang, stream['url'])
requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])]
if '' in available_formats and 'all' not in requested_hardsubs:
hardsub_langs = [lang for lang in available_formats if lang]
if hardsub_langs and 'all' not in requested_hardsubs:
full_format_langs = set(requested_hardsubs)
self.to_screen(f'Available hardsub languages: {", ".join(hardsub_langs)}')
self.to_screen(
'To get all formats of a hardsub language, use '
'To extract formats of a hardsub language, use '
'"--extractor-args crunchyrollbeta:hardsub=<language_code or all>". '
'See https://github.com/yt-dlp/yt-dlp#crunchyrollbeta-crunchyroll for more info',
only_once=True)
else:
full_format_langs = set(map(str.lower, available_formats))
audio_locale = traverse_obj(stream_response, ((None, 'meta'), 'audio_locale'), get_all=False)
audio_locale = traverse_obj(stream_response, ('audioLocale', {str}))
hardsub_preference = qualities(requested_hardsubs[::-1])
formats = []
for stream_type, format_id, hardsub_lang, stream_url in available_formats.values():
if stream_type.endswith('hls'):
if hardsub_lang.lower() in full_format_langs:
adaptive_formats = self._extract_m3u8_formats(
stream_url, display_id, 'mp4', m3u8_id=format_id,
fatal=False, note=f'Downloading {format_id} HLS manifest')
else:
adaptive_formats = (self._m3u8_meta_format(stream_url, ext='mp4', m3u8_id=format_id),)
elif stream_type.endswith('dash'):
adaptive_formats = self._extract_mpd_formats(
stream_url, display_id, mpd_id=format_id,
fatal=False, note=f'Downloading {format_id} MPD manifest')
formats, subtitles = [], {}
for format_id, hardsub_lang, stream_url in available_formats.values():
if hardsub_lang.lower() in full_format_langs:
adaptive_formats, dash_subs = self._extract_mpd_formats_and_subtitles(
stream_url, display_id, mpd_id=format_id, headers=CrunchyrollBaseIE._AUTH_HEADERS,
fatal=False, note=f'Downloading {f"{format_id} " if hardsub_lang else ""}MPD manifest')
self._merge_subtitles(dash_subs, target=subtitles)
else:
self.report_warning(f'Encountered unknown stream_type: {stream_type!r}', display_id, only_once=True)
continue
continue # XXX: Update this if/when meta mpd formats are working
for f in adaptive_formats:
if f.get('acodec') != 'none':
f['language'] = audio_locale
f['quality'] = hardsub_preference(hardsub_lang.lower())
formats.extend(adaptive_formats)
return formats
for locale, subtitle in traverse_obj(stream_response, (('subtitles', 'captions'), {dict.items}, ...)):
subtitles.setdefault(locale, []).append(traverse_obj(subtitle, {'url': 'url', 'ext': 'format'}))
def _extract_subtitles(self, data):
subtitles = {}
for locale, subtitle in traverse_obj(data, ((None, 'meta'), 'subtitles', {dict.items}, ...)):
subtitles[locale] = [traverse_obj(subtitle, {'url': 'url', 'ext': 'format'})]
return subtitles
return formats, subtitles
class CrunchyrollCmsBaseIE(CrunchyrollBaseIE):
@@ -245,7 +262,11 @@ class CrunchyrollBetaIE(CrunchyrollCmsBaseIE):
'like_count': int,
'dislike_count': int,
},
'params': {'skip_download': 'm3u8', 'format': 'all[format_id~=hardsub]'},
'params': {
'skip_download': 'm3u8',
'extractor_args': {'crunchyrollbeta': {'hardsub': ['de-DE']}},
'format': 'bv[format_id~=hardsub]',
},
}, {
# Premium only
'url': 'https://www.crunchyroll.com/watch/GYE5WKQGR',
@@ -306,6 +327,7 @@ class CrunchyrollBetaIE(CrunchyrollCmsBaseIE):
'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$',
},
'params': {'skip_download': 'm3u8'},
'skip': 'no longer exists',
}, {
'url': 'https://www.crunchyroll.com/watch/G62PEZ2E6',
'info_dict': {
@@ -359,31 +381,15 @@ class CrunchyrollBetaIE(CrunchyrollCmsBaseIE):
else:
raise ExtractorError(f'Unknown object type {object_type}')
# There might be multiple audio languages for one object (`<object>_metadata.versions`),
# so we need to get the id from `streams_link` instead or we don't know which language to choose
streams_link = response.get('streams_link')
if not streams_link and traverse_obj(response, (f'{object_type}_metadata', 'is_premium_only')):
if not self._IS_PREMIUM and traverse_obj(response, (f'{object_type}_metadata', 'is_premium_only')):
message = f'This {object_type} is for premium members only'
if self.is_logged_in:
raise ExtractorError(message, expected=True)
self.raise_login_required(message)
# We need to go from the unsigned to the signed API to avoid getting soft banned
stream_response = self._call_cms_api_signed(remove_start(
streams_link, '/content/v2/cms/'), internal_id, lang, 'stream info')
result['formats'] = self._extract_formats(stream_response, internal_id)
result['subtitles'] = self._extract_subtitles(stream_response)
result['formats'], result['subtitles'] = self._extract_stream(internal_id)
# if no intro chapter is available, a 403 without usable data is returned
intro_chapter = self._download_json(
f'https://static.crunchyroll.com/datalab-intro-v2/{internal_id}.json',
internal_id, note='Downloading chapter info', fatal=False, errnote=False)
if isinstance(intro_chapter, dict):
result['chapters'] = [{
'title': 'Intro',
'start_time': float_or_none(intro_chapter.get('startTime')),
'end_time': float_or_none(intro_chapter.get('endTime')),
}]
result['chapters'] = self._extract_chapters(internal_id)
def calculate_count(item):
return parse_count(''.join((item['displayed'], item.get('unit') or '')))
@@ -512,7 +518,7 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE):
'display_id': 'egaono-hana',
'title': 'Egaono Hana',
'track': 'Egaono Hana',
'artist': 'Goose house',
'artists': ['Goose house'],
'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$',
'genres': ['J-Pop'],
},
@@ -525,11 +531,12 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE):
'display_id': 'crossing-field',
'title': 'Crossing Field',
'track': 'Crossing Field',
'artist': 'LiSA',
'artists': ['LiSA'],
'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$',
'genres': ['Anime'],
},
'params': {'skip_download': 'm3u8'},
'skip': 'no longer exists',
}, {
'url': 'https://www.crunchyroll.com/watch/concert/MC2E2AC135',
'info_dict': {
@@ -538,7 +545,7 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE):
'display_id': 'live-is-smile-always-364joker-at-yokohama-arena',
'title': 'LiVE is Smile Always-364+JOKER- at YOKOHAMA ARENA',
'track': 'LiVE is Smile Always-364+JOKER- at YOKOHAMA ARENA',
'artist': 'LiSA',
'artists': ['LiSA'],
'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$',
'description': 'md5:747444e7e6300907b7a43f0a0503072e',
'genres': ['J-Pop'],
@@ -566,16 +573,14 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE):
if not response:
raise ExtractorError(f'No video with id {internal_id} could be found (possibly region locked?)', expected=True)
streams_link = response.get('streams_link')
if not streams_link and response.get('isPremiumOnly'):
if not self._IS_PREMIUM and response.get('isPremiumOnly'):
message = f'This {response.get("type") or "media"} is for premium members only'
if self.is_logged_in:
raise ExtractorError(message, expected=True)
self.raise_login_required(message)
result = self._transform_music_response(response)
stream_response = self._call_api(streams_link, internal_id, lang, 'stream info')
result['formats'] = self._extract_formats(stream_response, internal_id)
result['formats'], _ = self._extract_stream(f'music/{internal_id}', internal_id)
return result
@@ -587,7 +592,7 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE):
'display_id': 'slug',
'title': 'title',
'track': 'title',
'artist': ('artist', 'name'),
'artists': ('artist', 'name', all),
'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n') or None}),
'thumbnails': ('images', ..., ..., {
'url': ('source', {url_or_none}),
@@ -611,7 +616,7 @@ class CrunchyrollArtistIE(CrunchyrollBaseIE):
'info_dict': {
'id': 'MA179CB50D',
'title': 'LiSA',
'genres': ['J-Pop', 'Anime', 'Rock'],
'genres': ['Anime', 'J-Pop', 'Rock'],
'description': 'md5:16d87de61a55c3f7d6c454b73285938e',
},
'playlist_mincount': 83,

View File

@@ -65,12 +65,14 @@ class DropboxIE(InfoExtractor):
formats, subtitles, has_anonymous_download = [], {}, False
for encoded in reversed(re.findall(r'registerStreamedPrefetch\s*\(\s*"[\w/+=]+"\s*,\s*"([\w/+=]+)"', webpage)):
decoded = base64.b64decode(encoded).decode('utf-8', 'ignore')
if not has_anonymous_download:
has_anonymous_download = self._search_regex(
r'(anonymous:\tanonymous)', decoded, 'anonymous', default=False)
transcode_url = self._search_regex(
r'\n.(https://[^\x03\x08\x12\n]+\.m3u8)', decoded, 'transcode url', default=None)
if not transcode_url:
continue
formats, subtitles = self._extract_m3u8_formats_and_subtitles(transcode_url, video_id, 'mp4')
has_anonymous_download = self._search_regex(r'(anonymous:\tanonymous)', decoded, 'anonymous', default=False)
break
# if downloads are enabled we can get the original file

View File

@@ -1,5 +1,5 @@
import json
from socket import timeout
import socket
from .common import InfoExtractor
from ..utils import (
@@ -56,7 +56,7 @@ class DTubeIE(InfoExtractor):
try:
self.to_screen('%s: Checking %s video format URL' % (video_id, format_id))
self._downloader._opener.open(video_url, timeout=5).close()
except timeout:
except socket.timeout:
self.to_screen(
'%s: %s URL is invalid, skipping' % (video_id, format_id))
continue

View File

@@ -0,0 +1,54 @@
import json
from .common import InfoExtractor
from ..utils import (
extract_attributes,
float_or_none,
get_element_html_by_id,
parse_iso8601,
)
from ..utils.traversal import traverse_obj
class FathomIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?fathom\.video/share/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://fathom.video/share/G9mkjkspnohVVZ_L5nrsoPycyWcB8y7s',
'md5': '0decd5343b8f30ae268625e79a02b60f',
'info_dict': {
'id': '47200596',
'ext': 'mp4',
'title': 'eCom Inucbator - Coaching Session',
'duration': 8125.380507,
'timestamp': 1699048914,
'upload_date': '20231103',
},
}, {
'url': 'https://fathom.video/share/mEws3bybftHL2QLymxYEDeE21vtLxGVm',
'md5': '4f5cb382126c22d1aba8a939f9c49690',
'info_dict': {
'id': '46812957',
'ext': 'mp4',
'title': 'Jon, Lawrence, Neman chat about practice',
'duration': 3571.517847,
'timestamp': 1698933600,
'upload_date': '20231102',
},
}]
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
props = traverse_obj(
get_element_html_by_id('app', webpage), ({extract_attributes}, 'data-page', {json.loads}, 'props'))
video_id = str(props['call']['id'])
return {
'id': video_id,
'formats': self._extract_m3u8_formats(props['call']['video_url'], video_id, 'mp4'),
**traverse_obj(props, {
'title': ('head', 'title', {str}),
'duration': ('duration', {float_or_none}),
'timestamp': ('call', 'started_at', {parse_iso8601}),
}),
}
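
The new extractor reads everything from the Inertia.js `data-page` attribute on the `#app` element. A rough standalone illustration of that step, using a made-up HTML snippet and handling only the `&quot;` entity (yt-dlp's `extract_attributes` does full unescaping):

import json
import re

html = '<div id="app" data-page="{&quot;props&quot;:{&quot;call&quot;:{&quot;id&quot;:47200596}}}"></div>'
attr = re.search(r'data-page="([^"]*)"', html).group(1)
page = json.loads(attr.replace('&quot;', '"'))
print(page['props']['call']['id'])  # 47200596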


@@ -2104,22 +2104,6 @@ class GenericIE(InfoExtractor):
'age_limit': 0,
},
},
{
'note': 'JW Player embed with unicode-escape sequences in URL',
'url': 'https://www.medici.tv/en/concerts/lahav-shani-mozart-mahler-israel-philharmonic-abu-dhabi-classics',
'info_dict': {
'id': 'm',
'ext': 'mp4',
'title': 'Lahav Shani conducts the Israel Philharmonic\'s first-ever concert in Abu Dhabi',
'description': 'Mahler\'s ',
'uploader': 'www.medici.tv',
'age_limit': 0,
'thumbnail': r're:^https?://.+\.jpg',
},
'params': {
'skip_download': True,
},
},
{
'url': 'https://shooshtime.com/videos/284002/just-out-of-the-shower-joi/',
'md5': 'e2f0a4c329f7986280b7328e24036d60',


@@ -1,6 +1,6 @@
import base64
import binascii
import datetime
import datetime as dt
import hashlib
import hmac
import json
@@ -422,7 +422,7 @@ class AwsIdp:
months = [None, 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
time_now = datetime.datetime.now(datetime.timezone.utc)
time_now = dt.datetime.now(dt.timezone.utc)
format_string = "{} {} {} %H:%M:%S UTC %Y".format(days[time_now.weekday()], months[time_now.month], time_now.day)
time_string = time_now.strftime(format_string)
return time_string


@@ -1,89 +1,143 @@
import functools
import math
import re
from .common import InfoExtractor
from ..utils import (
InAdvancePagedList,
clean_html,
int_or_none,
js_to_json,
make_archive_id,
smuggle_url,
unsmuggle_url,
url_basename,
url_or_none,
urlencode_postdata,
urljoin,
)
from ..utils.traversal import traverse_obj
class JioSaavnBaseIE(InfoExtractor):
def _extract_initial_data(self, url, audio_id):
webpage = self._download_webpage(url, audio_id)
return self._search_json(
r'window\.__INITIAL_DATA__\s*=', webpage,
'init json', audio_id, transform_source=js_to_json)
_API_URL = 'https://www.jiosaavn.com/api.php'
_VALID_BITRATES = {'16', '32', '64', '128', '320'}
class JioSaavnSongIE(JioSaavnBaseIE):
_VALID_URL = r'https?://(?:www\.)?(?:jiosaavn\.com/song/[^/?#]+/|saavn\.com/s/song/(?:[^/?#]+/){3})(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'https://www.jiosaavn.com/song/leja-re/OQsEfQFVUXk',
'md5': '3b84396d15ed9e083c3106f1fa589c04',
'info_dict': {
'id': 'OQsEfQFVUXk',
'ext': 'mp4',
'title': 'Leja Re',
'album': 'Leja Re',
'thumbnail': 'https://c.saavncdn.com/258/Leja-Re-Hindi-2018-20181124024539-500x500.jpg',
'duration': 205,
'view_count': int,
'release_year': 2018,
},
}, {
'url': 'https://www.saavn.com/s/song/hindi/Saathiya/O-Humdum-Suniyo-Re/KAMiazoCblU',
'only_matching': True,
}]
_VALID_BITRATES = ('16', '32', '64', '128', '320')
def _real_extract(self, url):
audio_id = self._match_id(url)
extract_bitrates = self._configuration_arg('bitrate', ['128', '320'], ie_key='JioSaavn')
if invalid_bitrates := [br for br in extract_bitrates if br not in self._VALID_BITRATES]:
@functools.cached_property
def requested_bitrates(self):
requested_bitrates = self._configuration_arg('bitrate', ['128', '320'], ie_key='JioSaavn')
if invalid_bitrates := set(requested_bitrates) - self._VALID_BITRATES:
raise ValueError(
f'Invalid bitrate(s): {", ".join(invalid_bitrates)}. '
+ f'Valid bitrates are: {", ".join(self._VALID_BITRATES)}')
+ f'Valid bitrates are: {", ".join(sorted(self._VALID_BITRATES, key=int))}')
return requested_bitrates
song_data = self._extract_initial_data(url, audio_id)['song']['song']
formats = []
for bitrate in extract_bitrates:
def _extract_formats(self, song_data):
for bitrate in self.requested_bitrates:
media_data = self._download_json(
'https://www.jiosaavn.com/api.php', audio_id, f'Downloading format info for {bitrate}',
self._API_URL, song_data['id'],
f'Downloading format info for {bitrate}',
fatal=False, data=urlencode_postdata({
'__call': 'song.generateAuthToken',
'_format': 'json',
'bitrate': bitrate,
'url': song_data['encrypted_media_url'],
}))
if not media_data.get('auth_url'):
if not traverse_obj(media_data, ('auth_url', {url_or_none})):
self.report_warning(f'Unable to extract format info for {bitrate}')
continue
formats.append({
ext = media_data.get('type')
yield {
'url': media_data['auth_url'],
'ext': media_data.get('type'),
'ext': 'm4a' if ext == 'mp4' else ext,
'format_id': bitrate,
'abr': int(bitrate),
'vcodec': 'none',
}
def _extract_song(self, song_data, url=None):
info = traverse_obj(song_data, {
'id': ('id', {str}),
'title': ('song', {clean_html}),
'album': ('album', {clean_html}),
'thumbnail': ('image', {url_or_none}, {lambda x: re.sub(r'-\d+x\d+\.', '-500x500.', x)}),
'duration': ('duration', {int_or_none}),
'view_count': ('play_count', {int_or_none}),
'release_year': ('year', {int_or_none}),
'artists': ('primary_artists', {lambda x: x.split(', ') if x else None}),
'webpage_url': ('perma_url', {url_or_none}),
})
if webpage_url := info.get('webpage_url') or url:
info['display_id'] = url_basename(webpage_url)
info['_old_archive_ids'] = [make_archive_id(JioSaavnSongIE, info['display_id'])]
return info
def _call_api(self, type_, token, note='API', params={}):
return self._download_json(
self._API_URL, token, f'Downloading {note} JSON', f'Unable to download {note} JSON',
query={
'__call': 'webapi.get',
'_format': 'json',
'_marker': '0',
'ctx': 'web6dot0',
'token': token,
'type': type_,
**params,
})
return {
'id': audio_id,
'formats': formats,
**traverse_obj(song_data, {
'title': ('title', 'text'),
'album': ('album', 'text'),
'thumbnail': ('image', 0, {url_or_none}),
'duration': ('duration', {int_or_none}),
'view_count': ('play_count', {int_or_none}),
'release_year': ('year', {int_or_none}),
}),
}
def _yield_songs(self, playlist_data):
for song_data in traverse_obj(playlist_data, ('songs', lambda _, v: v['id'] and v['perma_url'])):
song_info = self._extract_song(song_data)
url = smuggle_url(song_info['webpage_url'], {
'id': song_data['id'],
'encrypted_media_url': song_data['encrypted_media_url'],
})
yield self.url_result(url, JioSaavnSongIE, url_transparent=True, **song_info)
class JioSaavnSongIE(JioSaavnBaseIE):
IE_NAME = 'jiosaavn:song'
_VALID_URL = r'https?://(?:www\.)?(?:jiosaavn\.com/song/[^/?#]+/|saavn\.com/s/song/(?:[^/?#]+/){3})(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'https://www.jiosaavn.com/song/leja-re/OQsEfQFVUXk',
'md5': '3b84396d15ed9e083c3106f1fa589c04',
'info_dict': {
'id': 'IcoLuefJ',
'display_id': 'OQsEfQFVUXk',
'ext': 'm4a',
'title': 'Leja Re',
'album': 'Leja Re',
'thumbnail': r're:https?://c.saavncdn.com/258/Leja-Re-Hindi-2018-20181124024539-500x500.jpg',
'duration': 205,
'view_count': int,
'release_year': 2018,
'artists': ['Sandesh Shandilya', 'Dhvani Bhanushali', 'Tanishk Bagchi'],
'_old_archive_ids': ['jiosaavnsong OQsEfQFVUXk'],
},
}, {
'url': 'https://www.saavn.com/s/song/hindi/Saathiya/O-Humdum-Suniyo-Re/KAMiazoCblU',
'only_matching': True,
}]
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url)
song_data = traverse_obj(smuggled_data, ({
'id': ('id', {str}),
'encrypted_media_url': ('encrypted_media_url', {str}),
}))
if 'id' in song_data and 'encrypted_media_url' in song_data:
result = {'id': song_data['id']}
else:
# only extract metadata if this is not a url_transparent result
song_data = self._call_api('song', self._match_id(url))['songs'][0]
result = self._extract_song(song_data, url)
result['formats'] = list(self._extract_formats(song_data))
return result
class JioSaavnAlbumIE(JioSaavnBaseIE):
IE_NAME = 'jiosaavn:album'
_VALID_URL = r'https?://(?:www\.)?(?:jio)?saavn\.com/album/[^/?#]+/(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'https://www.jiosaavn.com/album/96/buIOjYZDrNA_',
@@ -95,11 +149,46 @@ class JioSaavnAlbumIE(JioSaavnBaseIE):
}]
def _real_extract(self, url):
album_id = self._match_id(url)
album_view = self._extract_initial_data(url, album_id)['albumView']
display_id = self._match_id(url)
album_data = self._call_api('album', display_id)
return self.playlist_from_matches(
traverse_obj(album_view, (
'modules', lambda _, x: x['key'] == 'list', 'data', ..., 'title', 'action', {str})),
album_id, traverse_obj(album_view, ('album', 'title', 'text', {str})), ie=JioSaavnSongIE,
getter=lambda x: urljoin('https://www.jiosaavn.com/', x))
return self.playlist_result(
self._yield_songs(album_data), display_id, traverse_obj(album_data, ('title', {str})))
class JioSaavnPlaylistIE(JioSaavnBaseIE):
IE_NAME = 'jiosaavn:playlist'
_VALID_URL = r'https?://(?:www\.)?(?:jio)?saavn\.com/s/playlist/(?:[^/?#]+/){2}(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'https://www.jiosaavn.com/s/playlist/2279fbe391defa793ad7076929a2f5c9/mood-english/LlJ8ZWT1ibN5084vKHRj2Q__',
'info_dict': {
'id': 'LlJ8ZWT1ibN5084vKHRj2Q__',
'title': 'Mood English',
},
'playlist_mincount': 301,
}, {
'url': 'https://www.jiosaavn.com/s/playlist/2279fbe391defa793ad7076929a2f5c9/mood-hindi/DVR,pFUOwyXqIp77B1JF,A__',
'info_dict': {
'id': 'DVR,pFUOwyXqIp77B1JF,A__',
'title': 'Mood Hindi',
},
'playlist_mincount': 801,
}]
_PAGE_SIZE = 50
def _fetch_page(self, token, page):
return self._call_api(
'playlist', token, f'playlist page {page}', {'p': page, 'n': self._PAGE_SIZE})
def _entries(self, token, first_page_data, page):
page_data = first_page_data if not page else self._fetch_page(token, page + 1)
yield from self._yield_songs(page_data)
def _real_extract(self, url):
display_id = self._match_id(url)
playlist_data = self._fetch_page(display_id, 1)
total_pages = math.ceil(int(playlist_data['list_count']) / self._PAGE_SIZE)
return self.playlist_result(InAdvancePagedList(
functools.partial(self._entries, display_id, playlist_data),
total_pages, self._PAGE_SIZE), display_id, traverse_obj(playlist_data, ('listname', {str})))
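
The playlist class pages through the API in fixed-size chunks and reuses the first page it already downloaded. The paging arithmetic in isolation (counts are illustrative):

import math

list_count, page_size = 301, 50
total_pages = math.ceil(list_count / page_size)  # 7
# InAdvancePagedList then asks for pages 0..6; page 0 is served from the data
# already fetched to learn list_count, so no request is issued twice.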


@@ -1,4 +1,4 @@
import datetime
import datetime as dt
import urllib.parse
from .common import InfoExtractor
@@ -50,8 +50,8 @@ class JoqrAgIE(InfoExtractor):
def _extract_start_timestamp(self, video_id, is_live):
def extract_start_time_from(date_str):
dt = datetime_from_str(date_str) + datetime.timedelta(hours=9)
date = dt.strftime('%Y%m%d')
dt_ = datetime_from_str(date_str) + dt.timedelta(hours=9)
date = dt_.strftime('%Y%m%d')
start_time = self._search_regex(
r'<h3[^>]+\bclass="dailyProgram-itemHeaderTime"[^>]*>[\s\d:]+\s*(\d{1,2}:\d{1,2})',
self._download_webpage(
@@ -60,7 +60,7 @@ class JoqrAgIE(InfoExtractor):
errnote=f'Failed to download program list of {date}') or '',
'start time', default=None)
if start_time:
return unified_timestamp(f'{dt.strftime("%Y/%m/%d")} {start_time} +09:00')
return unified_timestamp(f'{dt_.strftime("%Y/%m/%d")} {start_time} +09:00')
return None
start_timestamp = extract_start_time_from('today')
@@ -80,14 +80,14 @@ class JoqrAgIE(InfoExtractor):
note='Downloading metadata', errnote='Failed to download metadata')
title = self._extract_metadata('Program_name', metadata)
if title == '放送休止':
if not title or title == '放送休止':
formats = []
live_status = 'is_upcoming'
release_timestamp = self._extract_start_timestamp(video_id, False)
msg = 'This stream is not currently live'
if release_timestamp:
msg += (' and will start at '
+ datetime.datetime.fromtimestamp(release_timestamp).strftime('%Y-%m-%d %H:%M:%S'))
+ dt.datetime.fromtimestamp(release_timestamp).strftime('%Y-%m-%d %H:%M:%S'))
self.raise_no_formats(msg, expected=True)
else:
m3u8_path = self._search_regex(
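
The rename from `dt` to `dt_` is not cosmetic: with the module now imported as `dt`, assigning to a local `dt` inside the function would shadow the alias and fail with UnboundLocalError before the assignment completes. Minimal reproduction:

import datetime as dt

def broken():
    dt = dt.datetime.now()  # UnboundLocalError: 'dt' is local here, shadowing the alias
    return dt

def fixed():
    dt_ = dt.datetime.now() + dt.timedelta(hours=9)
    return dt_.strftime('%Y%m%d')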


@@ -13,7 +13,8 @@ from ..utils import (
class KickBaseIE(InfoExtractor):
def _real_initialize(self):
self._request_webpage(HEADRequest('https://kick.com/'), None, 'Setting up session', fatal=False)
self._request_webpage(
HEADRequest('https://kick.com/'), None, 'Setting up session', fatal=False, impersonate=True)
xsrf_token = self._get_cookies('https://kick.com/').get('XSRF-TOKEN')
if not xsrf_token:
self.write_debug('kick.com did not set XSRF-TOKEN cookie')
@@ -25,7 +26,7 @@ class KickBaseIE(InfoExtractor):
def _call_api(self, path, display_id, note='Downloading API JSON', headers={}, **kwargs):
return self._download_json(
f'https://kick.com/api/v1/{path}', display_id, note=note,
headers=merge_dicts(headers, self._API_HEADERS), **kwargs)
headers=merge_dicts(headers, self._API_HEADERS), impersonate=True, **kwargs)
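
# Passing impersonate=True asks yt-dlp's networking layer to route the request
# through a browser-impersonating backend (curl_cffi, when installed) so that
# Kick's bot protection accepts it. A rough standalone equivalent, assuming
# curl_cffi is available (valid target names vary by version):
from curl_cffi import requests as cffi_requests

resp = cffi_requests.get('https://kick.com/', impersonate='chrome110')
print(resp.status_code)
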
class KickIE(KickBaseIE):
@@ -82,26 +83,27 @@ class KickIE(KickBaseIE):
class KickVODIE(KickBaseIE):
_VALID_URL = r'https?://(?:www\.)?kick\.com/video/(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})'
_TESTS = [{
'url': 'https://kick.com/video/54244b5e-050a-4df4-a013-b2433dafbe35',
'md5': '73691206a6a49db25c5aa1588e6538fc',
'url': 'https://kick.com/video/58bac65b-e641-4476-a7ba-3707a35e60e3',
'md5': '3870f94153e40e7121a6e46c068b70cb',
'info_dict': {
'id': '54244b5e-050a-4df4-a013-b2433dafbe35',
'id': '58bac65b-e641-4476-a7ba-3707a35e60e3',
'ext': 'mp4',
'title': 'Making 710-carBoosting. Kinda No Pixel inspired. !guilded - !links',
'description': 'md5:a0d3546bf7955d0a8252ffe0fd6f518f',
'channel': 'kmack710',
'channel_id': '16278',
'uploader': 'Kmack710',
'uploader_id': '16412',
'upload_date': '20221206',
'timestamp': 1670318289,
'duration': 40104.0,
'title': '🤠REBIRTH IS BACK!!!!🤠!stake CODE JAREDFPS 🤠',
'description': 'md5:02b0c46f9b4197fb545ab09dddb85b1d',
'channel': 'jaredfps',
'channel_id': '26608',
'uploader': 'JaredFPS',
'uploader_id': '26799',
'upload_date': '20240402',
'timestamp': 1712097108,
'duration': 33859.0,
'thumbnail': r're:^https?://.*\.jpg',
'categories': ['Grand Theft Auto V'],
'categories': ['Call of Duty: Warzone'],
},
'params': {
'skip_download': 'm3u8',
},
'expected_warnings': [r'impersonation'],
}]
def _real_extract(self, url):


@@ -1,4 +1,4 @@
import datetime
import datetime as dt
import hashlib
import re
import time
@@ -185,7 +185,7 @@ class LeIE(InfoExtractor):
publish_time = parse_iso8601(self._html_search_regex(
r'发布时间&nbsp;([^<>]+) ', page, 'publish time', default=None),
delimiter=' ', timezone=datetime.timedelta(hours=8))
delimiter=' ', timezone=dt.timedelta(hours=8))
description = self._html_search_meta('description', page, fatal=False)
return {


@@ -1,4 +1,4 @@
from itertools import zip_longest
import itertools
import re
from .common import InfoExtractor
@@ -156,7 +156,7 @@ class LinkedInLearningIE(LinkedInLearningBaseIE):
def json2srt(self, transcript_lines, duration=None):
srt_data = ''
for line, (line_dict, next_dict) in enumerate(zip_longest(transcript_lines, transcript_lines[1:])):
for line, (line_dict, next_dict) in enumerate(itertools.zip_longest(transcript_lines, transcript_lines[1:])):
start_time, caption = line_dict['transcriptStartAt'] / 1000, line_dict['caption']
end_time = next_dict['transcriptStartAt'] / 1000 if next_dict else duration or start_time + 1
srt_data += '%d\n%s --> %s\n%s\n\n' % (line + 1, srt_subtitles_timecode(start_time),

yt_dlp/extractor/loom.py (new file, 461 lines)

@@ -0,0 +1,461 @@
import json
import textwrap
import urllib.parse
import uuid
from .common import InfoExtractor
from ..utils import (
ExtractorError,
determine_ext,
filter_dict,
get_first,
int_or_none,
parse_iso8601,
update_url,
url_or_none,
variadic,
)
from ..utils.traversal import traverse_obj
class LoomIE(InfoExtractor):
IE_NAME = 'loom'
_VALID_URL = r'https?://(?:www\.)?loom\.com/(?:share|embed)/(?P<id>[\da-f]{32})'
_EMBED_REGEX = [rf'<iframe[^>]+\bsrc=["\'](?P<url>{_VALID_URL})']
_TESTS = [{
# m3u8 raw-url, mp4 transcoded-url, cdn url == raw-url, json subs only
'url': 'https://www.loom.com/share/43d05f362f734614a2e81b4694a3a523',
'md5': 'bfc2d7e9c2e0eb4813212230794b6f42',
'info_dict': {
'id': '43d05f362f734614a2e81b4694a3a523',
'ext': 'mp4',
'title': 'A Ruler for Windows - 28 March 2022',
'uploader': 'wILLIAM PIP',
'upload_date': '20220328',
'timestamp': 1648454238,
'duration': 27,
},
}, {
# webm raw-url, mp4 transcoded-url, cdn url == transcoded-url, no subs
'url': 'https://www.loom.com/share/c43a642f815f4378b6f80a889bb73d8d',
'md5': '70f529317be8cf880fcc2c649a531900',
'info_dict': {
'id': 'c43a642f815f4378b6f80a889bb73d8d',
'ext': 'webm',
'title': 'Lilah Nielsen Intro Video',
'uploader': 'Lilah Nielsen',
'upload_date': '20200826',
'timestamp': 1598480716,
'duration': 20,
},
}, {
# m3u8 raw-url, mp4 transcoded-url, cdn url == raw-url, vtt sub and json subs
'url': 'https://www.loom.com/share/9458bcbf79784162aa62ffb8dd66201b',
'md5': '51737ec002969dd28344db4d60b9cbbb',
'info_dict': {
'id': '9458bcbf79784162aa62ffb8dd66201b',
'ext': 'mp4',
'title': 'Sharing screen with gpt-4',
'description': 'Sharing screen with GPT 4 vision model and asking questions to guide through blender.',
'uploader': 'Suneel Matham',
'chapters': 'count:3',
'upload_date': '20231109',
'timestamp': 1699518978,
'duration': 93,
},
}, {
# mpd raw-url, mp4 transcoded-url, cdn url == raw-url, no subs
'url': 'https://www.loom.com/share/24351eb8b317420289b158e4b7e96ff2',
'info_dict': {
'id': '24351eb8b317420289b158e4b7e96ff2',
'ext': 'webm',
'title': 'OMFG clown',
'description': 'md5:285c5ee9d62aa087b7e3271b08796815',
'uploader': 'MrPumkin B',
'upload_date': '20210924',
'timestamp': 1632519618,
'duration': 210,
},
'params': {'skip_download': 'dash'},
}, {
# password-protected
'url': 'https://www.loom.com/share/50e26e8aeb7940189dff5630f95ce1f4',
'md5': '5cc7655e7d55d281d203f8ffd14771f7',
'info_dict': {
'id': '50e26e8aeb7940189dff5630f95ce1f4',
'ext': 'mp4',
'title': 'iOS Mobile Upload',
'uploader': 'Simon Curran',
'upload_date': '20200520',
'timestamp': 1590000123,
'duration': 35,
},
'params': {'videopassword': 'seniorinfants2'},
}, {
# embed, transcoded-url endpoint sends empty JSON response
'url': 'https://www.loom.com/embed/ddcf1c1ad21f451ea7468b1e33917e4e',
'md5': '8488817242a0db1cb2ad0ea522553cf6',
'info_dict': {
'id': 'ddcf1c1ad21f451ea7468b1e33917e4e',
'ext': 'mp4',
'title': 'CF Reset User\'s Password',
'uploader': 'Aimee Heintz',
'upload_date': '20220707',
'timestamp': 1657216459,
'duration': 181,
},
'expected_warnings': ['Failed to parse JSON'],
}]
_WEBPAGE_TESTS = [{
'url': 'https://www.loom.com/community/e1229802a8694a09909e8ba0fbb6d073-pg',
'md5': 'ec838cd01b576cf0386f32e1ae424609',
'info_dict': {
'id': 'e1229802a8694a09909e8ba0fbb6d073',
'ext': 'mp4',
'title': 'Rexie Jane Cimafranca - Founder\'s Presentation',
'uploader': 'Rexie Cimafranca',
'upload_date': '20230213',
'duration': 247,
'timestamp': 1676274030,
},
}]
_GRAPHQL_VARIABLES = {
'GetVideoSource': {
'acceptableMimes': ['DASH', 'M3U8', 'MP4'],
},
}
_GRAPHQL_QUERIES = {
'GetVideoSSR': textwrap.dedent('''\
query GetVideoSSR($videoId: ID!, $password: String) {
getVideo(id: $videoId, password: $password) {
__typename
... on PrivateVideo {
id
status
message
__typename
}
... on VideoPasswordMissingOrIncorrect {
id
message
__typename
}
... on RegularUserVideo {
id
__typename
createdAt
description
download_enabled
folder_id
is_protected
needs_password
owner {
display_name
__typename
}
privacy
s3_id
name
video_properties {
avgBitRate
client
camera_enabled
client_version
duration
durationMs
format
height
microphone_enabled
os
os_version
recordingClient
recording_type
recording_version
screen_type
tab_audio
trim_duration
width
__typename
}
playable_duration
source_duration
visibility
}
}
}\n'''),
'GetVideoSource': textwrap.dedent('''\
query GetVideoSource($videoId: ID!, $password: String, $acceptableMimes: [CloudfrontVideoAcceptableMime]) {
getVideo(id: $videoId, password: $password) {
... on RegularUserVideo {
id
nullableRawCdnUrl(acceptableMimes: $acceptableMimes, password: $password) {
url
__typename
}
__typename
}
__typename
}
}\n'''),
'FetchVideoTranscript': textwrap.dedent('''\
query FetchVideoTranscript($videoId: ID!, $password: String) {
fetchVideoTranscript(videoId: $videoId, password: $password) {
... on VideoTranscriptDetails {
id
video_id
source_url
captions_source_url
__typename
}
... on GenericError {
message
__typename
}
__typename
}
}\n'''),
'FetchChapters': textwrap.dedent('''\
query FetchChapters($videoId: ID!, $password: String) {
fetchVideoChapters(videoId: $videoId, password: $password) {
... on VideoChapters {
video_id
content
__typename
}
... on EmptyChaptersPayload {
content
__typename
}
... on InvalidRequestWarning {
message
__typename
}
... on Error {
message
__typename
}
__typename
}
}\n'''),
}
_APOLLO_GRAPHQL_VERSION = '0a1856c'
def _call_graphql_api(self, operations, video_id, note=None, errnote=None):
password = self.get_param('videopassword')
return self._download_json(
'https://www.loom.com/graphql', video_id, note or 'Downloading GraphQL JSON',
errnote or 'Failed to download GraphQL JSON', headers={
'Accept': 'application/json',
'Content-Type': 'application/json',
'x-loom-request-source': f'loom_web_{self._APOLLO_GRAPHQL_VERSION}',
'apollographql-client-name': 'web',
'apollographql-client-version': self._APOLLO_GRAPHQL_VERSION,
}, data=json.dumps([{
'operationName': operation_name,
'variables': {
'videoId': video_id,
'password': password,
**self._GRAPHQL_VARIABLES.get(operation_name, {}),
},
'query': self._GRAPHQL_QUERIES[operation_name],
} for operation_name in variadic(operations)], separators=(',', ':')).encode())
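
# The method above batches several GraphQL operations into one POST: the body
# is a JSON array of {operationName, variables, query} objects, and the server
# replies with an array of results in the same order. A bare sketch, where the
# endpoint and the query text are placeholders rather than Loom's real values:
import json
import urllib.request

ops = [{
    'operationName': 'FetchChapters',
    'variables': {'videoId': 'xyz', 'password': None},
    'query': 'query FetchChapters($videoId: ID!, $password: String) { ... }',
}]
req = urllib.request.Request(
    'https://example.com/graphql',
    data=json.dumps(ops, separators=(',', ':')).encode(),
    headers={'Content-Type': 'application/json'})
# urllib.request.urlopen(req) would yield one result object per operation
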
def _call_url_api(self, endpoint, video_id):
response = self._download_json(
f'https://www.loom.com/api/campaigns/sessions/{video_id}/{endpoint}', video_id,
f'Downloading {endpoint} JSON', f'Failed to download {endpoint} JSON', fatal=False,
headers={'Accept': 'application/json', 'Content-Type': 'application/json'},
data=json.dumps({
'anonID': str(uuid.uuid4()),
'deviceID': None,
'force_original': False, # HTTP error 401 if True
'password': self.get_param('videopassword'),
}, separators=(',', ':')).encode())
return traverse_obj(response, ('url', {url_or_none}))
def _extract_formats(self, video_id, metadata, gql_data):
formats = []
video_properties = traverse_obj(metadata, ('video_properties', {
'width': ('width', {int_or_none}),
'height': ('height', {int_or_none}),
'acodec': ('microphone_enabled', {lambda x: 'none' if x is False else None}),
}))
def get_formats(format_url, format_id, quality):
if not format_url:
return
ext = determine_ext(format_url)
query = urllib.parse.urlparse(format_url).query
if ext == 'm3u8':
# Extract pre-merged HLS formats to avoid buggy parsing of metadata in split playlists
format_url = format_url.replace('-split.m3u8', '.m3u8')
m3u8_formats = self._extract_m3u8_formats(
format_url, video_id, 'mp4', m3u8_id=f'hls-{format_id}', fatal=False, quality=quality)
for fmt in m3u8_formats:
yield {
**fmt,
'url': update_url(fmt['url'], query=query),
'extra_param_to_segment_url': query,
}
elif ext == 'mpd':
dash_formats = self._extract_mpd_formats(
format_url, video_id, mpd_id=f'dash-{format_id}', fatal=False)
for fmt in dash_formats:
yield {
**fmt,
'extra_param_to_segment_url': query,
'quality': quality,
}
else:
yield {
'url': format_url,
'ext': ext,
'format_id': f'http-{format_id}',
'quality': quality,
**video_properties,
}
raw_url = self._call_url_api('raw-url', video_id)
formats.extend(get_formats(raw_url, 'raw', quality=1)) # original quality
transcoded_url = self._call_url_api('transcoded-url', video_id)
formats.extend(get_formats(transcoded_url, 'transcoded', quality=-1)) # transcoded quality
cdn_url = get_first(gql_data, ('data', 'getVideo', 'nullableRawCdnUrl', 'url', {url_or_none}))
# cdn_url is usually a dupe, but the raw-url/transcoded-url endpoints could return errors
valid_urls = [update_url(url, query=None) for url in (raw_url, transcoded_url) if url]
if cdn_url and update_url(cdn_url, query=None) not in valid_urls:
formats.extend(get_formats(cdn_url, 'cdn', quality=0)) # could be original or transcoded
return formats
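
# Two details above are easy to miss: '-split.m3u8' is rewritten to plain
# '.m3u8' to get a pre-merged playlist, and the query string of the signed
# playlist URL is re-attached to every fragment via extra_param_to_segment_url,
# presumably because the CDN expects the signed parameters on fragment
# requests as well. The query propagation in isolation (URLs are made up):
import urllib.parse

playlist_url = 'https://cdn.example.com/video-split.m3u8?Policy=abc&Signature=xyz'
merged_url = playlist_url.replace('-split.m3u8', '.m3u8')
query = urllib.parse.urlparse(merged_url).query
fragment = 'https://cdn.example.com/segment-00001.ts'
signed_fragment = f'{fragment}?{query}'  # what extra_param_to_segment_url produces
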
def _real_extract(self, url):
video_id = self._match_id(url)
metadata = get_first(
self._call_graphql_api('GetVideoSSR', video_id, 'Downloading GraphQL metadata JSON'),
('data', 'getVideo', {dict})) or {}
if metadata.get('__typename') == 'VideoPasswordMissingOrIncorrect':
if not self.get_param('videopassword'):
raise ExtractorError(
'This video is password-protected, use the --video-password option', expected=True)
raise ExtractorError('Invalid video password', expected=True)
gql_data = self._call_graphql_api(['FetchChapters', 'FetchVideoTranscript', 'GetVideoSource'], video_id)
duration = traverse_obj(metadata, ('video_properties', 'duration', {int_or_none}))
return {
'id': video_id,
'duration': duration,
'chapters': self._extract_chapters_from_description(
get_first(gql_data, ('data', 'fetchVideoChapters', 'content', {str})), duration) or None,
'formats': self._extract_formats(video_id, metadata, gql_data),
'subtitles': filter_dict({
'en': traverse_obj(gql_data, (
..., 'data', 'fetchVideoTranscript',
('source_url', 'captions_source_url'), {
'url': {url_or_none},
})) or None,
}),
**traverse_obj(metadata, {
'title': ('name', {str}),
'description': ('description', {str}),
'uploader': ('owner', 'display_name', {str}),
'timestamp': ('createdAt', {parse_iso8601}),
}),
}
class LoomFolderIE(InfoExtractor):
IE_NAME = 'loom:folder'
_VALID_URL = r'https?://(?:www\.)?loom\.com/share/folder/(?P<id>[\da-f]{32})'
_TESTS = [{
# 2 subfolders, no videos in root
'url': 'https://www.loom.com/share/folder/997db4db046f43e5912f10dc5f817b5c',
'playlist_mincount': 16,
'info_dict': {
'id': '997db4db046f43e5912f10dc5f817b5c',
'title': 'Blending Lessons',
},
}, {
# only videos, no subfolders
'url': 'https://www.loom.com/share/folder/9a8a87f6b6f546d9a400c8e7575ff7f2',
'playlist_mincount': 12,
'info_dict': {
'id': '9a8a87f6b6f546d9a400c8e7575ff7f2',
'title': 'List A- a, i, o',
},
}, {
# videos in root and empty subfolder
'url': 'https://www.loom.com/share/folder/886e534218c24fd292e97e9563078cc4',
'playlist_mincount': 21,
'info_dict': {
'id': '886e534218c24fd292e97e9563078cc4',
'title': 'Medicare Agent Training videos',
},
}, {
# videos in root and videos in subfolders
'url': 'https://www.loom.com/share/folder/b72c4ecdf04745da9403926d80a40c38',
'playlist_mincount': 21,
'info_dict': {
'id': 'b72c4ecdf04745da9403926d80a40c38',
'title': 'Quick Altos Q & A Tutorials',
},
}, {
# recursive folder extraction
'url': 'https://www.loom.com/share/folder/8b458a94e0e4449b8df9ea7a68fafc4e',
'playlist_count': 23,
'info_dict': {
'id': '8b458a94e0e4449b8df9ea7a68fafc4e',
'title': 'Sezer Texting Guide',
},
}, {
# more than 50 videos in 1 folder
'url': 'https://www.loom.com/share/folder/e056a91d290d47ca9b00c9d1df56c463',
'playlist_mincount': 61,
'info_dict': {
'id': 'e056a91d290d47ca9b00c9d1df56c463',
'title': 'User Videos',
},
}, {
# many subfolders
'url': 'https://www.loom.com/share/folder/c2dde8cc67454f0e99031677279d8954',
'playlist_mincount': 75,
'info_dict': {
'id': 'c2dde8cc67454f0e99031677279d8954',
'title': 'Honors 1',
},
}, {
'url': 'https://www.loom.com/share/folder/bae17109a68146c7803454f2893c8cf8/Edpuzzle',
'only_matching': True,
}]
def _extract_folder_data(self, folder_id):
return self._download_json(
f'https://www.loom.com/v1/folders/{folder_id}', folder_id,
'Downloading folder info JSON', query={'limit': '10000'})
def _extract_folder_entries(self, folder_id, initial_folder_data=None):
folder_data = initial_folder_data or self._extract_folder_data(folder_id)
for video in traverse_obj(folder_data, ('videos', lambda _, v: v['id'])):
video_id = video['id']
yield self.url_result(
f'https://www.loom.com/share/{video_id}', LoomIE, video_id, video.get('name'))
# Recurse into subfolders
for subfolder_id in traverse_obj(folder_data, (
'folders', lambda _, v: v['id'] != folder_id, 'id', {str})):
yield from self._extract_folder_entries(subfolder_id)
def _real_extract(self, url):
playlist_id = self._match_id(url)
playlist_data = self._extract_folder_data(playlist_id)
return self.playlist_result(
self._extract_folder_entries(playlist_id, playlist_data), playlist_id,
traverse_obj(playlist_data, ('folder', 'name', {str.strip})))


@@ -1,4 +1,3 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
traverse_obj,


@@ -1,67 +1,153 @@
import urllib.parse
from .common import InfoExtractor
from ..utils import (
unified_strdate,
update_url_query,
urlencode_postdata,
filter_dict,
parse_iso8601,
traverse_obj,
try_call,
url_or_none,
)
class MediciIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?medici\.tv/#!/(?P<id>[^?#&]+)'
_TEST = {
'url': 'http://www.medici.tv/#!/daniel-harding-frans-helmerson-verbier-festival-music-camp',
'md5': '004c21bb0a57248085b6ff3fec72719d',
_VALID_URL = r'https?://(?:(?P<sub>www|edu)\.)?medici\.tv/[a-z]{2}/[\w.-]+/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://www.medici.tv/en/operas/thomas-ades-the-exterminating-angel-calixto-bieito-opera-bastille-paris',
'md5': 'd483f74e7a7a9eac0dbe152ab189050d',
'info_dict': {
'id': '3059',
'ext': 'flv',
'title': 'Daniel Harding conducts the Verbier Festival Music Camp \u2013 With Frans Helmerson',
'description': 'md5:322a1e952bafb725174fd8c1a8212f58',
'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20170408',
'id': '8032',
'ext': 'mp4',
'title': 'Thomas Adès\'s The Exterminating Angel',
'description': 'md5:708ae6350dadc604225b4a6e32482bab',
'thumbnail': r're:https://.+/.+\.jpg',
'upload_date': '20240304',
'timestamp': 1709561766,
'display_id': 'thomas-ades-the-exterminating-angel-calixto-bieito-opera-bastille-paris',
},
}
'expected_warnings': [r'preview'],
}, {
'url': 'https://edu.medici.tv/en/operas/wagner-lohengrin-paris-opera-kirill-serebrennikov-piotr-beczala-kwangchul-youn-johanni-van-oostrum',
'md5': '4ef3f4079a6e1c617584463a9eb84f99',
'info_dict': {
'id': '7900',
'ext': 'mp4',
'title': 'Wagner\'s Lohengrin',
'description': 'md5:a384a62937866101f86902f21752cd89',
'thumbnail': r're:https://.+/.+\.jpg',
'upload_date': '20231017',
'timestamp': 1697554771,
'display_id': 'wagner-lohengrin-paris-opera-kirill-serebrennikov-piotr-beczala-kwangchul-youn-johanni-van-oostrum',
},
'expected_warnings': [r'preview'],
}, {
'url': 'https://www.medici.tv/en/concerts/sergey-smbatyan-conducts-mansurian-chouchane-siranossian-mario-brunello',
'md5': '9dd757e53b22b2511e85ea9ea60e4815',
'info_dict': {
'id': '5712',
'ext': 'mp4',
'title': 'Sergey Smbatyan conducts Tigran Mansurian — With Chouchane Siranossian and Mario Brunello',
'thumbnail': r're:https://.+/.+\.jpg',
'description': 'md5:9411fe44c874bb10e9af288c65816e41',
'upload_date': '20200323',
'timestamp': 1584975600,
'display_id': 'sergey-smbatyan-conducts-mansurian-chouchane-siranossian-mario-brunello',
},
'expected_warnings': [r'preview'],
}, {
'url': 'https://www.medici.tv/en/ballets/carmen-ballet-choregraphie-de-jiri-bubenicek-teatro-dellopera-di-roma',
'md5': '40f5e76cb701a97a6d7ba23b62c49990',
'info_dict': {
'id': '7857',
'ext': 'mp4',
'title': 'Carmen by Jiří Bubeníček after Roland Petit, music by Bizet, de Falla, Castelnuovo-Tedesco, and Bonolis',
'thumbnail': r're:https://.+/.+\.jpg',
'description': 'md5:0f15a15611ed748020c769873e10a8bb',
'upload_date': '20240223',
'timestamp': 1708707600,
'display_id': 'carmen-ballet-choregraphie-de-jiri-bubenicek-teatro-dellopera-di-roma',
},
'expected_warnings': [r'preview'],
}, {
'url': 'https://www.medici.tv/en/documentaries/la-sonnambula-liege-2023-documentaire',
'md5': '87ff198018ce79a34757ab0dd6f21080',
'info_dict': {
'id': '7513',
'ext': 'mp4',
'title': 'La Sonnambula',
'thumbnail': r're:https://.+/.+\.jpg',
'description': 'md5:0caf9109a860fd50cd018df062a67f34',
'upload_date': '20231103',
'timestamp': 1699010830,
'display_id': 'la-sonnambula-liege-2023-documentaire',
},
'expected_warnings': [r'preview'],
}, {
'url': 'https://edu.medici.tv/en/masterclasses/yvonne-loriod-olivier-messiaen',
'md5': 'fb5dcec46d76ad20fbdbaabb01da191d',
'info_dict': {
'id': '3024',
'ext': 'mp4',
'title': 'Olivier Messiaen and Yvonne Loriod, pianists and teachers',
'thumbnail': r're:https://.+/.+\.jpg',
'description': 'md5:aab948e2f7690214b5c28896c83f1fc1',
'upload_date': '20150223',
'timestamp': 1424706608,
'display_id': 'yvonne-loriod-olivier-messiaen',
},
'skip': 'Requires authentication; preview starts in the middle',
}, {
'url': 'https://www.medici.tv/en/jazz/makaya-mccraven-la-rochelle',
'md5': '4cc279a8b06609782747c8f50beea2b3',
'info_dict': {
'id': '7922',
'ext': 'mp4',
'title': 'NEW: Makaya McCraven in La Rochelle',
'thumbnail': r're:https://.+/.+\.jpg',
'description': 'md5:b5a8aaeb6993d8ccb18bde8abb8aa8d2',
'upload_date': '20231228',
'timestamp': 1703754863,
'display_id': 'makaya-mccraven-la-rochelle',
},
'expected_warnings': [r'preview'],
}]
def _real_extract(self, url):
video_id = self._match_id(url)
display_id, subdomain = self._match_valid_url(url).group('id', 'sub')
self._request_webpage(url, display_id, 'Requesting CSRF token cookie')
# Sets csrftoken cookie
self._download_webpage(url, video_id)
MEDICI_URL = 'http://www.medici.tv/'
subdomain = 'edu-' if subdomain == 'edu' else ''
origin = f'https://{urllib.parse.urlparse(url).hostname}'
data = self._download_json(
MEDICI_URL, video_id,
data=urlencode_postdata({
'json': 'true',
'page': '/%s' % video_id,
'timezone_offset': -420,
}), headers={
'X-CSRFToken': self._get_cookies(url)['csrftoken'].value,
'X-Requested-With': 'XMLHttpRequest',
'Referer': MEDICI_URL,
'Content-Type': 'application/x-www-form-urlencoded',
})
f'https://api.medici.tv/{subdomain}satie/edito/movie-file/{display_id}/', display_id,
headers=filter_dict({
'Authorization': try_call(
lambda: urllib.parse.unquote(self._get_cookies(url)['auth._token.mAuth'].value)),
'Device-Type': 'web',
'Origin': origin,
'Referer': f'{origin}/',
'Accept': 'application/json, text/plain, */*',
}))
video = data['video']['videos']['video1']
if not traverse_obj(data, ('video', 'is_full_video')) and traverse_obj(
data, ('video', 'is_limited_by_user_access')):
self.report_warning(
'The full video is for subscribers only. Only previews will be downloaded. If you '
'have used the --cookies-from-browser option, try using the --cookies option instead')
title = video.get('nom') or data['title']
video_id = video.get('id') or video_id
formats = self._extract_f4m_formats(
update_url_query(video['url_akamai'], {
'hdcore': '3.1.0',
'plugin=aasp': '3.1.0.43.124',
}), video_id, f4m_id='hds')
description = data.get('meta_description')
thumbnail = video.get('url_thumbnail') or data.get('main_image')
upload_date = unified_strdate(data['video'].get('date'))
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
data['video']['video_url'], display_id, 'mp4')
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'upload_date': upload_date,
'id': str(data['id']),
'display_id': display_id,
'formats': formats,
'subtitles': subtitles,
**traverse_obj(data, {
'title': ('title', {str}),
'description': ('subtitle', {str}),
'thumbnail': ('picture', {url_or_none}),
'timestamp': ('date_publish', {parse_iso8601}),
}),
}
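
The rewritten extractor authenticates by replaying the site's own session cookie as a bearer header: what looks like a Nuxt auth-module cookie, `auth._token.mAuth`, holds a URL-encoded token that must be unquoted before use. In isolation (the cookie value is made up):

import urllib.parse

cookie_value = 'Bearer%20eyJhbGciOiJIUzI1NiJ9.example'  # hypothetical payload
headers = {
    'Authorization': urllib.parse.unquote(cookie_value),  # 'Bearer eyJhbGci...'
    'Device-Type': 'web',
}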


@@ -1,4 +1,4 @@
from base64 import b64decode
import base64
from .common import InfoExtractor
from ..utils import (
@@ -81,7 +81,7 @@ class MicrosoftStreamIE(InfoExtractor):
'url': thumbnail_url,
}
thumb_name = url_basename(thumbnail_url)
thumb_name = str(b64decode(thumb_name + '=' * (-len(thumb_name) % 4)))
thumb_name = str(base64.b64decode(thumb_name + '=' * (-len(thumb_name) % 4)))
thumb.update(parse_resolution(thumb_name))
thumbnails.append(thumb)
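
The padding expression above is a common idiom: base64 strings that arrive with their `=` padding stripped must be restored to a multiple of 4 characters before decoding. Standalone:

import base64

s = 'aGVsbG8'                      # 'hello' with the trailing '=' stripped
padded = s + '=' * (-len(s) % 4)   # -7 % 4 == 1, so one '=' is appended
print(base64.b64decode(padded))    # b'hello'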


@@ -1,5 +1,7 @@
from .common import InfoExtractor
from ..utils import UserNotLive, traverse_obj
from ..networking.exceptions import HTTPError
from ..utils import ExtractorError, UserNotLive, int_or_none, url_or_none
from ..utils.traversal import traverse_obj
class MixchIE(InfoExtractor):
@@ -25,25 +27,23 @@ class MixchIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(f'https://mixch.tv/u/{video_id}/live', video_id)
initial_js_state = self._parse_json(self._search_regex(
r'(?m)^\s*window\.__INITIAL_JS_STATE__\s*=\s*(\{.+?\});\s*$', webpage, 'initial JS state'), video_id)
if not initial_js_state.get('liveInfo'):
data = self._download_json(f'https://mixch.tv/api-web/users/{video_id}/live', video_id)
if not traverse_obj(data, ('liveInfo', {dict})):
raise UserNotLive(video_id=video_id)
return {
'id': video_id,
'title': traverse_obj(initial_js_state, ('liveInfo', 'title')),
'comment_count': traverse_obj(initial_js_state, ('liveInfo', 'comments')),
'view_count': traverse_obj(initial_js_state, ('liveInfo', 'visitor')),
'timestamp': traverse_obj(initial_js_state, ('liveInfo', 'created')),
'uploader': traverse_obj(initial_js_state, ('broadcasterInfo', 'name')),
'uploader_id': video_id,
**traverse_obj(data, {
'title': ('liveInfo', 'title', {str}),
'comment_count': ('liveInfo', 'comments', {int_or_none}),
'view_count': ('liveInfo', 'visitor', {int_or_none}),
'timestamp': ('liveInfo', 'created', {int_or_none}),
'uploader': ('broadcasterInfo', 'name', {str}),
}),
'formats': [{
'format_id': 'hls',
'url': (traverse_obj(initial_js_state, ('liveInfo', 'hls'))
or f'https://d1hd0ww6piyb43.cloudfront.net/hls/torte_{video_id}.m3u8'),
'url': data['liveInfo']['hls'],
'ext': 'mp4',
'protocol': 'm3u8',
}],
@@ -60,22 +60,38 @@ class MixchArchiveIE(InfoExtractor):
'skip': 'paid video, no DRM. expires at Jan 23',
'info_dict': {
'id': '421',
'ext': 'mp4',
'title': '96NEKO SHOW TIME',
}
}, {
'url': 'https://mixch.tv/archive/1213',
'skip': 'paid video, no DRM. expires at Dec 31, 2023',
'info_dict': {
'id': '1213',
'ext': 'mp4',
'title': '【特別トーク番組アーカイブス】Merm4id×燐舞曲 2nd LIVE「VERSUS」',
'release_date': '20231201',
'thumbnail': str,
}
}, {
'url': 'https://mixch.tv/archive/1214',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
html5_videos = self._parse_html5_media_entries(
url, webpage.replace('video-js', 'video'), video_id, 'hls')
if not html5_videos:
self.raise_login_required(method='cookies')
infodict = html5_videos[0]
infodict.update({
try:
info_json = self._download_json(
f'https://mixch.tv/api-web/archive/{video_id}', video_id)['archive']
except ExtractorError as e:
if isinstance(e.cause, HTTPError) and e.cause.status == 401:
self.raise_login_required()
raise
return {
'id': video_id,
'title': self._html_search_regex(r'class="archive-title">(.+?)</', webpage, 'title')
})
return infodict
'title': traverse_obj(info_json, ('title', {str})),
'formats': self._extract_m3u8_formats(info_json['archiveURL'], video_id),
'thumbnail': traverse_obj(info_json, ('thumbnailURL', {url_or_none})),
}


@@ -1,4 +1,4 @@
import datetime
import datetime as dt
import re
import urllib.parse
@@ -151,7 +151,7 @@ class MotherlessIE(InfoExtractor):
'd': 'days',
}
kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta}
upload_date = (datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(**kwargs)).strftime('%Y%m%d')
upload_date = (dt.datetime.now(dt.timezone.utc) - dt.timedelta(**kwargs)).strftime('%Y%m%d')
comment_count = len(re.findall(r'''class\s*=\s*['"]media-comment-contents\b''', webpage))
uploader_id = self._html_search_regex(
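
For context, the surrounding code turns a relative "3d ago"-style suffix into an absolute date by expanding a single keyword argument into `timedelta`. A trimmed sketch (the real unit table also covers more suffixes):

import datetime as dt

_AGO_UNITS = {'h': 'hours', 'd': 'days'}
uploaded_ago = '3d'  # illustrative input
kwargs = {_AGO_UNITS[uploaded_ago[-1]]: int(uploaded_ago[:-1])}  # {'days': 3}
upload_date = (dt.datetime.now(dt.timezone.utc) - dt.timedelta(**kwargs)).strftime('%Y%m%d')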


@@ -4,8 +4,8 @@ import hmac
import itertools
import json
import re
import urllib.parse
import time
from urllib.parse import parse_qs, urlparse
from .common import InfoExtractor
from ..utils import (
@@ -388,7 +388,7 @@ class NaverNowIE(NaverBaseIE):
def _real_extract(self, url):
show_id = self._match_id(url)
qs = parse_qs(urlparse(url).query)
qs = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
if not self._yes_playlist(show_id, qs.get('shareHightlight')):
return self._extract_highlight(show_id, qs['shareHightlight'][0])


@@ -1,9 +1,9 @@
import hashlib
import itertools
import json
import random
import re
import time
from hashlib import md5
from random import randint
from .common import InfoExtractor
from ..aes import aes_ecb_encrypt, pkcs7_padding
@@ -34,7 +34,7 @@ class NetEaseMusicBaseIE(InfoExtractor):
request_text = json.dumps({**query_body, 'header': cookies}, separators=(',', ':'))
message = f'nobody{api_path}use{request_text}md5forencrypt'.encode('latin1')
msg_digest = md5(message).hexdigest()
msg_digest = hashlib.md5(message).hexdigest()
data = pkcs7_padding(list(str.encode(
f'{api_path}-36cd479b6b5-{request_text}-36cd479b6b5-{msg_digest}')))
@@ -53,7 +53,7 @@ class NetEaseMusicBaseIE(InfoExtractor):
'__csrf': '',
'os': 'pc',
'channel': 'undefined',
'requestId': f'{int(time.time() * 1000)}_{randint(0, 1000):04}',
'requestId': f'{int(time.time() * 1000)}_{random.randint(0, 1000):04}',
**traverse_obj(self._get_cookies(self._API_BASE), {
'MUSIC_U': ('MUSIC_U', {lambda i: i.value}),
})
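
Switching to the qualified `hashlib.md5` changes nothing functionally; as the hunk shows, the API signature is just an MD5 over a fixed framing of the request path and body. Standalone (path and body are illustrative):

import hashlib
import json

api_path = '/api/v3/song/detail'
request_text = json.dumps({'ids': '[123]'}, separators=(',', ':'))
message = f'nobody{api_path}use{request_text}md5forencrypt'.encode('latin1')
print(hashlib.md5(message).hexdigest())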


@@ -8,6 +8,7 @@ from ..utils import (
int_or_none,
join_nonempty,
parse_duration,
remove_end,
traverse_obj,
try_call,
unescapeHTML,
@@ -19,8 +20,7 @@ from ..utils import (
class NhkBaseIE(InfoExtractor):
_API_URL_TEMPLATE = 'https://nwapi.nhk.jp/nhkworld/%sod%slist/v7b/%s/%s/%s/all%s.json'
_BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand'
_TYPE_REGEX = r'/(?P<type>video|audio)/'
_BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/'
def _call_api(self, m_id, lang, is_video, is_episode, is_clip):
return self._download_json(
@@ -83,7 +83,7 @@ class NhkBaseIE(InfoExtractor):
def _extract_episode_info(self, url, episode=None):
fetch_episode = episode is None
lang, m_type, episode_id = NhkVodIE._match_valid_url(url).group('lang', 'type', 'id')
is_video = m_type == 'video'
is_video = m_type != 'audio'
if is_video:
episode_id = episode_id[:4] + '-' + episode_id[4:]
@@ -138,9 +138,10 @@ class NhkBaseIE(InfoExtractor):
else:
if fetch_episode:
audio_path = episode['audio']['audio']
# From https://www3.nhk.or.jp/nhkworld/common/player/radio/inline/rod.html
audio_path = remove_end(episode['audio']['audio'], '.m4a')
info['formats'] = self._extract_m3u8_formats(
'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path,
f'{urljoin("https://vod-stream.nhk.jp", audio_path)}/index.m3u8',
episode_id, 'm4a', entry_protocol='m3u8_native',
m3u8_id='hls', fatal=False)
for f in info['formats']:
@@ -155,9 +156,11 @@ class NhkBaseIE(InfoExtractor):
class NhkVodIE(NhkBaseIE):
# the 7-character IDs can have alphabetic chars too: assume [a-z] rather than just [a-f], eg
_VALID_URL = [rf'{NhkBaseIE._BASE_URL_REGEX}/(?P<type>video)/(?P<id>[0-9a-z]+)',
rf'{NhkBaseIE._BASE_URL_REGEX}/(?P<type>audio)/(?P<id>[^/?#]+?-\d{{8}}-[0-9a-z]+)']
_VALID_URL = [
rf'{NhkBaseIE._BASE_URL_REGEX}shows/(?:(?P<type>video)/)?(?P<id>\d{{4}}[\da-z]\d+)/?(?:$|[?#])',
rf'{NhkBaseIE._BASE_URL_REGEX}(?:ondemand|shows)/(?P<type>audio)/(?P<id>[^/?#]+?-\d{{8}}-[\da-z]+)',
rf'{NhkBaseIE._BASE_URL_REGEX}ondemand/(?P<type>video)/(?P<id>\d{{4}}[\da-z]\d+)', # deprecated
]
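
# A quick check of what the new /shows/ pattern accepts; `base` mirrors the
# _BASE_URL_REGEX defined earlier in the file and the IDs are illustrative:
import re

base = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/'
shows = rf'{base}shows/(?:(?P<type>video)/)?(?P<id>\d{{4}}[\da-z]\d+)/?(?:$|[?#])'
assert re.match(shows, 'https://www3.nhk.or.jp/nhkworld/en/shows/2032307/')
assert not re.match(shows, 'https://www3.nhk.or.jp/nhkworld/en/shows/sumo/')
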
# Content available only for a limited period of time. Visit
# https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples.
_TESTS = [{
@@ -167,17 +170,16 @@ class NhkVodIE(NhkBaseIE):
'ext': 'mp4',
'title': 'Japan Railway Journal - The Tohoku Shinkansen: Full Speed Ahead',
'description': 'md5:49f7c5b206e03868a2fdf0d0814b92f6',
'thumbnail': 'md5:51bcef4a21936e7fea1ff4e06353f463',
'thumbnail': r're:https://.+/.+\.jpg',
'episode': 'The Tohoku Shinkansen: Full Speed Ahead',
'series': 'Japan Railway Journal',
'modified_timestamp': 1694243656,
'modified_timestamp': 1707217907,
'timestamp': 1681428600,
'release_timestamp': 1693883728,
'duration': 1679,
'upload_date': '20230413',
'modified_date': '20230909',
'modified_date': '20240206',
'release_date': '20230905',
},
}, {
# video clip
@@ -188,15 +190,15 @@ class NhkVodIE(NhkBaseIE):
'ext': 'mp4',
'title': 'Dining with the Chef - Chef Saito\'s Family recipe: MENCHI-KATSU',
'description': 'md5:5aee4a9f9d81c26281862382103b0ea5',
'thumbnail': 'md5:d6a4d9b6e9be90aaadda0bcce89631ed',
'thumbnail': r're:https://.+/.+\.jpg',
'series': 'Dining with the Chef',
'episode': 'Chef Saito\'s Family recipe: MENCHI-KATSU',
'duration': 148,
'upload_date': '20190816',
'release_date': '20230902',
'release_timestamp': 1693619292,
'modified_timestamp': 1694168033,
'modified_date': '20230908',
'modified_timestamp': 1707217907,
'modified_date': '20240206',
'timestamp': 1565997540,
},
}, {
@@ -208,7 +210,7 @@ class NhkVodIE(NhkBaseIE):
'title': 'Living in Japan - Tips for Travelers to Japan / Ramen Vending Machines',
'series': 'Living in Japan',
'description': 'md5:0a0e2077d8f07a03071e990a6f51bfab',
'thumbnail': 'md5:960622fb6e06054a4a1a0c97ea752545',
'thumbnail': r're:https://.+/.+\.jpg',
'episode': 'Tips for Travelers to Japan / Ramen Vending Machines'
},
}, {
@@ -245,7 +247,7 @@ class NhkVodIE(NhkBaseIE):
'title': 'おはよう日本7時台 - 10月8日放送',
'series': 'おはよう日本7時台',
'episode': '10月8日放送',
'thumbnail': 'md5:d733b1c8e965ab68fb02b2d347d0e9b4',
'thumbnail': r're:https://.+/.+\.jpg',
'description': 'md5:9c1d6cbeadb827b955b20e99ab920ff0',
},
'skip': 'expires 2023-10-15',
@@ -255,17 +257,100 @@ class NhkVodIE(NhkBaseIE):
'info_dict': {
'id': 'nw_vod_v_en_3004_952_20230723091000_01_1690074552',
'ext': 'mp4',
'title': 'Barakan Discovers AMAMI OSHIMA: Isson\'s Treasure Island',
'title': 'Barakan Discovers - AMAMI OSHIMA: Isson\'s Treasure Isla',
'description': 'md5:5db620c46a0698451cc59add8816b797',
'thumbnail': 'md5:67d9ff28009ba379bfa85ad1aaa0e2bd',
'thumbnail': r're:https://.+/.+\.jpg',
'release_date': '20230905',
'timestamp': 1690103400,
'duration': 2939,
'release_timestamp': 1693898699,
'modified_timestamp': 1698057495,
'modified_date': '20231023',
'upload_date': '20230723',
'modified_timestamp': 1707217907,
'modified_date': '20240206',
'episode': 'AMAMI OSHIMA: Isson\'s Treasure Isla',
'series': 'Barakan Discovers',
},
}, {
# /ondemand/video/ url with alphabetical character in 5th position of id
'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999a07/',
'info_dict': {
'id': 'nw_c_en_9999-a07',
'ext': 'mp4',
'episode': 'Mini-Dramas on SDGs: Ep 1 Close the Gender Gap [Director\'s Cut]',
'series': 'Mini-Dramas on SDGs',
'modified_date': '20240206',
'title': 'Mini-Dramas on SDGs - Mini-Dramas on SDGs: Ep 1 Close the Gender Gap [Director\'s Cut]',
'description': 'md5:3f9dcb4db22fceb675d90448a040d3f6',
'timestamp': 1621962360,
'duration': 189,
'release_date': '20230903',
'modified_timestamp': 1707217907,
'upload_date': '20210525',
'thumbnail': r're:https://.+/.+\.jpg',
'release_timestamp': 1693713487,
},
}, {
'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999d17/',
'info_dict': {
'id': 'nw_c_en_9999-d17',
'ext': 'mp4',
'title': 'Flowers of snow blossom - The 72 Pentads of Yamato',
'description': 'Todays focus: Snow',
'release_timestamp': 1693792402,
'release_date': '20230904',
'upload_date': '20220128',
'timestamp': 1643370960,
'thumbnail': r're:https://.+/.+\.jpg',
'duration': 136,
'series': '',
'modified_date': '20240206',
'modified_timestamp': 1707217907,
},
}, {
# new /shows/ url format
'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/2032307/',
'info_dict': {
'id': 'nw_vod_v_en_2032_307_20240321113000_01_1710990282',
'ext': 'mp4',
'title': 'Japanology Plus - 20th Anniversary Special Part 1',
'description': 'md5:817d41fc8e54339ad2a916161ea24faf',
'episode': '20th Anniversary Special Part 1',
'series': 'Japanology Plus',
'thumbnail': r're:https://.+/.+\.jpg',
'duration': 1680,
'timestamp': 1711020600,
'upload_date': '20240321',
'release_timestamp': 1711022683,
'release_date': '20240321',
'modified_timestamp': 1711031012,
'modified_date': '20240321',
},
}, {
'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/3020025/',
'info_dict': {
'id': 'nw_vod_v_en_3020_025_20230325144000_01_1679723944',
'ext': 'mp4',
'title': '100 Ideas to Save the World - Working Styles Evolve',
'description': 'md5:9e6c7778eaaf4f7b4af83569649f84d9',
'episode': 'Working Styles Evolve',
'series': '100 Ideas to Save the World',
'thumbnail': r're:https://.+/.+\.jpg',
'duration': 899,
'upload_date': '20230325',
'timestamp': 1679755200,
'release_date': '20230905',
'release_timestamp': 1693880540,
'modified_date': '20240206',
'modified_timestamp': 1707217907,
},
}, {
# new /shows/audio/ url format
'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/audio/livinginjapan-20231001-1/',
'only_matching': True,
}, {
# valid url even if can't be found in wild; support needed for clip entries extraction
'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/9999o80/',
'only_matching': True,
}]
def _real_extract(self, url):
@@ -273,18 +358,21 @@ class NhkVodIE(NhkBaseIE):
class NhkVodProgramIE(NhkBaseIE):
_VALID_URL = rf'{NhkBaseIE._BASE_URL_REGEX}/program{NhkBaseIE._TYPE_REGEX}(?P<id>\w+)(?:.+?\btype=(?P<episode_type>clip|(?:radio|tv)Episode))?'
_VALID_URL = rf'''(?x)
{NhkBaseIE._BASE_URL_REGEX}(?:shows|tv)/
(?:(?P<type>audio)/programs/)?(?P<id>\w+)/?
(?:\?(?:[^#]+&)?type=(?P<episode_type>clip|(?:radio|tv)Episode))?'''
_TESTS = [{
# video program episodes
'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/sumo',
'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/sumo/',
'info_dict': {
'id': 'sumo',
'title': 'GRAND SUMO Highlights',
'description': 'md5:fc20d02dc6ce85e4b72e0273aa52fdbf',
},
'playlist_mincount': 0,
'playlist_mincount': 1,
}, {
'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway',
'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/japanrailway/',
'info_dict': {
'id': 'japanrailway',
'title': 'Japan Railway Journal',
@@ -293,40 +381,68 @@ class NhkVodProgramIE(NhkBaseIE):
'playlist_mincount': 12,
}, {
# video program clips
'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway/?type=clip',
'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/japanrailway/?type=clip',
'info_dict': {
'id': 'japanrailway',
'title': 'Japan Railway Journal',
'description': 'md5:ea39d93af7d05835baadf10d1aae0e3f',
},
'playlist_mincount': 5,
}, {
'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/10yearshayaomiyazaki/',
'only_matching': True,
'playlist_mincount': 12,
}, {
# audio program
'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/audio/listener/',
'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/audio/programs/livinginjapan/',
'info_dict': {
'id': 'livinginjapan',
'title': 'Living in Japan',
'description': 'md5:665bb36ec2a12c5a7f598ee713fc2b54',
},
'playlist_mincount': 12,
}, {
# /tv/ program url
'url': 'https://www3.nhk.or.jp/nhkworld/en/tv/designtalksplus/',
'info_dict': {
'id': 'designtalksplus',
'title': 'DESIGN TALKS plus',
'description': 'md5:47b3b3a9f10d4ac7b33b53b70a7d2837',
},
'playlist_mincount': 20,
}, {
'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/10yearshayaomiyazaki/',
'only_matching': True,
}]
@classmethod
def suitable(cls, url):
return False if NhkVodIE.suitable(url) else super().suitable(url)
def _extract_meta_from_class_elements(self, class_values, html):
for class_value in class_values:
if value := clean_html(get_element_by_class(class_value, html)):
return value
def _real_extract(self, url):
lang, m_type, program_id, episode_type = self._match_valid_url(url).group('lang', 'type', 'id', 'episode_type')
episodes = self._call_api(
program_id, lang, m_type == 'video', False, episode_type == 'clip')
program_id, lang, m_type != 'audio', False, episode_type == 'clip')
entries = []
for episode in episodes:
episode_path = episode.get('url')
if not episode_path:
continue
entries.append(self._extract_episode_info(
urljoin(url, episode_path), episode))
def entries():
for episode in episodes:
if episode_path := episode.get('url'):
yield self._extract_episode_info(urljoin(url, episode_path), episode)
html = self._download_webpage(url, program_id)
program_title = clean_html(get_element_by_class('p-programDetail__title', html))
program_description = clean_html(get_element_by_class('p-programDetail__text', html))
program_title = self._extract_meta_from_class_elements([
'p-programDetail__title', # /ondemand/program/
'pProgramHero__logoText', # /shows/
'tAudioProgramMain__title', # /shows/audio/programs/
'p-program-name'], html) # /tv/
program_description = self._extract_meta_from_class_elements([
'p-programDetail__text', # /ondemand/program/
'pProgramHero__description', # /shows/
'tAudioProgramMain__info', # /shows/audio/programs/
'p-program-description'], html) # /tv/
return self.playlist_result(entries, program_id, program_title, program_description)
return self.playlist_result(entries(), program_id, program_title, program_description)
class NhkForSchoolBangumiIE(InfoExtractor):


@@ -1,11 +1,10 @@
import datetime
import datetime as dt
import functools
import itertools
import json
import re
import time
from urllib.parse import urlparse
import urllib.parse
from .common import InfoExtractor, SearchInfoExtractor
from ..networking import Request
@@ -820,12 +819,12 @@ class NicovideoSearchDateIE(NicovideoSearchBaseIE, SearchInfoExtractor):
'playlist_mincount': 1610,
}]
_START_DATE = datetime.date(2007, 1, 1)
_START_DATE = dt.date(2007, 1, 1)
_RESULTS_PER_PAGE = 32
_MAX_PAGES = 50
def _entries(self, url, item_id, start_date=None, end_date=None):
start_date, end_date = start_date or self._START_DATE, end_date or datetime.datetime.now().date()
start_date, end_date = start_date or self._START_DATE, end_date or dt.datetime.now().date()
# If the last page has a full page of videos, we need to break down the query interval further
last_page_len = len(list(self._get_entries_for_date(
@@ -957,7 +956,7 @@ class NiconicoLiveIE(InfoExtractor):
'frontend_id': traverse_obj(embedded_data, ('site', 'frontendId')) or '9',
})
hostname = remove_start(urlparse(urlh.url).hostname, 'sp.')
hostname = remove_start(urllib.parse.urlparse(urlh.url).hostname, 'sp.')
latency = try_get(self._configuration_arg('latency'), lambda x: x[0])
if latency not in self._KNOWN_LATENCY:
latency = 'high'


@@ -1,8 +1,8 @@
import calendar
import json
import datetime as dt
import functools
from datetime import datetime, timezone
from random import random
import json
import random
from .common import InfoExtractor
from ..compat import (
@@ -243,7 +243,7 @@ class PanoptoIE(PanoptoBaseIE):
invocation_id = delivery_info.get('InvocationId')
stream_id = traverse_obj(delivery_info, ('Delivery', 'Streams', ..., 'PublicID'), get_all=False, expected_type=str)
if invocation_id and stream_id and duration:
timestamp_str = f'/Date({calendar.timegm(datetime.now(timezone.utc).timetuple())}000)/'
timestamp_str = f'/Date({calendar.timegm(dt.datetime.now(dt.timezone.utc).timetuple())}000)/'
data = {
'streamRequests': [
{
@@ -415,7 +415,7 @@ class PanoptoIE(PanoptoBaseIE):
'cast': traverse_obj(delivery, ('Contributors', ..., 'DisplayName'), expected_type=lambda x: x or None),
'timestamp': session_start_time - 11640000000 if session_start_time else None,
'duration': delivery.get('Duration'),
'thumbnail': base_url + f'/Services/FrameGrabber.svc/FrameRedirect?objectId={video_id}&mode=Delivery&random={random()}',
'thumbnail': base_url + f'/Services/FrameGrabber.svc/FrameRedirect?objectId={video_id}&mode=Delivery&random={random.random()}',
'average_rating': delivery.get('AverageRating'),
'chapters': self._extract_chapters(timestamps),
'uploader': delivery.get('OwnerDisplayName') or None,
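
The `/Date(...)/` string above is the legacy ASP.NET JSON date format: milliseconds since the Unix epoch wrapped in a marker, which the code produces by appending '000' to a whole-second timestamp. In isolation:

import calendar
import datetime as dt

now = dt.datetime.now(dt.timezone.utc)
timestamp_str = f'/Date({calendar.timegm(now.timetuple())}000)/'
# e.g. '/Date(1713270000000)/'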


@@ -92,7 +92,7 @@ class PatreonIE(PatreonBaseIE):
'thumbnail': 're:^https?://.*$',
'upload_date': '20150211',
'description': 'md5:8af6425f50bd46fbf29f3db0fc3a8364',
'uploader_id': 'TraciJHines',
'uploader_id': '@TraciHinesMusic',
'categories': ['Entertainment'],
'duration': 282,
'view_count': int,
@@ -106,8 +106,10 @@ class PatreonIE(PatreonBaseIE):
'availability': 'public',
'channel_follower_count': int,
'playable_in_embed': True,
'uploader_url': 'http://www.youtube.com/user/TraciJHines',
'uploader_url': 'https://www.youtube.com/@TraciHinesMusic',
'comment_count': int,
'channel_is_verified': True,
'chapters': 'count:4',
},
'params': {
'noplaylist': True,
@@ -176,6 +178,27 @@ class PatreonIE(PatreonBaseIE):
'uploader_url': 'https://www.patreon.com/thenormies',
},
'skip': 'Patron-only content',
}, {
# dead vimeo and embed URLs, need to extract post_file
'url': 'https://www.patreon.com/posts/hunter-x-hunter-34007913',
'info_dict': {
'id': '34007913',
'ext': 'mp4',
'title': 'Hunter x Hunter | Kurapika DESTROYS Uvogin!!!',
'like_count': int,
'uploader': 'YaBoyRoshi',
'timestamp': 1581636833,
'channel_url': 'https://www.patreon.com/yaboyroshi',
'thumbnail': r're:^https?://.*$',
'tags': ['Hunter x Hunter'],
'uploader_id': '14264111',
'comment_count': int,
'channel_follower_count': int,
'description': 'Kurapika is a walking cheat code!',
'upload_date': '20200213',
'channel_id': '2147162',
'uploader_url': 'https://www.patreon.com/yaboyroshi',
},
}]
def _real_extract(self, url):
@@ -250,20 +273,13 @@ class PatreonIE(PatreonBaseIE):
v_url = url_or_none(compat_urllib_parse_unquote(
self._search_regex(r'(https(?:%3A%2F%2F|://)player\.vimeo\.com.+app_id(?:=|%3D)+\d+)', embed_html, 'vimeo url', fatal=False)))
if v_url:
return {
**info,
'_type': 'url_transparent',
'url': VimeoIE._smuggle_referrer(v_url, 'https://patreon.com'),
'ie_key': 'Vimeo',
}
v_url = VimeoIE._smuggle_referrer(v_url, 'https://patreon.com')
if self._request_webpage(v_url, video_id, 'Checking Vimeo embed URL', fatal=False, errnote=False):
return self.url_result(v_url, VimeoIE, url_transparent=True, **info)
embed_url = try_get(attributes, lambda x: x['embed']['url'])
if embed_url:
return {
**info,
'_type': 'url',
'url': embed_url,
}
if embed_url and self._request_webpage(embed_url, video_id, 'Checking embed URL', fatal=False, errnote=False):
return self.url_result(embed_url, **info)
post_file = traverse_obj(attributes, 'post_file')
if post_file:
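
The rewritten flow above no longer hands embed URLs straight to `url_result`; it probes them with a non-fatal request first, so dead Vimeo/embed links fall through to the `post_file` branch. A rough framework-free sketch of that probe-before-delegate pattern (`url_is_alive` is a hypothetical helper, not yt-dlp API):

```python
import urllib.error
import urllib.request

def url_is_alive(url: str, timeout: float = 10) -> bool:
    """Return True only if the URL still resolves without an HTTP error."""
    try:
        with urllib.request.urlopen(
                urllib.request.Request(url, method='HEAD'), timeout=timeout):
            return True
    except (urllib.error.URLError, TimeoutError):
        return False
```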

View File

@@ -1,5 +1,5 @@
from uuid import uuid4
import json
import uuid
from .common import InfoExtractor
from ..utils import (
@@ -51,7 +51,7 @@ class PolsatGoIE(InfoExtractor):
}
def _call_api(self, endpoint, media_id, method, params):
rand_uuid = str(uuid4())
rand_uuid = str(uuid.uuid4())
res = self._download_json(
f'https://b2c-mobile.redefine.pl/rpc/{endpoint}/', media_id,
note=f'Downloading {method} JSON metadata',

View File

@@ -1,5 +1,6 @@
import datetime as dt
import json
from urllib.parse import unquote
import urllib.parse
from .common import InfoExtractor
from ..compat import functools
@@ -114,7 +115,7 @@ class Pr0grammIE(InfoExtractor):
cookies = self._get_cookies(self.BASE_URL)
if 'me' not in cookies:
self._download_webpage(self.BASE_URL, None, 'Refreshing verification information')
if traverse_obj(cookies, ('me', {lambda x: x.value}, {unquote}, {json.loads}, 'verified')):
if traverse_obj(cookies, ('me', {lambda x: x.value}, {urllib.parse.unquote}, {json.loads}, 'verified')):
flags |= 0b00110
return flags
@@ -196,6 +197,7 @@ class Pr0grammIE(InfoExtractor):
'like_count': ('up', {int}),
'dislike_count': ('down', {int}),
'timestamp': ('created', {int}),
'upload_date': ('created', {int}, {dt.date.fromtimestamp}, {lambda x: x.strftime('%Y%m%d')}),
'thumbnail': ('thumb', {lambda x: urljoin('https://thumb.pr0gramm.com', x)})
}),
}
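
The added `upload_date` entry derives a `YYYYMMDD` string from the same `created` epoch that already feeds `timestamp`. The transform in isolation:

```python
import datetime as dt

created = 1713270077  # sample epoch seconds, as in the API's 'created' field
upload_date = dt.date.fromtimestamp(created).strftime('%Y%m%d')
print(upload_date)  # '20240416' (local time, matching dt.date.fromtimestamp)
```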

View File

@@ -1,6 +1,6 @@
import hashlib
import re
from hashlib import sha1
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
@@ -42,7 +42,7 @@ class ProSiebenSat1BaseIE(InfoExtractor):
'Downloading protocols JSON',
headers=self.geo_verification_headers(), query={
'access_id': self._ACCESS_ID,
'client_token': sha1((raw_ct).encode()).hexdigest(),
'client_token': hashlib.sha1((raw_ct).encode()).hexdigest(),
'video_id': clip_id,
}, fatal=False, expected_status=(403,)) or {}
error = protocols.get('error') or {}
@@ -53,7 +53,7 @@ class ProSiebenSat1BaseIE(InfoExtractor):
urls = (self._download_json(
self._V4_BASE_URL + 'urls', clip_id, 'Downloading urls JSON', query={
'access_id': self._ACCESS_ID,
'client_token': sha1((raw_ct + server_token + self._SUPPORTED_PROTOCOLS).encode()).hexdigest(),
'client_token': hashlib.sha1((raw_ct + server_token + self._SUPPORTED_PROTOCOLS).encode()).hexdigest(),
'protocols': self._SUPPORTED_PROTOCOLS,
'server_token': server_token,
'video_id': clip_id,
@@ -77,7 +77,7 @@ class ProSiebenSat1BaseIE(InfoExtractor):
if not formats:
source_ids = [compat_str(source['id']) for source in video['sources']]
client_id = self._SALT[:2] + sha1(''.join([clip_id, self._SALT, self._TOKEN, client_location, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest()
client_id = self._SALT[:2] + hashlib.sha1(''.join([clip_id, self._SALT, self._TOKEN, client_location, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest()
sources = self._download_json(
'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources' % clip_id,
@@ -96,7 +96,7 @@ class ProSiebenSat1BaseIE(InfoExtractor):
return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate
for source_id in source_ids:
client_id = self._SALT[:2] + sha1(''.join([self._SALT, clip_id, self._TOKEN, server_id, client_location, source_id, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest()
client_id = self._SALT[:2] + hashlib.sha1(''.join([self._SALT, clip_id, self._TOKEN, server_id, client_location, source_id, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest()
urls = self._download_json(
'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url' % clip_id,
clip_id, 'Downloading urls JSON', fatal=False, query={
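
All three call sites switch from a bare `sha1` import to `hashlib.sha1`; the token scheme itself is unchanged: concatenate the salted fields, hash, and prefix the hex digest with the first two characters of the salt. A sketch with placeholder constants (the real `_SALT`/`_TOKEN`/`_CLIENT_NAME` values are extractor attributes):

```python
import hashlib

SALT, TOKEN, CLIENT_NAME = 'saltsalt', 'token', 'client-name'  # placeholders
clip_id, client_location = '1234567', 'https://example.com/clip'

client_id = SALT[:2] + hashlib.sha1(
    ''.join([clip_id, SALT, TOKEN, client_location, SALT, CLIENT_NAME])
    .encode('utf-8')).hexdigest()
print(client_id)  # 'sa' + 40 hex digest characters
```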

View File

@@ -1,18 +1,14 @@
from .common import InfoExtractor
from ..utils import (
clean_html,
traverse_obj,
unescapeHTML,
)
import itertools
from urllib.parse import urlencode
import urllib.parse
from .common import InfoExtractor
from ..utils import clean_html, traverse_obj, unescapeHTML
class RadioKapitalBaseIE(InfoExtractor):
def _call_api(self, resource, video_id, note='Downloading JSON metadata', qs={}):
return self._download_json(
f'https://www.radiokapital.pl/wp-json/kapital/v1/{resource}?{urlencode(qs)}',
f'https://www.radiokapital.pl/wp-json/kapital/v1/{resource}?{urllib.parse.urlencode(qs)}',
video_id, note=note)
def _parse_episode(self, data):

View File

@@ -1,8 +1,8 @@
import datetime as dt
import itertools
import json
import re
import urllib.parse
from datetime import datetime
from .common import InfoExtractor, SearchInfoExtractor
from ..utils import (
@@ -156,7 +156,7 @@ class RokfinIE(InfoExtractor):
self.raise_login_required('This video is only available to premium users', True, method='cookies')
elif scheduled:
self.raise_no_formats(
f'Stream is offline; scheduled for {datetime.fromtimestamp(scheduled).strftime("%Y-%m-%d %H:%M:%S")}',
f'Stream is offline; scheduled for {dt.datetime.fromtimestamp(scheduled).strftime("%Y-%m-%d %H:%M:%S")}',
video_id=video_id, expected=True)
uploader = traverse_obj(metadata, ('createdBy', 'username'), ('creator', 'username'))

View File

@@ -1,4 +1,4 @@
import datetime
import datetime as dt
from .common import InfoExtractor
from .redge import RedCDNLivxIE
@@ -13,16 +13,16 @@ from ..utils.traversal import traverse_obj
def is_dst(date):
last_march = datetime.datetime(date.year, 3, 31)
last_october = datetime.datetime(date.year, 10, 31)
last_sunday_march = last_march - datetime.timedelta(days=last_march.isoweekday() % 7)
last_sunday_october = last_october - datetime.timedelta(days=last_october.isoweekday() % 7)
last_march = dt.datetime(date.year, 3, 31)
last_october = dt.datetime(date.year, 10, 31)
last_sunday_march = last_march - dt.timedelta(days=last_march.isoweekday() % 7)
last_sunday_october = last_october - dt.timedelta(days=last_october.isoweekday() % 7)
return last_sunday_march.replace(hour=2) <= date <= last_sunday_october.replace(hour=3)
def rfc3339_to_atende(date):
date = datetime.datetime.fromisoformat(date)
date = date + datetime.timedelta(hours=1 if is_dst(date) else 0)
date = dt.datetime.fromisoformat(date)
date = date + dt.timedelta(hours=1 if is_dst(date) else 0)
return int((date.timestamp() - 978307200) * 1000)
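
`rfc3339_to_atende` rebases a parsed RFC 3339 time onto a 2001-01-01T00:00:00Z epoch (978307200 Unix seconds), expressed in milliseconds, after the one-hour DST correction. The arithmetic in isolation, with the DST shift left out:

```python
import datetime as dt

ATENDE_EPOCH = 978307200  # 2001-01-01T00:00:00Z as Unix seconds

date = dt.datetime.fromisoformat('2024-04-16T12:00:00+02:00')
atende = int((date.timestamp() - ATENDE_EPOCH) * 1000)
print(atende)  # milliseconds elapsed since 2001-01-01 UTC
```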

View File

@@ -0,0 +1,112 @@
import json
import urllib.parse
from .common import InfoExtractor
from ..utils import determine_ext, int_or_none, url_or_none
from ..utils.traversal import traverse_obj
class SharePointIE(InfoExtractor):
_BASE_URL_RE = r'https?://[\w-]+\.sharepoint\.com/'
_VALID_URL = [
rf'{_BASE_URL_RE}:v:/[a-z]/(?:[^/?#]+/)*(?P<id>[^/?#]{{46}})/?(?:$|[?#])',
rf'{_BASE_URL_RE}(?!:v:)(?:[^/?#]+/)*stream\.aspx\?(?:[^#]+&)?id=(?P<id>[^&#]+)',
]
_TESTS = [{
'url': 'https://lut-my.sharepoint.com/:v:/g/personal/juha_eerola_student_lab_fi/EUrAmrktb4ZMhUcY9J2PqMEBD_9x_l0DyYWVgAvp-TTOMw?e=ZpQOOw',
'md5': '2950821d0d4937a0a76373782093b435',
'info_dict': {
'id': '01EQRS7EKKYCNLSLLPQZGIKRYY6SOY7KGB',
'display_id': 'EUrAmrktb4ZMhUcY9J2PqMEBD_9x_l0DyYWVgAvp-TTOMw',
'ext': 'mp4',
'title': 'CmvpJST',
'duration': 54.567,
'thumbnail': r're:https://.+/thumbnail',
'uploader_id': '8dcec565-a956-4b91-95e5-bacfb8bc015f',
},
}, {
'url': 'https://greaternyace.sharepoint.com/:v:/s/acementornydrive/ETski5eAfNVEoPRZUAyy1wEBpLgVFYWso5bjbZjfBLlPUg?e=PQUfVb',
'md5': 'c496a01644223273bff12e93e501afd1',
'info_dict': {
'id': '01QI4AVTZ3ESFZPAD42VCKB5CZKAGLFVYB',
'display_id': 'ETski5eAfNVEoPRZUAyy1wEBpLgVFYWso5bjbZjfBLlPUg',
'ext': 'mp4',
'title': '930103681233985536',
'duration': 3797.326,
'thumbnail': r're:https://.+/thumbnail',
},
}, {
'url': 'https://lut-my.sharepoint.com/personal/juha_eerola_student_lab_fi/_layouts/15/stream.aspx?id=%2Fpersonal%2Fjuha_eerola_student_lab_fi%2FDocuments%2FM-DL%2FCmvpJST.mp4&ga=1&referrer=StreamWebApp.Web&referrerScenario=AddressBarCopied.view',
'info_dict': {
'id': '01EQRS7EKKYCNLSLLPQZGIKRYY6SOY7KGB',
'display_id': '/personal/juha_eerola_student_lab_fi/Documents/M-DL/CmvpJST.mp4',
'ext': 'mp4',
'title': 'CmvpJST',
'duration': 54.567,
'thumbnail': r're:https://.+/thumbnail',
'uploader_id': '8dcec565-a956-4b91-95e5-bacfb8bc015f',
},
'skip': 'Session cookies needed',
}, {
'url': 'https://izoobasisschool.sharepoint.com/:v:/g/Eaqleq8COVBIvIPvod0U27oBypC6aWOkk8ptuDpmJ6arHw',
'only_matching': True,
}, {
'url': 'https://uskudaredutr-my.sharepoint.com/:v:/g/personal/songul_turkaydin_uskudar_edu_tr/EbTf-VRUIbtGuIN73tx1MuwBCHBOmNcWNqSLw61Fd2_o0g?e=n5Vkof',
'only_matching': True,
}, {
'url': 'https://epam-my.sharepoint.com/:v:/p/dzmitry_tamashevich/Ec4ZOs-rATZHjFYZWVxjczEB649FCoYFKDV_x3RxZiWAGA?e=4hswgA',
'only_matching': True,
}, {
'url': 'https://microsoft.sharepoint.com/:v:/t/MicrosoftSPARKRecordings-MSFTInternal/EWCyeqByVWBAt8wDvNZdV-UB0BvU5YVbKm0UHgdrUlI6dg?e=QbPck6',
'only_matching': True,
}]
def _real_extract(self, url):
display_id = urllib.parse.unquote(self._match_id(url))
webpage, urlh = self._download_webpage_handle(url, display_id)
if urllib.parse.urlparse(urlh.url).hostname == 'login.microsoftonline.com':
self.raise_login_required(
'Session cookies are required for this URL and can be passed '
'with the --cookies option. The --cookies-from-browser option will not work', method=None)
video_data = self._search_json(r'g_fileInfo\s*=', webpage, 'player config', display_id)
video_id = video_data['VroomItemId']
parsed_url = urllib.parse.urlparse(video_data['.transformUrl'])
base_media_url = urllib.parse.urlunparse(parsed_url._replace(
path=urllib.parse.urljoin(f'{parsed_url.path}/', '../videomanifest'),
query=urllib.parse.urlencode({
**urllib.parse.parse_qs(parsed_url.query),
'cTag': video_data['.ctag'],
'action': 'Access',
'part': 'index',
}, doseq=True)))
# Web player adds more params to the format URLs but we still get all formats without them
formats = self._extract_mpd_formats(
base_media_url, video_id, mpd_id='dash', query={'format': 'dash'}, fatal=False)
for hls_type in ('hls', 'hls-vnext'):
formats.extend(self._extract_m3u8_formats(
base_media_url, video_id, 'mp4', m3u8_id=hls_type,
query={'format': hls_type}, fatal=False, quality=-2))
if video_url := traverse_obj(video_data, ('downloadUrl', {url_or_none})):
formats.append({
'url': video_url,
'ext': determine_ext(video_data.get('extension') or video_data.get('name')),
'quality': 1,
'format_id': 'source',
'filesize': int_or_none(video_data.get('size')),
'vcodec': 'none' if video_data.get('isAudio') is True else None,
})
return {
'id': video_id,
'formats': formats,
'title': video_data.get('title') or video_data.get('displayName'),
'display_id': display_id,
'uploader_id': video_data.get('authorId'),
'duration': traverse_obj(video_data, (
'MediaServiceFastMetadata', {json.loads}, 'media', 'duration', {lambda x: x / 10000000})),
'thumbnail': url_or_none(video_data.get('thumbnailUrl')),
}
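
The manifest URL above is derived from `.transformUrl` by swapping the final path segment for `videomanifest` (via `urljoin` against `<path>/`) and re-encoding the query with the extra `cTag`/`action`/`part` parameters. The same rewrite on a made-up transform URL:

```python
import urllib.parse

# made-up stand-in for video_data['.transformUrl']
transform_url = ('https://example-my.sharepoint.com/transform/videotranscode'
                 '?provider=spo&docid=abc')

parsed = urllib.parse.urlparse(transform_url)
manifest_url = urllib.parse.urlunparse(parsed._replace(
    # '../videomanifest' joined onto '<path>/' replaces the last segment
    path=urllib.parse.urljoin(f'{parsed.path}/', '../videomanifest'),
    query=urllib.parse.urlencode({
        **urllib.parse.parse_qs(parsed.query),
        'cTag': 'placeholder-ctag',  # real value comes from video_data['.ctag']
        'action': 'Access',
        'part': 'index',
    }, doseq=True)))
print(manifest_url)
# https://example-my.sharepoint.com/transform/videomanifest?provider=spo&...
```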

View File

@@ -1,4 +1,4 @@
import datetime
import datetime as dt
import itertools
import json
import math
@@ -94,7 +94,7 @@ class SonyLIVIE(InfoExtractor):
'mobileNumber': username,
'channelPartnerID': 'MSMIND',
'country': 'IN',
'timestamp': datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%MZ'),
'timestamp': dt.datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%MZ'),
'otpSize': 6,
'loginType': 'REGISTERORSIGNIN',
'isMobileMandatory': True,
@@ -111,7 +111,7 @@ class SonyLIVIE(InfoExtractor):
'otp': self._get_tfa_info('OTP'),
'dmaId': 'IN',
'ageConfirmation': True,
'timestamp': datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%MZ'),
'timestamp': dt.datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%MZ'),
'isMobileMandatory': True,
}).encode())
if otp_verify_json['resultCode'] == 'KO':

View File

@@ -1,30 +1,27 @@
import itertools
import re
import json
# import random
import re
from .common import (
InfoExtractor,
SearchInfoExtractor
)
from .common import InfoExtractor, SearchInfoExtractor
from ..compat import compat_str
from ..networking import HEADRequest, Request
from ..networking import HEADRequest
from ..networking.exceptions import HTTPError
from ..utils import (
error_to_compat_str,
KNOWN_EXTENSIONS,
ExtractorError,
error_to_compat_str,
float_or_none,
int_or_none,
KNOWN_EXTENSIONS,
mimetype2ext,
parse_qs,
str_or_none,
try_get,
try_call,
unified_timestamp,
update_url_query,
url_or_none,
urlhandle_detect_ext,
)
from ..utils.traversal import traverse_obj
class SoundcloudEmbedIE(InfoExtractor):
@@ -54,7 +51,6 @@ class SoundcloudBaseIE(InfoExtractor):
_API_AUTH_QUERY_TEMPLATE = '?client_id=%s'
_API_AUTH_URL_PW = 'https://api-auth.soundcloud.com/web-auth/sign-in/password%s'
_API_VERIFY_AUTH_TOKEN = 'https://api-auth.soundcloud.com/connect/session%s'
_access_token = None
_HEADERS = {}
_IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg'
@@ -112,21 +108,31 @@ class SoundcloudBaseIE(InfoExtractor):
def _initialize_pre_login(self):
self._CLIENT_ID = self.cache.load('soundcloud', 'client_id') or 'a3e059563d7fd3372b49b37f00a00bcf'
def _perform_login(self, username, password):
if username != 'oauth':
self.report_warning(
'Login using username and password is not currently supported. '
'Use "--username oauth --password <oauth_token>" to login using an oauth token')
self._access_token = password
query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID
payload = {'session': {'access_token': self._access_token}}
token_verification = Request(self._API_VERIFY_AUTH_TOKEN % query, json.dumps(payload).encode('utf-8'))
response = self._download_json(token_verification, None, note='Verifying login token...', fatal=False)
if response is not False:
self._HEADERS = {'Authorization': 'OAuth ' + self._access_token}
def _verify_oauth_token(self, token):
if self._request_webpage(
self._API_VERIFY_AUTH_TOKEN % (self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID),
None, note='Verifying login token...', fatal=False,
data=json.dumps({'session': {'access_token': token}}).encode()):
self._HEADERS['Authorization'] = f'OAuth {token}'
self.report_login()
else:
self.report_warning('Provided authorization token seems to be invalid. Continue as guest')
self.report_warning('Provided authorization token is invalid. Continuing as guest')
def _real_initialize(self):
if self._HEADERS:
return
if token := try_call(lambda: self._get_cookies(self._BASE_URL)['oauth_token'].value):
self._verify_oauth_token(token)
def _perform_login(self, username, password):
if username != 'oauth':
raise ExtractorError(
'Login using username and password is not currently supported. '
'Use "--username oauth --password <oauth_token>" to login using an oauth token, '
f'or else {self._login_hint(method="cookies")}', expected=True)
if self._HEADERS:
return
self._verify_oauth_token(password)
r'''
def genDevId():
@@ -147,14 +153,17 @@ class SoundcloudBaseIE(InfoExtractor):
'user_agent': self._USER_AGENT
}
query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID
login = sanitized_Request(self._API_AUTH_URL_PW % query, json.dumps(payload).encode('utf-8'))
response = self._download_json(login, None)
self._access_token = response.get('session').get('access_token')
if not self._access_token:
self.report_warning('Unable to get access token, login may has failed')
else:
self._HEADERS = {'Authorization': 'OAuth ' + self._access_token}
response = self._download_json(
self._API_AUTH_URL_PW % (self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID),
None, note='Verifying login token...', fatal=False,
data=json.dumps(payload).encode())
if token := traverse_obj(response, ('session', 'access_token', {str})):
self._HEADERS['Authorization'] = f'OAuth {token}'
self.report_login()
return
raise ExtractorError('Unable to get access token, login may have failed', expected=True)
'''
# signature generation
@@ -217,6 +226,7 @@ class SoundcloudBaseIE(InfoExtractor):
'filesize': int_or_none(urlh.headers.get('Content-Length')),
'url': format_url,
'quality': 10,
'format_note': 'Original',
})
def invalid_url(url):
@@ -233,9 +243,13 @@ class SoundcloudBaseIE(InfoExtractor):
format_id_list.append(protocol)
ext = f.get('ext')
if ext == 'aac':
f['abr'] = '256'
f.update({
'abr': 256,
'quality': 5,
'format_note': 'Premium',
})
for k in ('ext', 'abr'):
v = f.get(k)
v = str_or_none(f.get(k))
if v:
format_id_list.append(v)
preview = is_preview or re.search(r'/(?:preview|playlist)/0/30/', f['url'])
@@ -256,16 +270,25 @@ class SoundcloudBaseIE(InfoExtractor):
formats.append(f)
# New API
transcodings = try_get(
info, lambda x: x['media']['transcodings'], list) or []
for t in transcodings:
if not isinstance(t, dict):
continue
format_url = url_or_none(t.get('url'))
if not format_url:
continue
stream = None if extract_flat else self._download_json(
format_url, track_id, query=query, fatal=False, headers=self._HEADERS)
for t in traverse_obj(info, ('media', 'transcodings', lambda _, v: url_or_none(v['url']))):
if extract_flat:
break
format_url = t['url']
stream = None
for retry in self.RetryManager(fatal=False):
try:
stream = self._download_json(format_url, track_id, query=query, headers=self._HEADERS)
except ExtractorError as e:
if isinstance(e.cause, HTTPError) and e.cause.status == 429:
self.report_warning(
'You have reached the API rate limit, which is ~600 requests per '
'10 minutes. Use the --extractor-retries and --retry-sleep options '
'to configure an appropriate retry count and wait time', only_once=True)
retry.error = e.cause
else:
self.report_warning(e.msg)
if not isinstance(stream, dict):
continue
stream_url = url_or_none(stream.get('url'))
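
The new transcoding loop retries the stream-metadata request through `self.RetryManager` only when the cause is HTTP 429 (SoundCloud's ~600 requests per 10 minutes limit), honouring `--extractor-retries` and `--retry-sleep`; any other error just warns and the transcoding is skipped. A rough framework-free sketch of the same idea (hypothetical helper, fixed sleep instead of user-configured backoff):

```python
import time
import urllib.error
import urllib.request

def fetch_with_backoff(url: str, retries: int = 3, sleep: float = 60) -> bytes:
    """Retry only on HTTP 429; re-raise everything else immediately."""
    for attempt in range(retries + 1):
        try:
            with urllib.request.urlopen(url) as resp:
                return resp.read()
        except urllib.error.HTTPError as e:
            if e.code != 429 or attempt == retries:
                raise
            time.sleep(sleep)  # rate limited: wait, then try again
```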

View File

@@ -1,8 +1,7 @@
from __future__ import annotations
import functools
import json
from functools import partial
from textwrap import dedent
import textwrap
from .common import InfoExtractor
from ..utils import ExtractorError, format_field, int_or_none, parse_iso8601
@@ -10,7 +9,7 @@ from ..utils.traversal import traverse_obj
def _fmt_url(url):
return partial(format_field, template=url, default=None)
return functools.partial(format_field, template=url, default=None)
class TelewebionIE(InfoExtractor):
@@ -88,7 +87,7 @@ class TelewebionIE(InfoExtractor):
if not video_id.startswith('0x'):
video_id = hex(int(video_id))
episode_data = self._call_graphql_api('getEpisodeDetail', video_id, dedent('''
episode_data = self._call_graphql_api('getEpisodeDetail', video_id, textwrap.dedent('''
queryEpisode(filter: {EpisodeID: $EpisodeId}, first: 1) {
title
program {
@@ -127,7 +126,7 @@ class TelewebionIE(InfoExtractor):
'formats': (
'channel', 'descriptor', {str},
{_fmt_url(f'https://cdna.telewebion.com/%s/episode/{video_id}/playlist.m3u8')},
{partial(self._extract_m3u8_formats, video_id=video_id, ext='mp4', m3u8_id='hls')}),
{functools.partial(self._extract_m3u8_formats, video_id=video_id, ext='mp4', m3u8_id='hls')}),
}))
info_dict['id'] = video_id
return info_dict

View File

@@ -1,7 +1,7 @@
import base64
import datetime as dt
import functools
import itertools
from datetime import datetime
from .common import InfoExtractor
from ..networking import HEADRequest
@@ -70,7 +70,7 @@ class TenPlayIE(InfoExtractor):
username, password = self._get_login_info()
if username is None or password is None:
self.raise_login_required('Your 10play account\'s details must be provided with --username and --password.')
_timestamp = datetime.now().strftime('%Y%m%d000000')
_timestamp = dt.datetime.now().strftime('%Y%m%d000000')
_auth_header = base64.b64encode(_timestamp.encode('ascii')).decode('ascii')
data = self._download_json('https://10play.com.au/api/user/auth', video_id, 'Getting bearer token', headers={
'X-Network-Ten-Auth': _auth_header,
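
The `X-Network-Ten-Auth` header is just the current date as `YYYYMMDD000000`, base64-encoded. Reproduced standalone:

```python
import base64
import datetime as dt

timestamp = dt.datetime.now().strftime('%Y%m%d000000')  # e.g. '20240416000000'
auth_header = base64.b64encode(timestamp.encode('ascii')).decode('ascii')
print(auth_header)  # value sent as X-Network-Ten-Auth
```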

View File

@@ -1,5 +1,6 @@
import json
from .brightcove import BrightcoveNewIE
from .common import InfoExtractor
from .zype import ZypeIE
from ..networking import HEADRequest
@@ -8,6 +9,7 @@ from ..utils import (
ExtractorError,
filter_dict,
parse_qs,
smuggle_url,
try_call,
urlencode_postdata,
)
@@ -17,23 +19,43 @@ class ThisOldHouseIE(InfoExtractor):
_NETRC_MACHINE = 'thisoldhouse'
_VALID_URL = r'https?://(?:www\.)?thisoldhouse\.com/(?:watch|how-to|tv-episode|(?:[^/?#]+/)?\d+)/(?P<id>[^/?#]+)'
_TESTS = [{
# Unresolved Brightcove URL embed (formerly Zype), free
'url': 'https://www.thisoldhouse.com/furniture/21017078/how-to-build-a-storage-bench',
'info_dict': {
'id': '5dcdddf673c3f956ef5db202',
'id': '6325298523112',
'ext': 'mp4',
'title': 'How to Build a Storage Bench',
'description': 'In the workshop, Tom Silva and Kevin O\'Connor build a storage bench for an entryway.',
'timestamp': 1442548800,
'upload_date': '20150918',
'duration': 674,
'view_count': int,
'average_rating': 0,
'thumbnail': r're:^https?://.*\.jpg\?\d+$',
'display_id': 'how-to-build-a-storage-bench',
'timestamp': 1681793639,
'upload_date': '20230418',
'duration': 674.54,
'tags': 'count:11',
'uploader_id': '6314471934001',
'thumbnail': r're:^https?://.*\.jpg',
},
'params': {
'skip_download': True,
},
}, {
# Brightcove embed, authwalled
'url': 'https://www.thisoldhouse.com/glen-ridge-generational/99537/s45-e17-multi-generational',
'info_dict': {
'id': '6349675446112',
'ext': 'mp4',
'title': 'E17 | Glen Ridge Generational | Multi-Generational',
'description': 'md5:53c6bc2e8031f3033d693d9a3563222c',
'timestamp': 1711382202,
'upload_date': '20240325',
'duration': 1422.229,
'tags': 'count:13',
'uploader_id': '6314471934001',
'thumbnail': r're:^https?://.*\.jpg',
},
'expected_warnings': ['Login with password is not supported for this website'],
'params': {
'skip_download': True,
},
'skip': 'Requires subscription',
}, {
# Page no longer has video
'url': 'https://www.thisoldhouse.com/watch/arlington-arts-crafts-arts-and-crafts-class-begins',
@@ -98,7 +120,15 @@ class ThisOldHouseIE(InfoExtractor):
video_url, video_id = self._search_regex(
r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?thisoldhouse\.(?:chorus\.build|com)/videos/zype/([0-9a-f]{24})[^\'"]*)[\'"]',
webpage, 'video url', group=(1, 2))
video_url = self._request_webpage(HEADRequest(video_url), video_id, 'Resolving Zype URL').url
webpage, 'zype url', group=(1, 2), default=(None, None))
if video_url:
video_url = self._request_webpage(HEADRequest(video_url), video_id, 'Resolving Zype URL').url
return self.url_result(video_url, ZypeIE, video_id)
return self.url_result(video_url, ZypeIE, video_id)
video_url, video_id = self._search_regex([
r'<iframe[^>]+src=[\'"]((?:https?:)?//players\.brightcove\.net/\d+/\w+/index\.html\?videoId=(\d+))',
r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)thisoldhouse\.com/videos/brightcove/(\d+))'],
webpage, 'iframe url', group=(1, 2))
if not parse_qs(video_url).get('videoId'):
video_url = self._request_webpage(HEADRequest(video_url), video_id, 'Resolving Brightcove URL').url
return self.url_result(smuggle_url(video_url, {'referrer': url}), BrightcoveNewIE, video_id)

View File

@@ -4,6 +4,7 @@ import random
import re
import string
import time
import uuid
from .common import InfoExtractor
from ..compat import compat_urllib_parse_urlparse
@@ -30,19 +31,65 @@ from ..utils import (
class TikTokBaseIE(InfoExtractor):
_APP_VERSIONS = [('26.1.3', '260103'), ('26.1.2', '260102'), ('26.1.1', '260101'), ('25.6.2', '250602')]
_WORKING_APP_VERSION = None
_APP_NAME = 'trill'
_AID = 1180
_UPLOADER_URL_FORMAT = 'https://www.tiktok.com/@%s'
_WEBPAGE_HOST = 'https://www.tiktok.com/'
QUALITIES = ('360p', '540p', '720p', '1080p')
_APP_INFO_DEFAULTS = {
# unique "install id"
'iid': None,
# TikTok (KR/PH/TW/TH/VN) = trill, TikTok (rest of world) = musical_ly, Douyin = aweme
'app_name': 'musical_ly',
'app_version': '34.1.2',
'manifest_app_version': '2023401020',
# "app id": aweme = 1128, trill = 1180, musical_ly = 1233, universal = 0
'aid': '0',
}
_KNOWN_APP_INFO = [
'7351144126450059040',
'7351149742343391009',
'7351153174894626592',
]
_APP_INFO_POOL = None
_APP_INFO = None
_APP_USER_AGENT = None
@property
def _API_HOSTNAME(self):
return self._configuration_arg(
'api_hostname', ['api22-normal-c-useast2a.tiktokv.com'], ie_key=TikTokIE)[0]
def _get_next_app_info(self):
if self._APP_INFO_POOL is None:
defaults = {
key: self._configuration_arg(key, [default], ie_key=TikTokIE)[0]
for key, default in self._APP_INFO_DEFAULTS.items()
if key != 'iid'
}
app_info_list = (
self._configuration_arg('app_info', ie_key=TikTokIE)
or random.sample(self._KNOWN_APP_INFO, len(self._KNOWN_APP_INFO)))
self._APP_INFO_POOL = [
{**defaults, **dict(
(k, v) for k, v in zip(self._APP_INFO_DEFAULTS, app_info.split('/')) if v
)} for app_info in app_info_list
]
if not self._APP_INFO_POOL:
return False
self._APP_INFO = self._APP_INFO_POOL.pop(0)
app_name = self._APP_INFO['app_name']
version = self._APP_INFO['manifest_app_version']
if app_name == 'musical_ly':
package = f'com.zhiliaoapp.musically/{version}'
else: # trill, aweme
package = f'com.ss.android.ugc.{app_name}/{version}'
self._APP_USER_AGENT = f'{package} (Linux; U; Android 13; en_US; Pixel 7; Build/TD1A.220804.031; Cronet/58.0.2991.0)'
return True
@staticmethod
def _create_url(user_id, video_id):
return f'https://www.tiktok.com/@{user_id or "_"}/video/{video_id}'
@@ -58,7 +105,7 @@ class TikTokBaseIE(InfoExtractor):
'universal data', display_id, end_pattern=r'</script>', default={}),
('__DEFAULT_SCOPE__', {dict})) or {}
def _call_api_impl(self, ep, query, manifest_app_version, video_id, fatal=True,
def _call_api_impl(self, ep, query, video_id, fatal=True,
note='Downloading API JSON', errnote='Unable to download API page'):
self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choices('0123456789abcdef', k=160)))
webpage_cookies = self._get_cookies(self._WEBPAGE_HOST)
@@ -67,80 +114,85 @@ class TikTokBaseIE(InfoExtractor):
return self._download_json(
'https://%s/aweme/v1/%s/' % (self._API_HOSTNAME, ep), video_id=video_id,
fatal=fatal, note=note, errnote=errnote, headers={
'User-Agent': f'com.ss.android.ugc.{self._APP_NAME}/{manifest_app_version} (Linux; U; Android 13; en_US; Pixel 7; Build/TD1A.220804.031; Cronet/58.0.2991.0)',
'User-Agent': self._APP_USER_AGENT,
'Accept': 'application/json',
}, query=query)
def _build_api_query(self, query, app_version, manifest_app_version):
def _build_api_query(self, query):
return {
**query,
'version_name': app_version,
'version_code': manifest_app_version,
'build_number': app_version,
'manifest_version_code': manifest_app_version,
'update_version_code': manifest_app_version,
'openudid': ''.join(random.choices('0123456789abcdef', k=16)),
'uuid': ''.join(random.choices(string.digits, k=16)),
'_rticket': int(time.time() * 1000),
'ts': int(time.time()),
'device_brand': 'Google',
'device_type': 'Pixel 7',
'device_platform': 'android',
'os': 'android',
'ssmix': 'a',
'_rticket': int(time.time() * 1000),
'cdid': str(uuid.uuid4()),
'channel': 'googleplay',
'aid': self._APP_INFO['aid'],
'app_name': self._APP_INFO['app_name'],
'version_code': ''.join((f'{int(v):02d}' for v in self._APP_INFO['app_version'].split('.'))),
'version_name': self._APP_INFO['app_version'],
'manifest_version_code': self._APP_INFO['manifest_app_version'],
'update_version_code': self._APP_INFO['manifest_app_version'],
'ab_version': self._APP_INFO['app_version'],
'resolution': '1080*2400',
'dpi': 420,
'os_version': '13',
'os_api': '29',
'carrier_region': 'US',
'sys_region': 'US',
'region': 'US',
'app_name': self._APP_NAME,
'app_language': 'en',
'device_type': 'Pixel 7',
'device_brand': 'Google',
'language': 'en',
'timezone_name': 'America/New_York',
'timezone_offset': '-14400',
'channel': 'googleplay',
'os_api': '29',
'os_version': '13',
'ac': 'wifi',
'mcc_mnc': '310260',
'is_my_cn': 0,
'aid': self._AID,
'ssmix': 'a',
'as': 'a1qwert123',
'cp': 'cbfhckdckkde1',
'is_pad': '0',
'current_region': 'US',
'app_type': 'normal',
'sys_region': 'US',
'last_install_time': int(time.time()) - random.randint(86400, 1123200),
'timezone_name': 'America/New_York',
'residence': 'US',
'app_language': 'en',
'timezone_offset': '-14400',
'host_abi': 'armeabi-v7a',
'locale': 'en',
'ac2': 'wifi5g',
'uoo': '1',
'carrier_region': 'US',
'op_region': 'US',
'build_number': self._APP_INFO['app_version'],
'region': 'US',
'ts': int(time.time()),
'iid': self._APP_INFO['iid'],
'device_id': random.randint(7250000000000000000, 7351147085025500000),
'openudid': ''.join(random.choices('0123456789abcdef', k=16)),
}
def _call_api(self, ep, query, video_id, fatal=True,
note='Downloading API JSON', errnote='Unable to download API page'):
if not self._WORKING_APP_VERSION:
app_version = self._configuration_arg('app_version', [''], ie_key=TikTokIE.ie_key())[0]
manifest_app_version = self._configuration_arg('manifest_app_version', [''], ie_key=TikTokIE.ie_key())[0]
if app_version and manifest_app_version:
self._WORKING_APP_VERSION = (app_version, manifest_app_version)
self.write_debug('Imported app version combo from extractor arguments')
elif app_version or manifest_app_version:
self.report_warning('Only one of the two required version params are passed as extractor arguments', only_once=True)
if not self._APP_INFO and not self._get_next_app_info():
message = 'No working app info is available'
if fatal:
raise ExtractorError(message, expected=True)
else:
self.report_warning(message)
return
if self._WORKING_APP_VERSION:
app_version, manifest_app_version = self._WORKING_APP_VERSION
real_query = self._build_api_query(query, app_version, manifest_app_version)
return self._call_api_impl(ep, real_query, manifest_app_version, video_id, fatal, note, errnote)
for count, (app_version, manifest_app_version) in enumerate(self._APP_VERSIONS, start=1):
real_query = self._build_api_query(query, app_version, manifest_app_version)
max_tries = len(self._APP_INFO_POOL) + 1 # _APP_INFO_POOL + _APP_INFO
for count in itertools.count(1):
self.write_debug(str(self._APP_INFO))
real_query = self._build_api_query(query)
try:
res = self._call_api_impl(ep, real_query, manifest_app_version, video_id, fatal, note, errnote)
self._WORKING_APP_VERSION = (app_version, manifest_app_version)
return res
return self._call_api_impl(ep, real_query, video_id, fatal, note, errnote)
except ExtractorError as e:
if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0:
if count == len(self._APP_VERSIONS):
message = str(e.cause or e.msg)
if not self._get_next_app_info():
if fatal:
raise e
raise
else:
self.report_warning(str(e.cause or e.msg))
self.report_warning(message)
return
self.report_warning('%s. Retrying... (attempt %s of %s)' % (str(e.cause or e.msg), count, len(self._APP_VERSIONS)))
self.report_warning(f'{message}. Retrying... (attempt {count} of {max_tries})')
continue
raise e
raise
def _extract_aweme_app(self, aweme_id):
feed_list = self._call_api(
@@ -223,6 +275,7 @@ class TikTokBaseIE(InfoExtractor):
def extract_addr(addr, add_meta={}):
parsed_meta, res = parse_url_key(addr.get('url_key', ''))
is_bytevc2 = parsed_meta.get('vcodec') == 'bytevc2'
if res:
known_resolutions.setdefault(res, {}).setdefault('height', int_or_none(addr.get('height')))
known_resolutions[res].setdefault('width', int_or_none(addr.get('width')))
@@ -235,8 +288,11 @@ class TikTokBaseIE(InfoExtractor):
'acodec': 'aac',
'source_preference': -2 if 'aweme/v1' in url else -1, # Downloads from API might get blocked
**add_meta, **parsed_meta,
# bytevc2 is bytedance's proprietary (unplayable) video codec
'preference': -100 if is_bytevc2 else -1,
'format_note': join_nonempty(
add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else None, delim=' '),
add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else None,
'(UNPLAYABLE)' if is_bytevc2 else None, delim=' '),
**audio_meta(url),
} for url in addr.get('url_list') or []]
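
Two small transforms in the new TikTok code are easy to sanity-check in isolation: `version_code` packs each dotted component of `app_version` into two zero-padded digits, and an `app_info` extractor argument is split on `/` into the `_APP_INFO_DEFAULTS` keys in order, with empty fields keeping their defaults:

```python
APP_INFO_DEFAULTS = {
    'iid': None,
    'app_name': 'musical_ly',
    'app_version': '34.1.2',
    'manifest_app_version': '2023401020',
    'aid': '0',
}

# '34.1.2' -> '340102'
version_code = ''.join(
    f'{int(v):02d}' for v in APP_INFO_DEFAULTS['app_version'].split('.'))
assert version_code == '340102'

# sample 'iid/app_name/app_version/manifest_app_version/aid' argument
app_info = '7351144126450059040/trill//2022600010/1180'
merged = {**APP_INFO_DEFAULTS,
          **{k: v for k, v in zip(APP_INFO_DEFAULTS, app_info.split('/')) if v}}
assert merged['app_version'] == '34.1.2'  # empty field falls back to default
assert merged['app_name'] == 'trill'
```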

View File

@@ -191,17 +191,25 @@ class TwitchBaseIE(InfoExtractor):
}] if thumbnail else None
def _extract_twitch_m3u8_formats(self, path, video_id, token, signature):
return self._extract_m3u8_formats(
formats = self._extract_m3u8_formats(
f'{self._USHER_BASE}/{path}/{video_id}.m3u8', video_id, 'mp4', query={
'allow_source': 'true',
'allow_audio_only': 'true',
'allow_spectre': 'true',
'p': random.randint(1000000, 10000000),
'platform': 'web',
'player': 'twitchweb',
'supported_codecs': 'av1,h265,h264',
'playlist_include_framerate': 'true',
'sig': signature,
'token': token,
})
for fmt in formats:
if fmt.get('vcodec') and fmt['vcodec'].startswith('av01'):
# mpegts does not yet have proper support for av1
fmt['downloader_options'] = {'ffmpeg_args_out': ['-f', 'mp4']}
return formats
class TwitchVodIE(TwitchBaseIE):

View File

@@ -707,6 +707,7 @@ class VKWallPostIE(VKBaseIE):
class VKPlayBaseIE(InfoExtractor):
_BASE_URL_RE = r'https?://(?:vkplay\.live|live\.vkplay\.ru)/'
_RESOLUTIONS = {
'tiny': '256x144',
'lowest': '426x240',
@@ -765,7 +766,7 @@ class VKPlayBaseIE(InfoExtractor):
class VKPlayIE(VKPlayBaseIE):
_VALID_URL = r'https?://vkplay\.live/(?P<username>[^/#?]+)/record/(?P<id>[a-f0-9-]+)'
_VALID_URL = rf'{VKPlayBaseIE._BASE_URL_RE}(?P<username>[^/#?]+)/record/(?P<id>[\da-f-]+)'
_TESTS = [{
'url': 'https://vkplay.live/zitsmann/record/f5e6e3b5-dc52-4d14-965d-0680dd2882da',
'info_dict': {
@@ -776,13 +777,16 @@ class VKPlayIE(VKPlayBaseIE):
'uploader_id': '13159830',
'release_timestamp': 1683461378,
'release_date': '20230507',
'thumbnail': r're:https://images.vkplay.live/public_video_stream/record/f5e6e3b5-dc52-4d14-965d-0680dd2882da/preview\?change_time=\d+',
'thumbnail': r're:https://[^/]+/public_video_stream/record/f5e6e3b5-dc52-4d14-965d-0680dd2882da/preview',
'duration': 10608,
'view_count': int,
'like_count': int,
'categories': ['Atomic Heart'],
},
'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://live.vkplay.ru/lebwa/record/33a4e4ce-e3ef-49db-bb14-f006cc6fabc9/records',
'only_matching': True,
}]
def _real_extract(self, url):
@@ -802,7 +806,7 @@ class VKPlayIE(VKPlayBaseIE):
class VKPlayLiveIE(VKPlayBaseIE):
_VALID_URL = r'https?://vkplay\.live/(?P<id>[^/#?]+)/?(?:[#?]|$)'
_VALID_URL = rf'{VKPlayBaseIE._BASE_URL_RE}(?P<id>[^/#?]+)/?(?:[#?]|$)'
_TESTS = [{
'url': 'https://vkplay.live/bayda',
'info_dict': {
@@ -813,7 +817,7 @@ class VKPlayLiveIE(VKPlayBaseIE):
'uploader_id': '12279401',
'release_timestamp': 1687209962,
'release_date': '20230619',
'thumbnail': r're:https://images.vkplay.live/public_video_stream/12279401/preview\?change_time=\d+',
'thumbnail': r're:https://[^/]+/public_video_stream/12279401/preview',
'view_count': int,
'concurrent_view_count': int,
'like_count': int,
@@ -822,6 +826,9 @@ class VKPlayLiveIE(VKPlayBaseIE):
},
'skip': 'livestream',
'params': {'skip_download': True},
}, {
'url': 'https://live.vkplay.ru/lebwa',
'only_matching': True,
}]
def _real_extract(self, url):

View File

@@ -16,6 +16,7 @@ from ..utils import (
join_nonempty,
jwt_encode_hs256,
make_archive_id,
merge_dicts,
parse_age_limit,
parse_iso8601,
str_or_none,
@@ -425,3 +426,64 @@ class DagelijkseKostIE(VRTBaseIE):
['description', 'twitter:description', 'og:description'], webpage),
'_old_archive_ids': [make_archive_id('Canvas', video_id)],
}
class Radio1BeIE(VRTBaseIE):
_VALID_URL = r'https?://radio1\.be/(?:lees|luister/select)/(?P<id>[\w/-]+)'
_TESTS = [{
'url': 'https://radio1.be/luister/select/de-ochtend/komt-n-va-volgend-jaar-op-in-wallonie',
'info_dict': {
'id': 'eb6c22e9-544f-44f4-af39-cf8cccd29e22',
'title': 'Komt N-VA volgend jaar op in Wallonië?',
'display_id': 'de-ochtend/komt-n-va-volgend-jaar-op-in-wallonie',
'description': 'md5:b374ea1c9302f38362df9dea1931468e',
'thumbnail': r're:https?://cds\.vrt\.radio/[^/#\?&]+'
},
'playlist_mincount': 1
}, {
'url': 'https://radio1.be/lees/europese-unie-wil-onmiddellijke-humanitaire-pauze-en-duurzaam-staakt-het-vuren-in-gaza?view=web',
'info_dict': {
'id': '5d47f102-dbdb-4fa0-832b-26c1870311f2',
'title': 'Europese Unie wil "onmiddellijke humanitaire pauze" en "duurzaam staakt-het-vuren" in Gaza',
'description': 'md5:1aad1fae7d39edeffde5d3e67d276b64',
'thumbnail': r're:https?://cds\.vrt\.radio/[^/#\?&]+',
'display_id': 'europese-unie-wil-onmiddellijke-humanitaire-pauze-en-duurzaam-staakt-het-vuren-in-gaza'
},
'playlist_mincount': 1
}]
def _extract_video_entries(self, next_js_data, display_id):
video_data = traverse_obj(
next_js_data, ((None, ('paragraphs', ...)), {lambda x: x if x['mediaReference'] else None}))
for data in video_data:
media_reference = data['mediaReference']
formats, subtitles = self._extract_formats_and_subtitles(
self._call_api(media_reference), display_id)
yield {
'id': media_reference,
'formats': formats,
'subtitles': subtitles,
**traverse_obj(data, {
'title': ('title', {str}),
'description': ('body', {clean_html})
}),
}
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
next_js_data = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['item']
return self.playlist_result(
self._extract_video_entries(next_js_data, display_id), **merge_dicts(traverse_obj(
next_js_data, ({
'id': ('id', {str}),
'title': ('title', {str}),
'description': (('description', 'content'), {clean_html}),
}), get_all=False), {
'display_id': display_id,
'title': self._html_search_meta(['name', 'og:title', 'twitter:title'], webpage),
'description': self._html_search_meta(['description', 'og:description', 'twitter:description'], webpage),
'thumbnail': self._html_search_meta(['og:image', 'twitter:image'], webpage),
}))

View File

@@ -1,6 +1,6 @@
import base64
import re
import urllib.parse
from base64 import b64decode
from .common import InfoExtractor
from ..networking import HEADRequest
@@ -371,7 +371,7 @@ class WistiaChannelIE(WistiaBaseIE):
webpage = self._download_webpage(f'https://fast.wistia.net/embed/channel/{channel_id}', channel_id)
data = self._parse_json(
self._search_regex(r'wchanneljsonp-%s\'\]\s*=[^\"]*\"([A-Za-z0-9=/]*)' % channel_id, webpage, 'jsonp', channel_id),
channel_id, transform_source=lambda x: urllib.parse.unquote_plus(b64decode(x).decode('utf-8')))
channel_id, transform_source=lambda x: urllib.parse.unquote_plus(base64.b64decode(x).decode('utf-8')))
# XXX: can there be more than one series?
series = traverse_obj(data, ('series', 0), default={})
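
The Wistia channel payload is a base64 blob whose decoded bytes are additionally percent-plus-encoded, hence `unquote_plus(b64decode(...))` as the `transform_source`. A round trip demonstrating the encoding (sample JSON, not real channel data):

```python
import base64
import urllib.parse

blob = base64.b64encode(
    urllib.parse.quote_plus('{"series": []}').encode()).decode()
decoded = urllib.parse.unquote_plus(base64.b64decode(blob).decode('utf-8'))
print(decoded)  # '{"series": []}' -- ready for _parse_json
```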

View File

@@ -15,35 +15,35 @@ class XVideosIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://
(?:
(?:[^/]+\.)?xvideos2?\.com/video|
(?:www\.)?xvideos\.es/video|
(?:[^/]+\.)?xvideos2?\.com/video\.?|
(?:www\.)?xvideos\.es/video\.?|
(?:www|flashservice)\.xvideos\.com/embedframe/|
static-hw\.xvideos\.com/swf/xv-player\.swf\?.*?\bid_video=
)
(?P<id>[0-9]+)
(?P<id>[0-9a-z]+)
'''
_TESTS = [{
'url': 'https://www.xvideos.com/video4588838/motorcycle_guy_cucks_influencer_steals_his_gf',
'md5': '14cea69fcb84db54293b1e971466c2e1',
'url': 'http://xvideos.com/video.ucuvbkfda4e/a_beautiful_red-haired_stranger_was_refused_but_still_came_to_my_room_for_sex',
'md5': '396255a900a6bddb3e98985f0b86c3fd',
'info_dict': {
'id': '4588838',
'id': 'ucuvbkfda4e',
'ext': 'mp4',
'title': 'Motorcycle Guy Cucks Influencer, Steals his GF',
'duration': 108,
'title': 'A Beautiful Red-Haired Stranger Was Refused, But Still Came To My Room For Sex',
'duration': 1238,
'age_limit': 18,
'thumbnail': r're:^https://img-hw.xvideos-cdn.com/.+\.jpg',
'thumbnail': r're:^https://cdn\d+-pic.xvideos-cdn.com/.+\.jpg',
}
}, {
# Broken HLS formats
'url': 'https://www.xvideos.com/video65982001/what_s_her_name',
'md5': 'b82d7d7ef7d65a84b1fa6965f81f95a5',
'md5': '56742808292c8fa1418e4538c262c58b',
'info_dict': {
'id': '65982001',
'ext': 'mp4',
'title': 'what\'s her name?',
'duration': 120,
'age_limit': 18,
'thumbnail': r're:^https://img-hw.xvideos-cdn.com/.+\.jpg',
'thumbnail': r're:^https://cdn\d+-pic.xvideos-cdn.com/.+\.jpg',
}
}, {
'url': 'https://flashservice.xvideos.com/embedframe/4588838',
@@ -90,6 +90,18 @@ class XVideosIE(InfoExtractor):
}, {
'url': 'https://de.xvideos.com/video4588838/biker_takes_his_girl',
'only_matching': True
}, {
'url': 'https://flashservice.xvideos.com/embedframe/ucuvbkfda4e',
'only_matching': True,
}, {
'url': 'https://www.xvideos.com/embedframe/ucuvbkfda4e',
'only_matching': True,
}, {
'url': 'http://static-hw.xvideos.com/swf/xv-player.swf?id_video=ucuvbkfda4e',
'only_matching': True,
}, {
'url': 'https://xvideos.es/video.ucuvbkfda4e/a_beautiful_red-haired_stranger_was_refused_but_still_came_to_my_room_for_sex',
'only_matching': True
}]
def _real_extract(self, url):

View File

@@ -2,7 +2,7 @@ import base64
import calendar
import collections
import copy
import datetime
import datetime as dt
import enum
import hashlib
import itertools
@@ -33,6 +33,7 @@ from ..utils import (
clean_html,
datetime_from_str,
dict_get,
filesize_from_tbr,
filter_dict,
float_or_none,
format_field,
@@ -55,6 +56,7 @@ from ..utils import (
str_to_int,
strftime_or_none,
traverse_obj,
try_call,
try_get,
unescapeHTML,
unified_strdate,
@@ -922,10 +924,10 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
def _parse_time_text(self, text):
if not text:
return
dt = self.extract_relative_time(text)
dt_ = self.extract_relative_time(text)
timestamp = None
if isinstance(dt, datetime.datetime):
timestamp = calendar.timegm(dt.timetuple())
if isinstance(dt_, dt.datetime):
timestamp = calendar.timegm(dt_.timetuple())
if timestamp is None:
timestamp = (
@@ -3631,8 +3633,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
yt_query = {
'videoId': video_id,
}
if _split_innertube_client(client)[0] == 'android':
yt_query['params'] = 'CgIQBg=='
if _split_innertube_client(client)[0] in ('android', 'android_embedscreen'):
yt_query['params'] = 'CgIIAQ=='
pp_arg = self._configuration_arg('player_params', [None], casesense=True)[0]
if pp_arg:
@@ -3863,16 +3865,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
video_id=video_id, only_once=True)
throttled = True
tbr = float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1024)
tbr = float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000)
language_preference = (
10 if audio_track.get('audioIsDefault') and 10
else -10 if 'descriptive' in (audio_track.get('displayName') or '').lower() and -10
else -1)
format_duration = traverse_obj(fmt, ('approxDurationMs', {lambda x: float_or_none(x, 1000)}))
# Some formats may have much smaller duration than others (possibly damaged during encoding)
# E.g. 2-nOtRESiUc Ref: https://github.com/yt-dlp/yt-dlp/issues/2823
# Make sure to avoid false positives with small duration differences.
# E.g. __2ABJjxzNo, ySuUZEjARPY
is_damaged = try_get(fmt, lambda x: float(x['approxDurationMs']) / duration < 500)
is_damaged = try_call(lambda: format_duration < duration // 2)
if is_damaged:
self.report_warning(
f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True)
@@ -3902,6 +3905,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'quality': q(quality) - bool(fmt.get('isDrc')) / 2,
'has_drm': bool(fmt.get('drmFamilies')),
'tbr': tbr,
'filesize_approx': filesize_from_tbr(tbr, format_duration),
'url': fmt_url,
'width': int_or_none(fmt.get('width')),
'language': join_nonempty(audio_track.get('id', '').split('.')[0],
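
With `tbr` now computed in decimal kilobits (divisor 1000, not 1024), `filesize_approx` follows directly from bitrate times duration. A re-derivation of the arithmetic, assuming `filesize_from_tbr` converts kbps and seconds to bytes as sketched below (the real helper lives in `yt_dlp.utils`):

```python
def filesize_from_tbr_sketch(tbr, duration):
    """Approximate byte size from bitrate (kbps) and duration (seconds)."""
    if tbr is None or duration is None:
        return None
    return int(duration * tbr * (1000 / 8))  # kbps -> bytes/s, then scale

# 128 kbps for 300 s: 128_000 bits/s * 300 s / 8 = 4_800_000 bytes
assert filesize_from_tbr_sketch(128, 300) == 4_800_000
```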
@@ -4596,7 +4600,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if upload_date and live_status not in ('is_live', 'post_live', 'is_upcoming'):
# Newly uploaded videos' HLS formats are potentially problematic and need to be checked
upload_datetime = datetime_from_str(upload_date).replace(tzinfo=datetime.timezone.utc)
upload_datetime = datetime_from_str(upload_date).replace(tzinfo=dt.timezone.utc)
if upload_datetime >= datetime_from_str('today-2days'):
for fmt in info['formats']:
if fmt.get('protocol') == 'm3u8_native':
@@ -6997,7 +7001,7 @@ class YoutubeSearchIE(YoutubeTabBaseInfoExtractor, SearchInfoExtractor):
IE_DESC = 'YouTube search'
IE_NAME = 'youtube:search'
_SEARCH_KEY = 'ytsearch'
_SEARCH_PARAMS = 'EgIQAQ%3D%3D' # Videos only
_SEARCH_PARAMS = 'EgIQAfABAQ==' # Videos only
_TESTS = [{
'url': 'ytsearch5:youtube-dl test video',
'playlist_count': 5,
@@ -7005,6 +7009,14 @@ class YoutubeSearchIE(YoutubeTabBaseInfoExtractor, SearchInfoExtractor):
'id': 'youtube-dl test video',
'title': 'youtube-dl test video',
}
}, {
'note': 'Suicide/self-harm search warning',
'url': 'ytsearch1:i hate myself and i wanna die',
'playlist_count': 1,
'info_dict': {
'id': 'i hate myself and i wanna die',
'title': 'i hate myself and i wanna die',
}
}]
@@ -7012,7 +7024,7 @@ class YoutubeSearchDateIE(YoutubeTabBaseInfoExtractor, SearchInfoExtractor):
IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
_SEARCH_KEY = 'ytsearchdate'
IE_DESC = 'YouTube search, newest videos first'
_SEARCH_PARAMS = 'CAISAhAB' # Videos only, sorted by date
_SEARCH_PARAMS = 'CAISAhAB8AEB' # Videos only, sorted by date
_TESTS = [{
'url': 'ytsearchdate5:youtube-dl test video',
'playlist_count': 5,

View File

@@ -1,5 +1,5 @@
import re
from uuid import uuid4
import uuid
from .common import InfoExtractor
from ..compat import compat_str
@@ -53,7 +53,7 @@ class ZattooPlatformBaseIE(InfoExtractor):
self._request_webpage(
'%s/zapi/v3/session/hello' % self._host_url(), None,
'Opening session', data=urlencode_postdata({
'uuid': compat_str(uuid4()),
'uuid': compat_str(uuid.uuid4()),
'lang': 'en',
'app_version': '1.8.2',
'format': 'json',