1
0
mirror of https://github.com/yt-dlp/yt-dlp.git synced 2025-12-31 03:51:20 +00:00

Merged master

This commit is contained in:
zer0-delta
2024-11-04 23:10:49 +00:00
282 changed files with 11558 additions and 4664 deletions

View File

@@ -4,6 +4,7 @@ import copy
import datetime as dt
import errno
import fileinput
import functools
import http.cookiejar
import io
import itertools
@@ -24,9 +25,9 @@ import traceback
import unicodedata
from .cache import Cache
from .compat import functools, urllib # isort: split
from .compat import urllib # isort: split
from .compat import compat_os_name, urllib_req_to_req
from .cookies import LenientSimpleCookie, load_cookies
from .cookies import CookieLoadError, LenientSimpleCookie, load_cookies
from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name
from .downloader.rtmp import rtmpdump_version
from .extractor import gen_extractor_classes, get_info_extractor
@@ -153,12 +154,11 @@ from .utils import (
try_get,
url_basename,
variadic,
version_tuple,
windows_enable_vt_mode,
write_json_file,
write_string,
)
from .utils._utils import _YDLLogger
from .utils._utils import _UnsafeExtensionError, _YDLLogger
from .utils.networking import (
HTTPHeaderDict,
clean_headers,
@@ -171,6 +171,20 @@ if compat_os_name == 'nt':
import ctypes
def _catch_unsafe_extension_error(func):
@functools.wraps(func)
def wrapper(self, *args, **kwargs):
try:
return func(self, *args, **kwargs)
except _UnsafeExtensionError as error:
self.report_error(
f'The extracted extension ({error.extension!r}) is unusual '
'and will be skipped for safety reasons. '
f'If you believe this is an error{bug_reports_message(",")}')
return wrapper
class YoutubeDL:
"""YoutubeDL class.
@@ -236,7 +250,7 @@ class YoutubeDL:
format_sort_force: Force the given format_sort. see "Sorting Formats"
for more details.
prefer_free_formats: Whether to prefer video formats with free containers
over non-free ones of same quality.
over non-free ones of the same quality.
allow_multiple_video_streams: Allow multiple video streams to be merged
into a single file
allow_multiple_audio_streams: Allow multiple audio streams to be merged
@@ -270,7 +284,7 @@ class YoutubeDL:
rejecttitle: Reject downloads for matching titles.
logger: Log messages to a logging.Logger instance.
logtostderr: Print everything to stderr instead of stdout.
consoletitle: Display progress in console window's titlebar.
consoletitle: Display progress in the console window's titlebar.
writedescription: Write the video description to a .description file
writeinfojson: Write the video description to a .info.json file
clean_infojson: Remove internal metadata from the infojson
@@ -437,7 +451,8 @@ class YoutubeDL:
Can also just be a single color policy,
in which case it applies to all outputs.
Valid stream names are 'stdout' and 'stderr'.
Valid color policies are one of 'always', 'auto', 'no_color' or 'never'.
Valid color policies are one of 'always', 'auto',
'no_color', 'never', 'auto-tty' or 'no_color-tty'.
geo_bypass: Bypass geographic restriction via faking X-Forwarded-For
HTTP header
geo_bypass_country:
@@ -453,8 +468,9 @@ class YoutubeDL:
Set the value to 'native' to use the native downloader
compat_opts: Compatibility options. See "Differences in default behavior".
The following options do not work when used through the API:
filename, abort-on-error, multistreams, no-live-chat, format-sort
no-clean-infojson, no-playlist-metafiles, no-keep-subs, no-attach-info-json.
filename, abort-on-error, multistreams, no-live-chat,
format-sort, no-clean-infojson, no-playlist-metafiles,
no-keep-subs, no-attach-info-json, allow-unsafe-ext, prefer-vp9-sort.
Refer __init__.py for their implementation
progress_template: Dictionary of templates for progress outputs.
Allowed keys are 'download', 'postprocess',
@@ -496,7 +512,7 @@ class YoutubeDL:
The following options are used by the extractors:
extractor_retries: Number of times to retry for known errors (default: 3)
dynamic_mpd: Whether to process dynamic DASH manifests (default: True)
hls_split_discontinuity: Split HLS playlists to different formats at
hls_split_discontinuity: Split HLS playlists into different formats at
discontinuities such as ad breaks (default: False)
extractor_args: A dictionary of arguments to be passed to the extractors.
See "EXTRACTOR ARGUMENTS" for details.
@@ -536,7 +552,7 @@ class YoutubeDL:
include_ads: - Doesn't work
Download ads as well
call_home: - Not implemented
Boolean, true iff we are allowed to contact the
Boolean, true if we are allowed to contact the
yt-dlp servers for debugging.
post_hooks: - Register a custom postprocessor
A list of functions that get called as the final step
@@ -581,8 +597,9 @@ class YoutubeDL:
'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx', 'rows', 'columns',
'player_url', 'protocol', 'fragment_base_url', 'fragments', 'is_from_start', 'is_dash_periods', 'request_data',
'preference', 'language', 'language_preference', 'quality', 'source_preference', 'cookies',
'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'extra_param_to_segment_url', 'hls_aes', 'downloader_options',
'page_url', 'app', 'play_path', 'tc_url', 'flash_version', 'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time',
'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'extra_param_to_segment_url', 'extra_param_to_key_url',
'hls_aes', 'downloader_options', 'page_url', 'app', 'play_path', 'tc_url', 'flash_version',
'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time',
}
_deprecated_multivalue_fields = {
'album_artist': 'album_artists',
@@ -642,12 +659,15 @@ class YoutubeDL:
self.params['color'] = 'no_color'
term_allow_color = os.getenv('TERM', '').lower() != 'dumb'
no_color = bool(os.getenv('NO_COLOR'))
base_no_color = bool(os.getenv('NO_COLOR'))
def process_color_policy(stream):
stream_name = {sys.stdout: 'stdout', sys.stderr: 'stderr'}[stream]
policy = traverse_obj(self.params, ('color', (stream_name, None), {str}), get_all=False)
if policy in ('auto', None):
policy = traverse_obj(self.params, ('color', (stream_name, None), {str}, any)) or 'auto'
if policy in ('auto', 'auto-tty', 'no_color-tty'):
no_color = base_no_color
if policy.endswith('tty'):
no_color = policy.startswith('no_color')
if term_allow_color and supports_terminal_sequences(stream):
return 'no_color' if no_color else True
return False
@@ -1398,6 +1418,7 @@ class YoutubeDL:
outtmpl, info_dict = self.prepare_outtmpl(outtmpl, info_dict, *args, **kwargs)
return self.escape_outtmpl(outtmpl) % info_dict
@_catch_unsafe_extension_error
def _prepare_filename(self, info_dict, *, outtmpl=None, tmpl_type=None):
assert None in (outtmpl, tmpl_type), 'outtmpl and tmpl_type are mutually exclusive'
if outtmpl is None:
@@ -1602,7 +1623,7 @@ class YoutubeDL:
while True:
try:
return func(self, *args, **kwargs)
except (DownloadCancelled, LazyList.IndexError, PagedList.IndexError):
except (CookieLoadError, DownloadCancelled, LazyList.IndexError, PagedList.IndexError):
raise
except ReExtractInfo as e:
if e.expected:
@@ -1925,6 +1946,8 @@ class YoutubeDL:
'playlist_title': ie_result.get('title'),
'playlist_uploader': ie_result.get('uploader'),
'playlist_uploader_id': ie_result.get('uploader_id'),
'playlist_channel': ie_result.get('channel'),
'playlist_channel_id': ie_result.get('channel_id'),
**kwargs,
}
if strict:
@@ -2170,9 +2193,8 @@ class YoutubeDL:
or all(f.get('acodec') == 'none' for f in formats)), # OR, No formats with audio
}))
def _default_format_spec(self, info_dict, download=True):
download = download and not self.params.get('simulate')
prefer_best = download and (
def _default_format_spec(self, info_dict):
prefer_best = (
self.params['outtmpl']['default'] == '-'
or info_dict.get('is_live') and not self.params.get('live_from_start'))
@@ -2180,7 +2202,7 @@ class YoutubeDL:
merger = FFmpegMergerPP(self)
return merger.available and merger.can_merge()
if not prefer_best and download and not can_merge():
if not prefer_best and not can_merge():
prefer_best = True
formats = self._get_formats(info_dict)
evaluate_formats = lambda spec: self._select_formats(formats, self.build_format_selector(spec))
@@ -2827,13 +2849,10 @@ class YoutubeDL:
sanitize_string_field(fmt, 'format_id')
sanitize_numeric_fields(fmt)
fmt['url'] = sanitize_url(fmt['url'])
if fmt.get('ext') is None:
fmt['ext'] = determine_ext(fmt['url']).lower()
FormatSorter._fill_sorting_fields(fmt)
if fmt['ext'] in ('aac', 'opus', 'mp3', 'flac', 'vorbis'):
if fmt.get('acodec') is None:
fmt['acodec'] = fmt['ext']
if fmt.get('protocol') is None:
fmt['protocol'] = determine_protocol(fmt)
if fmt.get('resolution') is None:
fmt['resolution'] = self.format_resolution(fmt, default=None)
if fmt.get('dynamic_range') is None and fmt.get('vcodec') != 'none':
@@ -2939,7 +2958,7 @@ class YoutubeDL:
continue
if format_selector is None:
req_format = self._default_format_spec(info_dict, download=download)
req_format = self._default_format_spec(info_dict)
self.write_debug(f'Default format spec: {req_format}')
format_selector = self.build_format_selector(req_format)
@@ -3149,11 +3168,12 @@ class YoutubeDL:
if test:
verbose = self.params.get('verbose')
quiet = self.params.get('quiet') or not verbose
params = {
'test': True,
'quiet': self.params.get('quiet') or not verbose,
'quiet': quiet,
'verbose': verbose,
'noprogress': not verbose,
'noprogress': quiet,
'nopart': True,
'skip_unavailable_fragments': False,
'keep_fragments': False,
@@ -3188,6 +3208,7 @@ class YoutubeDL:
os.remove(file)
return None
@_catch_unsafe_extension_error
def process_info(self, info_dict):
"""Process a single resolved IE result. (Modifies it in-place)"""
@@ -3555,6 +3576,8 @@ class YoutubeDL:
def wrapper(*args, **kwargs):
try:
res = func(*args, **kwargs)
except CookieLoadError:
raise
except UnavailableVideoError as e:
self.report_error(e)
except DownloadCancelled as e:
@@ -4043,6 +4066,10 @@ class YoutubeDL:
write_debug(f'Proxy map: {self.proxies}')
write_debug(f'Request Handlers: {", ".join(rh.RH_NAME for rh in self._request_director.handlers.values())}')
if os.environ.get('YTDLP_NO_PLUGINS'):
write_debug('Plugins are forcibly disabled')
return
for plugin_type, plugins in {'Extractor': plugin_ies, 'Post-Processor': plugin_pps}.items():
display_list = ['{}{}'.format(
klass.__name__, '' if klass.__name__ == name else f' as {name}')
@@ -4058,17 +4085,6 @@ class YoutubeDL:
if plugin_dirs:
write_debug(f'Plugin directories: {plugin_dirs}')
# Not implemented
if False and self.params.get('call_home'):
ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode()
write_debug(f'Public IP address: {ipaddr}')
latest_version = self.urlopen(
'https://yt-dl.org/latest/version').read().decode()
if version_tuple(latest_version) > version_tuple(__version__):
self.report_warning(
f'You are using an outdated version (newest version: {latest_version})! '
'See https://yt-dl.org/update if you need help updating.')
@functools.cached_property
def proxies(self):
"""Global proxy configuration"""
@@ -4088,8 +4104,14 @@ class YoutubeDL:
@functools.cached_property
def cookiejar(self):
"""Global cookiejar instance"""
return load_cookies(
self.params.get('cookiefile'), self.params.get('cookiesfrombrowser'), self)
try:
return load_cookies(
self.params.get('cookiefile'), self.params.get('cookiesfrombrowser'), self)
except CookieLoadError as error:
cause = error.__context__
# compat: <=py3.9: `traceback.format_exception` has a different signature
self.report_error(str(cause), tb=''.join(traceback.format_exception(None, cause, cause.__traceback__)))
raise
@property
def _opener(self):

View File

@@ -1,8 +1,8 @@
import sys
if sys.version_info < (3, 8):
if sys.version_info < (3, 9):
raise ImportError(
f'You are using an unsupported version of Python. Only Python versions 3.8 and above are supported by yt-dlp') # noqa: F541
f'You are using an unsupported version of Python. Only Python versions 3.9 and above are supported by yt-dlp') # noqa: F541
__license__ = 'The Unlicense'
@@ -15,7 +15,7 @@ import re
import traceback
from .compat import compat_os_name
from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS
from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS, CookieLoadError
from .downloader.external import get_external_downloader
from .extractor import list_extractor_classes
from .extractor.adobepass import MSO_INFO
@@ -34,6 +34,7 @@ from .postprocessor import (
)
from .update import Updater
from .utils import (
Config,
NO_DEFAULT,
POSTPROCESS_WHEN,
DateRange,
@@ -64,6 +65,7 @@ from .utils import (
write_string,
)
from .utils.networking import std_headers
from .utils._utils import _UnsafeExtensionError
from .YoutubeDL import YoutubeDL
_IN_CLI = False
@@ -157,6 +159,9 @@ def set_compat_opts(opts):
opts.embed_infojson = False
if 'format-sort' in opts.compat_opts:
opts.format_sort.extend(FormatSorter.ytdl_default)
elif 'prefer-vp9-sort' in opts.compat_opts:
opts.format_sort.extend(FormatSorter._prefer_vp9_sort)
_video_multistreams_set = set_default_compat('multistreams', 'allow_multiple_video_streams', False, remove_compat=False)
_audio_multistreams_set = set_default_compat('multistreams', 'allow_multiple_audio_streams', False, remove_compat=False)
if _video_multistreams_set is False and _audio_multistreams_set is False:
@@ -234,6 +239,11 @@ def validate_options(opts):
validate_regex('format sorting', f, FormatSorter.regex)
# Postprocessor formats
if opts.convertsubtitles == 'none':
opts.convertsubtitles = None
if opts.convertthumbnails == 'none':
opts.convertthumbnails = None
validate_regex('merge output format', opts.merge_output_format,
r'({0})(/({0}))*'.format('|'.join(map(re.escape, FFmpegMergerPP.SUPPORTED_EXTS))))
validate_regex('audio format', opts.audioformat, FFmpegExtractAudioPP.FORMAT_RE)
@@ -467,7 +477,7 @@ def validate_options(opts):
default_downloader = ed.get_basename()
for policy in opts.color.values():
if policy not in ('always', 'auto', 'no_color', 'never'):
if policy not in ('always', 'auto', 'auto-tty', 'no_color', 'no_color-tty', 'never'):
raise ValueError(f'"{policy}" is not a valid color policy')
warnings, deprecation_warnings = [], []
@@ -593,6 +603,13 @@ def validate_options(opts):
if opts.ap_username is not None and opts.ap_password is None:
opts.ap_password = getpass.getpass('Type TV provider account password and press [Return]: ')
# compat option changes global state destructively; only allow from cli
if 'allow-unsafe-ext' in opts.compat_opts:
warnings.append(
'Using allow-unsafe-ext opens you up to potential attacks. '
'Use with great care!')
_UnsafeExtensionError.sanitize_extension = lambda x, prepend=False: x
return warnings, deprecation_warnings
@@ -954,6 +971,11 @@ def _real_main(argv=None):
parser, opts, all_urls, ydl_opts = parse_options(argv)
# HACK: Set the plugin dirs early on
# TODO(coletdjnz): remove when plugin globals system is implemented
if opts.plugin_dirs is not None:
Config._plugin_dirs = list(map(expand_path, opts.plugin_dirs))
# Dump user agent
if opts.dump_user_agent:
ua = traverse_obj(opts.headers, 'User-Agent', casesense=False, default=std_headers['User-Agent'])
@@ -1071,7 +1093,7 @@ def main(argv=None):
_IN_CLI = True
try:
_exit(*variadic(_real_main(argv)))
except DownloadError:
except (CookieLoadError, DownloadError):
_exit(1)
except SameFileError as e:
_exit(f'ERROR: {e}')

View File

@@ -230,11 +230,11 @@ def aes_gcm_decrypt_and_verify(data, key, tag, nonce):
iv_ctr = inc(j0)
decrypted_data = aes_ctr_decrypt(data, key, iv_ctr + [0] * (BLOCK_SIZE_BYTES - len(iv_ctr)))
pad_len = len(data) // 16 * 16
pad_len = (BLOCK_SIZE_BYTES - (len(data) % BLOCK_SIZE_BYTES)) % BLOCK_SIZE_BYTES
s_tag = ghash(
hash_subkey,
data
+ [0] * (BLOCK_SIZE_BYTES - len(data) + pad_len) # pad
+ [0] * pad_len # pad
+ bytes_to_intlist((0 * 8).to_bytes(8, 'big') # length of associated data
+ ((len(data) * 8).to_bytes(8, 'big'))), # length of data
)

View File

@@ -57,7 +57,7 @@ def passthrough_module(parent, child, allowed_attributes=(..., ), *, callback=la
callback(attr)
return ret
@functools.lru_cache(maxsize=None)
@functools.cache
def from_child(attr):
nonlocal child
if attr not in allowed_attributes:

View File

@@ -5,8 +5,3 @@ from .compat_utils import passthrough_module
passthrough_module(__name__, 'functools')
del passthrough_module
try:
_ = cache # >= 3.9
except NameError:
cache = lru_cache(maxsize=None)

View File

@@ -1,16 +1,22 @@
tests = {
'webp': lambda h: h[0:4] == b'RIFF' and h[8:] == b'WEBP',
'png': lambda h: h[:8] == b'\211PNG\r\n\032\n',
'jpeg': lambda h: h[6:10] in (b'JFIF', b'Exif'),
'gif': lambda h: h[:6] in (b'GIF87a', b'GIF89a'),
}
def what(file=None, h=None):
"""Detect format of image (Currently supports jpeg, png, webp, gif only)
Ref: https://github.com/python/cpython/blob/3.10/Lib/imghdr.py
Ref: https://github.com/python/cpython/blob/3.11/Lib/imghdr.py
Ref: https://www.w3.org/Graphics/JPEG/itu-t81.pdf
"""
if h is None:
with open(file, 'rb') as f:
h = f.read(12)
return next((type_ for type_, test in tests.items() if test(h)), None)
if h.startswith(b'RIFF') and h.startswith(b'WEBP', 8):
return 'webp'
if h.startswith(b'\x89PNG'):
return 'png'
if h.startswith(b'\xFF\xD8\xFF'):
return 'jpeg'
if h.startswith(b'GIF'):
return 'gif'
return None

View File

@@ -2,7 +2,9 @@ import base64
import collections
import contextlib
import datetime as dt
import functools
import glob
import hashlib
import http.cookiejar
import http.cookies
import io
@@ -17,14 +19,12 @@ import tempfile
import time
import urllib.request
from enum import Enum, auto
from hashlib import pbkdf2_hmac
from .aes import (
aes_cbc_decrypt_bytes,
aes_gcm_decrypt_and_verify_bytes,
unpad_pkcs7,
)
from .compat import functools # isort: split
from .compat import compat_os_name
from .dependencies import (
_SECRETSTORAGE_UNAVAILABLE_REASON,
@@ -34,6 +34,7 @@ from .dependencies import (
from .minicurses import MultilinePrinter, QuietMultilinePrinter
from .utils import (
DownloadError,
YoutubeDLError,
Popen,
error_to_str,
expand_path,
@@ -86,24 +87,31 @@ def _create_progress_bar(logger):
return printer
class CookieLoadError(YoutubeDLError):
pass
def load_cookies(cookie_file, browser_specification, ydl):
cookie_jars = []
if browser_specification is not None:
browser_name, profile, keyring, container = _parse_browser_specification(*browser_specification)
cookie_jars.append(
extract_cookies_from_browser(browser_name, profile, YDLLogger(ydl), keyring=keyring, container=container))
try:
cookie_jars = []
if browser_specification is not None:
browser_name, profile, keyring, container = _parse_browser_specification(*browser_specification)
cookie_jars.append(
extract_cookies_from_browser(browser_name, profile, YDLLogger(ydl), keyring=keyring, container=container))
if cookie_file is not None:
is_filename = is_path_like(cookie_file)
if is_filename:
cookie_file = expand_path(cookie_file)
if cookie_file is not None:
is_filename = is_path_like(cookie_file)
if is_filename:
cookie_file = expand_path(cookie_file)
jar = YoutubeDLCookieJar(cookie_file)
if not is_filename or os.access(cookie_file, os.R_OK):
jar.load()
cookie_jars.append(jar)
jar = YoutubeDLCookieJar(cookie_file)
if not is_filename or os.access(cookie_file, os.R_OK):
jar.load()
cookie_jars.append(jar)
return _merge_cookie_jars(cookie_jars)
return _merge_cookie_jars(cookie_jars)
except Exception:
raise CookieLoadError('failed to load cookies')
def extract_cookies_from_browser(browser_name, profile=None, logger=YDLLogger(), *, keyring=None, container=None):
@@ -294,12 +302,18 @@ def _extract_chrome_cookies(browser_name, profile, keyring, logger):
raise FileNotFoundError(f'could not find {browser_name} cookies database in "{search_root}"')
logger.debug(f'Extracting cookies from: "{cookie_database_path}"')
decryptor = get_cookie_decryptor(config['browser_dir'], config['keyring_name'], logger, keyring=keyring)
with tempfile.TemporaryDirectory(prefix='yt_dlp') as tmpdir:
cursor = None
try:
cursor = _open_database_copy(cookie_database_path, tmpdir)
# meta_version is necessary to determine if we need to trim the hash prefix from the cookies
# Ref: https://chromium.googlesource.com/chromium/src/+/b02dcebd7cafab92770734dc2bc317bd07f1d891/net/extras/sqlite/sqlite_persistent_cookie_store.cc#223
meta_version = int(cursor.execute('SELECT value FROM meta WHERE key = "version"').fetchone()[0])
decryptor = get_cookie_decryptor(
config['browser_dir'], config['keyring_name'], logger,
keyring=keyring, meta_version=meta_version)
cursor.connection.text_factory = bytes
column_names = _get_column_names(cursor, 'cookies')
secure_column = 'is_secure' if 'is_secure' in column_names else 'secure'
@@ -397,22 +411,23 @@ class ChromeCookieDecryptor:
raise NotImplementedError('Must be implemented by sub classes')
def get_cookie_decryptor(browser_root, browser_keyring_name, logger, *, keyring=None):
def get_cookie_decryptor(browser_root, browser_keyring_name, logger, *, keyring=None, meta_version=None):
if sys.platform == 'darwin':
return MacChromeCookieDecryptor(browser_keyring_name, logger)
return MacChromeCookieDecryptor(browser_keyring_name, logger, meta_version=meta_version)
elif sys.platform in ('win32', 'cygwin'):
return WindowsChromeCookieDecryptor(browser_root, logger)
return LinuxChromeCookieDecryptor(browser_keyring_name, logger, keyring=keyring)
return WindowsChromeCookieDecryptor(browser_root, logger, meta_version=meta_version)
return LinuxChromeCookieDecryptor(browser_keyring_name, logger, keyring=keyring, meta_version=meta_version)
class LinuxChromeCookieDecryptor(ChromeCookieDecryptor):
def __init__(self, browser_keyring_name, logger, *, keyring=None):
def __init__(self, browser_keyring_name, logger, *, keyring=None, meta_version=None):
self._logger = logger
self._v10_key = self.derive_key(b'peanuts')
self._empty_key = self.derive_key(b'')
self._cookie_counts = {'v10': 0, 'v11': 0, 'other': 0}
self._browser_keyring_name = browser_keyring_name
self._keyring = keyring
self._meta_version = meta_version or 0
@functools.cached_property
def _v11_key(self):
@@ -441,14 +456,18 @@ class LinuxChromeCookieDecryptor(ChromeCookieDecryptor):
if version == b'v10':
self._cookie_counts['v10'] += 1
return _decrypt_aes_cbc_multi(ciphertext, (self._v10_key, self._empty_key), self._logger)
return _decrypt_aes_cbc_multi(
ciphertext, (self._v10_key, self._empty_key), self._logger,
hash_prefix=self._meta_version >= 24)
elif version == b'v11':
self._cookie_counts['v11'] += 1
if self._v11_key is None:
self._logger.warning('cannot decrypt v11 cookies: no key found', only_once=True)
return None
return _decrypt_aes_cbc_multi(ciphertext, (self._v11_key, self._empty_key), self._logger)
return _decrypt_aes_cbc_multi(
ciphertext, (self._v11_key, self._empty_key), self._logger,
hash_prefix=self._meta_version >= 24)
else:
self._logger.warning(f'unknown cookie version: "{version}"', only_once=True)
@@ -457,11 +476,12 @@ class LinuxChromeCookieDecryptor(ChromeCookieDecryptor):
class MacChromeCookieDecryptor(ChromeCookieDecryptor):
def __init__(self, browser_keyring_name, logger):
def __init__(self, browser_keyring_name, logger, meta_version=None):
self._logger = logger
password = _get_mac_keyring_password(browser_keyring_name, logger)
self._v10_key = None if password is None else self.derive_key(password)
self._cookie_counts = {'v10': 0, 'other': 0}
self._meta_version = meta_version or 0
@staticmethod
def derive_key(password):
@@ -479,7 +499,8 @@ class MacChromeCookieDecryptor(ChromeCookieDecryptor):
self._logger.warning('cannot decrypt v10 cookies: no key found', only_once=True)
return None
return _decrypt_aes_cbc_multi(ciphertext, (self._v10_key,), self._logger)
return _decrypt_aes_cbc_multi(
ciphertext, (self._v10_key,), self._logger, hash_prefix=self._meta_version >= 24)
else:
self._cookie_counts['other'] += 1
@@ -489,10 +510,11 @@ class MacChromeCookieDecryptor(ChromeCookieDecryptor):
class WindowsChromeCookieDecryptor(ChromeCookieDecryptor):
def __init__(self, browser_root, logger):
def __init__(self, browser_root, logger, meta_version=None):
self._logger = logger
self._v10_key = _get_windows_v10_key(browser_root, logger)
self._cookie_counts = {'v10': 0, 'other': 0}
self._meta_version = meta_version or 0
def decrypt(self, encrypted_value):
version = encrypted_value[:3]
@@ -516,7 +538,9 @@ class WindowsChromeCookieDecryptor(ChromeCookieDecryptor):
ciphertext = raw_ciphertext[nonce_length:-authentication_tag_length]
authentication_tag = raw_ciphertext[-authentication_tag_length:]
return _decrypt_aes_gcm(ciphertext, self._v10_key, nonce, authentication_tag, self._logger)
return _decrypt_aes_gcm(
ciphertext, self._v10_key, nonce, authentication_tag, self._logger,
hash_prefix=self._meta_version >= 24)
else:
self._cookie_counts['other'] += 1
@@ -740,40 +764,38 @@ def _get_linux_desktop_environment(env, logger):
xdg_current_desktop = env.get('XDG_CURRENT_DESKTOP', None)
desktop_session = env.get('DESKTOP_SESSION', None)
if xdg_current_desktop is not None:
xdg_current_desktop = xdg_current_desktop.split(':')[0].strip()
if xdg_current_desktop == 'Unity':
if desktop_session is not None and 'gnome-fallback' in desktop_session:
for part in map(str.strip, xdg_current_desktop.split(':')):
if part == 'Unity':
if desktop_session is not None and 'gnome-fallback' in desktop_session:
return _LinuxDesktopEnvironment.GNOME
else:
return _LinuxDesktopEnvironment.UNITY
elif part == 'Deepin':
return _LinuxDesktopEnvironment.DEEPIN
elif part == 'GNOME':
return _LinuxDesktopEnvironment.GNOME
else:
return _LinuxDesktopEnvironment.UNITY
elif xdg_current_desktop == 'Deepin':
return _LinuxDesktopEnvironment.DEEPIN
elif xdg_current_desktop == 'GNOME':
return _LinuxDesktopEnvironment.GNOME
elif xdg_current_desktop == 'X-Cinnamon':
return _LinuxDesktopEnvironment.CINNAMON
elif xdg_current_desktop == 'KDE':
kde_version = env.get('KDE_SESSION_VERSION', None)
if kde_version == '5':
return _LinuxDesktopEnvironment.KDE5
elif kde_version == '6':
return _LinuxDesktopEnvironment.KDE6
elif kde_version == '4':
return _LinuxDesktopEnvironment.KDE4
else:
logger.info(f'unknown KDE version: "{kde_version}". Assuming KDE4')
return _LinuxDesktopEnvironment.KDE4
elif xdg_current_desktop == 'Pantheon':
return _LinuxDesktopEnvironment.PANTHEON
elif xdg_current_desktop == 'XFCE':
return _LinuxDesktopEnvironment.XFCE
elif xdg_current_desktop == 'UKUI':
return _LinuxDesktopEnvironment.UKUI
elif xdg_current_desktop == 'LXQt':
return _LinuxDesktopEnvironment.LXQT
else:
logger.info(f'XDG_CURRENT_DESKTOP is set to an unknown value: "{xdg_current_desktop}"')
elif part == 'X-Cinnamon':
return _LinuxDesktopEnvironment.CINNAMON
elif part == 'KDE':
kde_version = env.get('KDE_SESSION_VERSION', None)
if kde_version == '5':
return _LinuxDesktopEnvironment.KDE5
elif kde_version == '6':
return _LinuxDesktopEnvironment.KDE6
elif kde_version == '4':
return _LinuxDesktopEnvironment.KDE4
else:
logger.info(f'unknown KDE version: "{kde_version}". Assuming KDE4')
return _LinuxDesktopEnvironment.KDE4
elif part == 'Pantheon':
return _LinuxDesktopEnvironment.PANTHEON
elif part == 'XFCE':
return _LinuxDesktopEnvironment.XFCE
elif part == 'UKUI':
return _LinuxDesktopEnvironment.UKUI
elif part == 'LXQt':
return _LinuxDesktopEnvironment.LXQT
logger.info(f'XDG_CURRENT_DESKTOP is set to an unknown value: "{xdg_current_desktop}"')
elif desktop_session is not None:
if desktop_session == 'deepin':
@@ -1001,13 +1023,15 @@ def _get_windows_v10_key(browser_root, logger):
def pbkdf2_sha1(password, salt, iterations, key_length):
return pbkdf2_hmac('sha1', password, salt, iterations, key_length)
return hashlib.pbkdf2_hmac('sha1', password, salt, iterations, key_length)
def _decrypt_aes_cbc_multi(ciphertext, keys, logger, initialization_vector=b' ' * 16):
def _decrypt_aes_cbc_multi(ciphertext, keys, logger, initialization_vector=b' ' * 16, hash_prefix=False):
for key in keys:
plaintext = unpad_pkcs7(aes_cbc_decrypt_bytes(ciphertext, key, initialization_vector))
try:
if hash_prefix:
return plaintext[32:].decode()
return plaintext.decode()
except UnicodeDecodeError:
pass
@@ -1015,7 +1039,7 @@ def _decrypt_aes_cbc_multi(ciphertext, keys, logger, initialization_vector=b' '
return None
def _decrypt_aes_gcm(ciphertext, key, nonce, authentication_tag, logger):
def _decrypt_aes_gcm(ciphertext, key, nonce, authentication_tag, logger, hash_prefix=False):
try:
plaintext = aes_gcm_decrypt_and_verify_bytes(ciphertext, key, authentication_tag, nonce)
except ValueError:
@@ -1023,6 +1047,8 @@ def _decrypt_aes_gcm(ciphertext, key, nonce, authentication_tag, logger):
return None
try:
if hash_prefix:
return plaintext[32:].decode()
return plaintext.decode()
except UnicodeDecodeError:
logger.warning('failed to decrypt cookie (AES-GCM) because UTF-8 decoding failed. Possibly the key is wrong?', only_once=True)
@@ -1055,8 +1081,9 @@ def _decrypt_windows_dpapi(ciphertext, logger):
ctypes.byref(blob_out), # pDataOut
)
if not ret:
logger.warning('failed to decrypt with DPAPI', only_once=True)
return None
message = 'Failed to decrypt with DPAPI. See https://github.com/yt-dlp/yt-dlp/issues/10927 for more info'
logger.error(message)
raise DownloadError(message) # force exit
result = ctypes.string_at(blob_out.pbData, blob_out.cbData)
ctypes.windll.kernel32.LocalFree(blob_out.pbData)

View File

@@ -1,4 +1,5 @@
import enum
import functools
import json
import os
import re
@@ -9,7 +10,6 @@ import time
import uuid
from .fragment import FragmentFD
from ..compat import functools
from ..networking import Request
from ..postprocessor.ffmpeg import EXT_TO_OUT_FORMATS, FFmpegPostProcessor
from ..utils import (
@@ -108,7 +108,7 @@ class ExternalFD(FragmentFD):
return all((
not info_dict.get('to_stdout') or Features.TO_STDOUT in cls.SUPPORTED_FEATURES,
'+' not in info_dict['protocol'] or Features.MULTIPLE_FORMATS in cls.SUPPORTED_FEATURES,
not traverse_obj(info_dict, ('hls_aes', ...), 'extra_param_to_segment_url'),
not traverse_obj(info_dict, ('hls_aes', ...), 'extra_param_to_segment_url', 'extra_param_to_key_url'),
all(proto in cls.SUPPORTED_PROTOCOLS for proto in info_dict['protocol'].split('+')),
))
@@ -508,7 +508,7 @@ class FFmpegFD(ExternalFD):
env = None
proxy = self.params.get('proxy')
if proxy:
if not re.match(r'^[\da-zA-Z]+://', proxy):
if not re.match(r'[\da-zA-Z]+://', proxy):
proxy = f'http://{proxy}'
if proxy.startswith('socks'):
@@ -559,7 +559,7 @@ class FFmpegFD(ExternalFD):
selected_formats = info_dict.get('requested_formats') or [info_dict]
for i, fmt in enumerate(selected_formats):
is_http = re.match(r'^https?://', fmt['url'])
is_http = re.match(r'https?://', fmt['url'])
cookies = self.ydl.cookiejar.get_cookies_for_url(fmt['url']) if is_http else []
if cookies:
args.extend(['-cookies', ''.join(

View File

@@ -160,10 +160,12 @@ class HlsFD(FragmentFD):
extra_state = ctx.setdefault('extra_state', {})
format_index = info_dict.get('format_index')
extra_query = None
extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url')
if extra_param_to_segment_url:
extra_query = urllib.parse.parse_qs(extra_param_to_segment_url)
extra_segment_query = None
if extra_param_to_segment_url := info_dict.get('extra_param_to_segment_url'):
extra_segment_query = urllib.parse.parse_qs(extra_param_to_segment_url)
extra_key_query = None
if extra_param_to_key_url := info_dict.get('extra_param_to_key_url'):
extra_key_query = urllib.parse.parse_qs(extra_param_to_key_url)
i = 0
media_sequence = 0
decrypt_info = {'METHOD': 'NONE'}
@@ -190,8 +192,8 @@ class HlsFD(FragmentFD):
if frag_index <= ctx['fragment_index']:
continue
frag_url = urljoin(man_url, line)
if extra_query:
frag_url = update_url_query(frag_url, extra_query)
if extra_segment_query:
frag_url = update_url_query(frag_url, extra_segment_query)
fragments.append({
'frag_index': frag_index,
@@ -212,8 +214,8 @@ class HlsFD(FragmentFD):
frag_index += 1
map_info = parse_m3u8_attributes(line[11:])
frag_url = urljoin(man_url, map_info.get('URI'))
if extra_query:
frag_url = update_url_query(frag_url, extra_query)
if extra_segment_query:
frag_url = update_url_query(frag_url, extra_segment_query)
if map_info.get('BYTERANGE'):
splitted_byte_range = map_info.get('BYTERANGE').split('@')
@@ -244,8 +246,10 @@ class HlsFD(FragmentFD):
decrypt_info['KEY'] = external_aes_key
else:
decrypt_info['URI'] = urljoin(man_url, decrypt_info['URI'])
if extra_query:
decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query)
if extra_key_query or extra_segment_query:
# Fall back to extra_segment_query to key for backwards compat
decrypt_info['URI'] = update_url_query(
decrypt_info['URI'], extra_key_query or extra_segment_query)
if decrypt_url != decrypt_info['URI']:
decrypt_info['KEY'] = None

View File

@@ -76,6 +76,7 @@ from .aenetworks import (
)
from .aeonco import AeonCoIE
from .afreecatv import (
AfreecaTVCatchStoryIE,
AfreecaTVIE,
AfreecaTVLiveIE,
AfreecaTVUserIE,
@@ -216,9 +217,8 @@ from .bbc import (
BBCCoUkIPlayerGroupIE,
BBCCoUkPlaylistIE,
)
from .bbcmaestro import (
BBCMaestroComIE,
)
from .bbcmaestro import BBCMaestroComIE
from .beacon import BeaconTvIE
from .beatbump import (
BeatBumpPlaylistIE,
BeatBumpVideoIE,
@@ -279,6 +279,7 @@ from .bleacherreport import (
from .blerp import BlerpIE
from .blogger import BloggerIE
from .bloomberg import BloombergIE
from .bluesky import BlueskyIE
from .bokecc import BokeCCIE
from .bongacams import BongaCamsIE
from .boosty import BoostyIE
@@ -364,7 +365,10 @@ from .ccc import (
)
from .ccma import CCMAIE
from .cctv import CCTVIE
from .cda import CDAIE
from .cda import (
CDAIE,
CDAFolderIE,
)
from .cellebrite import CellebriteIE
from .ceskatelevize import CeskaTelevizeIE
from .cgtn import CGTNIE
@@ -399,8 +403,6 @@ from .cmt import CMTIE
from .cnbc import CNBCVideoIE
from .cnn import (
CNNIE,
CNNArticleIE,
CNNBlogsIE,
CNNIndonesiaIE,
)
from .comedycentral import (
@@ -506,7 +508,6 @@ from .dhm import DHMIE
from .digitalconcerthall import DigitalConcertHallIE
from .digiteka import DigitekaIE
from .discogs import DiscogsReleasePlaylistIE
from .discovery import DiscoveryIE
from .disney import DisneyIE
from .dispeak import DigitallySpeakingIE
from .dlf import (
@@ -534,16 +535,12 @@ from .dplay import (
DiscoveryPlusIndiaShowIE,
DiscoveryPlusItalyIE,
DiscoveryPlusItalyShowIE,
DIYNetworkIE,
DPlayIE,
FoodNetworkIE,
GlobalCyclingNetworkPlusIE,
GoDiscoveryIE,
HGTVDeIE,
HGTVUsaIE,
InvestigationDiscoveryIE,
MotorTrendIE,
MotorTrendOnDemandIE,
ScienceChannelIE,
TravelChannelIE,
)
@@ -736,6 +733,7 @@ from .genius import (
GeniusIE,
GeniusLyricsIE,
)
from .germanupa import GermanupaIE
from .getcourseru import (
GetCourseRuIE,
GetCourseRuPlayerIE,
@@ -782,6 +780,7 @@ from .gopro import GoProIE
from .goshgay import GoshgayIE
from .gotostage import GoToStageIE
from .gputechconf import GPUTechConfIE
from .graspop import GraspopIE
from .gronkh import (
GronkhFeedIE,
GronkhIE,
@@ -828,7 +827,10 @@ from .hungama import (
HungamaIE,
HungamaSongIE,
)
from .huya import HuyaLiveIE
from .huya import (
HuyaLiveIE,
HuyaVideoIE,
)
from .hypem import HypemIE
from .hypergryph import MonsterSirenHypergryphMusicIE
from .hytale import HytaleIE
@@ -945,11 +947,13 @@ from .khanacademy import (
KhanAcademyUnitIE,
)
from .kick import (
KickClipIE,
KickIE,
KickVODIE,
)
from .kicker import KickerIE
from .kickstarter import KickStarterIE
from .kika import KikaIE
from .kinja import KinjaEmbedIE
from .kinopoisk import KinoPoiskIE
from .kommunetv import KommunetvIE
@@ -972,6 +976,10 @@ from .la7 import (
LA7PodcastEpisodeIE,
LA7PodcastIE,
)
from .laracasts import (
LaracastsIE,
LaracastsPlaylistIE,
)
from .lastfm import (
LastFMIE,
LastFMPlaylistIE,
@@ -988,6 +996,7 @@ from .lcp import (
LcpIE,
LcpPlayIE,
)
from .learningonscreen import LearningOnScreenIE
from .lecture2go import Lecture2GoIE
from .lecturio import (
LecturioCourseIE,
@@ -1036,10 +1045,7 @@ from .livestream import (
LivestreamShortenerIE,
)
from .livestreamfails import LivestreamfailsIE
from .lnkgo import (
LnkGoIE,
LnkIE,
)
from .lnk import LnkIE
from .loom import (
LoomFolderIE,
LoomIE,
@@ -1116,12 +1122,15 @@ from .meipai import MeipaiIE
from .melonvod import MelonVODIE
from .metacritic import MetacriticIE
from .mgtv import MGTVIE
from .microsoftembed import MicrosoftEmbedIE
from .microsoftstream import MicrosoftStreamIE
from .microsoftvirtualacademy import (
MicrosoftVirtualAcademyCourseIE,
MicrosoftVirtualAcademyIE,
from .microsoftembed import (
MicrosoftBuildIE,
MicrosoftEmbedIE,
MicrosoftLearnEpisodeIE,
MicrosoftLearnPlaylistIE,
MicrosoftLearnSessionIE,
MicrosoftMediusIE,
)
from .microsoftstream import MicrosoftStreamIE
from .mildom import (
MildomClipIE,
MildomIE,
@@ -1161,6 +1170,7 @@ from .mlb import (
)
from .mlssoccer import MLSSoccerIE
from .mocha import MochaVideoIE
from .mojevideo import MojevideoIE
from .mojvideo import MojvideoIE
from .monstercat import MonstercatIE
from .motherless import (
@@ -1606,6 +1616,7 @@ from .qqmusic import (
QQMusicPlaylistIE,
QQMusicSingerIE,
QQMusicToplistIE,
QQMusicVideoIE,
)
from .r7 import (
R7IE,
@@ -1758,7 +1769,10 @@ from .rtve import (
RTVETelevisionIE,
)
from .rtvs import RTVSIE
from .rtvslo import RTVSLOIE
from .rtvslo import (
RTVSLOIE,
RTVSLOShowIE,
)
from .rudovideo import RudoVideoIE
from .rule34video import Rule34VideoIE
from .rumble import (
@@ -1803,6 +1817,7 @@ from .screen9 import Screen9IE
from .screencast import ScreencastIE
from .screencastify import ScreencastifyIE
from .screencastomatic import ScreencastOMaticIE
from .screenrec import ScreenRecIE
from .scrippsnetworks import (
ScrippsNetworksIE,
ScrippsNetworksWatchIE,
@@ -1813,6 +1828,7 @@ from .scte import (
SCTECourseIE,
)
from .sejmpl import SejmIE
from .sen import SenIE
from .senalcolombia import SenalColombiaLiveIE
from .senategov import (
SenateGovIE,
@@ -1868,6 +1884,7 @@ from .slideshare import SlideshareIE
from .slideslive import SlidesLiveIE
from .slutload import SlutloadIE
from .smotrim import SmotrimIE
from .snapchat import SnapchatSpotlightIE
from .snotr import SnotrIE
from .sohu import (
SohuIE,
@@ -1928,6 +1945,10 @@ from .spreaker import (
)
from .springboardplatform import SpringboardPlatformIE
from .sprout import SproutIE
from .sproutvideo import (
SproutVideoIE,
VidsIoIE,
)
from .srgssr import (
SRGSSRIE,
SRGSSRPlayIE,
@@ -2160,10 +2181,7 @@ from .tv5unis import (
TV5UnisVideoIE,
)
from .tv24ua import TV24UAVideoIE
from .tva import (
TVAIE,
QubIE,
)
from .tva import TVAIE
from .tvanouvelles import (
TVANouvellesArticleIE,
TVANouvellesIE,
@@ -2303,6 +2321,7 @@ from .videomore import (
VideomoreVideoIE,
)
from .videopress import VideoPressIE
from .vidflex import VidflexIE
from .vidio import (
VidioIE,
VidioLiveIE,
@@ -2310,6 +2329,7 @@ from .vidio import (
)
from .vidlii import VidLiiIE
from .vidly import VidlyIE
from .vidyard import VidyardIE
from .viewlift import (
ViewLiftEmbedIE,
ViewLiftIE,
@@ -2375,6 +2395,10 @@ from .vrt import (
VrtNUIE,
)
from .vtm import VTMIE
from .vtv import (
VTVIE,
VTVGoIE,
)
from .vuclip import VuClipIE
from .vvvvid import (
VVVVIDIE,

View File

@@ -387,17 +387,27 @@ class ABCIViewShowSeriesIE(InfoExtractor):
'thumbnail': r're:^https?://cdn\.iview\.abc\.net\.au/thumbs/.*\.jpg$',
},
'playlist_count': 15,
'skip': 'This program is not currently available in ABC iview',
}, {
'url': 'https://iview.abc.net.au/show/inbestigators',
'info_dict': {
'id': '175343-1',
'title': 'Series 1',
'description': 'md5:b9976935a6450e5b78ce2a940a755685',
'series': 'The Inbestigators',
'season': 'Series 1',
'thumbnail': r're:^https?://cdn\.iview\.abc\.net\.au/thumbs/.+\.jpg',
},
'playlist_count': 17,
}]
def _real_extract(self, url):
show_id = self._match_id(url)
webpage = self._download_webpage(url, show_id)
webpage_data = self._search_regex(
r'window\.__INITIAL_STATE__\s*=\s*[\'"](.+?)[\'"]\s*;',
webpage, 'initial state')
video_data = self._parse_json(
unescapeHTML(webpage_data).encode().decode('unicode_escape'), show_id)
video_data = video_data['route']['pageData']['_embedded']
video_data = self._search_json(
r'window\.__INITIAL_STATE__\s*=\s*[\'"]', webpage, 'initial state', show_id,
transform_source=lambda x: x.encode().decode('unicode_escape'),
end_pattern=r'[\'"]\s*;')['route']['pageData']['_embedded']
highlight = try_get(video_data, lambda x: x['highlightVideo']['shareUrl'])
if not self._yes_playlist(show_id, bool(highlight), video_label='highlight video'):

View File

@@ -9,12 +9,12 @@ import re
import struct
import time
import urllib.parse
import urllib.request
import urllib.response
import uuid
from .common import InfoExtractor
from ..aes import aes_ecb_decrypt
from ..networking import RequestHandler, Response
from ..networking.exceptions import TransportError
from ..utils import (
ExtractorError,
OnDemandPagedList,
@@ -26,37 +26,36 @@ from ..utils import (
traverse_obj,
update_url_query,
)
from ..utils.networking import clean_proxies
def add_opener(ydl, handler): # FIXME: Create proper API in .networking
"""Add a handler for opening URLs, like _download_webpage"""
# https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426
# https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605
rh = ydl._request_director.handlers['Urllib']
if 'abematv-license' in rh._SUPPORTED_URL_SCHEMES:
return
headers = ydl.params['http_headers'].copy()
proxies = ydl.proxies.copy()
clean_proxies(proxies, headers)
opener = rh._get_instance(cookiejar=ydl.cookiejar, proxies=proxies)
assert isinstance(opener, urllib.request.OpenerDirector)
opener.add_handler(handler)
rh._SUPPORTED_URL_SCHEMES = (*rh._SUPPORTED_URL_SCHEMES, 'abematv-license')
class AbemaLicenseRH(RequestHandler):
_SUPPORTED_URL_SCHEMES = ('abematv-license',)
_SUPPORTED_PROXY_SCHEMES = None
_SUPPORTED_FEATURES = None
RH_NAME = 'abematv_license'
_STRTABLE = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'
_HKEY = b'3AF0298C219469522A313570E8583005A642E73EDD58E3EA2FB7339D3DF1597E'
class AbemaLicenseHandler(urllib.request.BaseHandler):
handler_order = 499
STRTABLE = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'
HKEY = b'3AF0298C219469522A313570E8583005A642E73EDD58E3EA2FB7339D3DF1597E'
def __init__(self, ie: 'AbemaTVIE'):
# the protocol that this should really handle is 'abematv-license://'
# abematv_license_open is just a placeholder for development purposes
# ref. https://github.com/python/cpython/blob/f4c03484da59049eb62a9bf7777b963e2267d187/Lib/urllib/request.py#L510
setattr(self, 'abematv-license_open', getattr(self, 'abematv_license_open', None))
def __init__(self, *, ie: 'AbemaTVIE', **kwargs):
super().__init__(**kwargs)
self.ie = ie
def _send(self, request):
url = request.url
ticket = urllib.parse.urlparse(url).netloc
try:
response_data = self._get_videokey_from_ticket(ticket)
except ExtractorError as e:
raise TransportError(cause=e.cause) from e
except (IndexError, KeyError, TypeError) as e:
raise TransportError(cause=repr(e)) from e
return Response(
io.BytesIO(response_data), url,
headers={'Content-Length': str(len(response_data))})
def _get_videokey_from_ticket(self, ticket):
to_show = self.ie.get_param('verbose', False)
media_token = self.ie._get_media_token(to_show=to_show)
@@ -72,25 +71,17 @@ class AbemaLicenseHandler(urllib.request.BaseHandler):
'Content-Type': 'application/json',
})
res = decode_base_n(license_response['k'], table=self.STRTABLE)
res = decode_base_n(license_response['k'], table=self._STRTABLE)
encvideokey = bytes_to_intlist(struct.pack('>QQ', res >> 64, res & 0xffffffffffffffff))
h = hmac.new(
binascii.unhexlify(self.HKEY),
binascii.unhexlify(self._HKEY),
(license_response['cid'] + self.ie._DEVICE_ID).encode(),
digestmod=hashlib.sha256)
enckey = bytes_to_intlist(h.digest())
return intlist_to_bytes(aes_ecb_decrypt(encvideokey, enckey))
def abematv_license_open(self, url):
url = url.get_full_url() if isinstance(url, urllib.request.Request) else url
ticket = urllib.parse.urlparse(url).netloc
response_data = self._get_videokey_from_ticket(ticket)
return urllib.response.addinfourl(io.BytesIO(response_data), headers={
'Content-Length': str(len(response_data)),
}, url=url, code=200)
class AbemaTVBaseIE(InfoExtractor):
_NETRC_MACHINE = 'abematv'
@@ -139,7 +130,7 @@ class AbemaTVBaseIE(InfoExtractor):
if self._USERTOKEN:
return self._USERTOKEN
add_opener(self._downloader, AbemaLicenseHandler(self))
self._downloader._request_director.add_handler(AbemaLicenseRH(ie=self, logger=None))
username, _ = self._get_login_info()
auth_cache = username and self.cache.load(self._NETRC_MACHINE, username, min_ver='2024.01.19')
@@ -368,6 +359,7 @@ class AbemaTVIE(AbemaTVBaseIE):
info['episode_number'] = epis if epis < 2000 else None
is_live, m3u8_url = False, None
availability = 'public'
if video_type == 'now-on-air':
is_live = True
channel_url = 'https://api.abema.io/v1/channels'
@@ -385,10 +377,10 @@ class AbemaTVIE(AbemaTVBaseIE):
f'https://api.abema.io/v1/video/programs/{video_id}', video_id,
note='Checking playability',
headers=headers)
ondemand_types = traverse_obj(api_response, ('terms', ..., 'onDemandType'))
if 3 not in ondemand_types:
if not traverse_obj(api_response, ('label', 'free', {bool})):
# cannot acquire decryption key for these streams
self.report_warning('This is a premium-only stream')
availability = 'premium_only'
info.update(traverse_obj(api_response, {
'series': ('series', 'title'),
'season': ('season', 'name'),
@@ -408,6 +400,7 @@ class AbemaTVIE(AbemaTVBaseIE):
headers=headers)
if not traverse_obj(api_response, ('slot', 'flags', 'timeshiftFree'), default=False):
self.report_warning('This is a premium-only stream')
availability = 'premium_only'
m3u8_url = f'https://vod-abematv.akamaized.net/slot/{video_id}/playlist.m3u8'
else:
@@ -425,6 +418,7 @@ class AbemaTVIE(AbemaTVBaseIE):
'description': description,
'formats': formats,
'is_live': is_live,
'availability': availability,
})
return info

View File

@@ -4,7 +4,7 @@ from .common import InfoExtractor
class AcademicEarthCourseIE(InfoExtractor):
_VALID_URL = r'^https?://(?:www\.)?academicearth\.org/playlists/(?P<id>[^?#/]+)'
_VALID_URL = r'https?://(?:www\.)?academicearth\.org/playlists/(?P<id>[^?#/]+)'
IE_NAME = 'AcademicEarth:Course'
_TEST = {
'url': 'http://academicearth.org/playlists/laws-of-nature/',

View File

@@ -16,6 +16,7 @@ from ..utils import (
float_or_none,
int_or_none,
intlist_to_bytes,
join_nonempty,
long_to_bytes,
parse_iso8601,
pkcs1pad,
@@ -48,9 +49,9 @@ class ADNBaseIE(InfoExtractor):
class ADNIE(ADNBaseIE):
_VALID_URL = r'https?://(?:www\.)?(?:animation|anime)digitalnetwork\.(?P<lang>fr|de)/video/[^/?#]+/(?P<id>\d+)'
_VALID_URL = r'https?://(?:www\.)?animationdigitalnetwork\.com/(?:(?P<lang>de)/)?video/[^/?#]+/(?P<id>\d+)'
_TESTS = [{
'url': 'https://animationdigitalnetwork.fr/video/fruits-basket/9841-episode-1-a-ce-soir',
'url': 'https://animationdigitalnetwork.com/video/558-fruits-basket/9841-episode-1-a-ce-soir',
'md5': '1c9ef066ceb302c86f80c2b371615261',
'info_dict': {
'id': '9841',
@@ -70,10 +71,7 @@ class ADNIE(ADNBaseIE):
},
'skip': 'Only available in French and German speaking Europe',
}, {
'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites',
'only_matching': True,
}, {
'url': 'https://animationdigitalnetwork.de/video/the-eminence-in-shadow/23550-folge-1',
'url': 'https://animationdigitalnetwork.com/de/video/973-the-eminence-in-shadow/23550-folge-1',
'md5': '5c5651bf5791fa6fcd7906012b9d94e8',
'info_dict': {
'id': '23550',
@@ -166,7 +164,7 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
'username': username,
})) or {}).get('accessToken')
if access_token:
self._HEADERS = {'authorization': 'Bearer ' + access_token}
self._HEADERS['Authorization'] = f'Bearer {access_token}'
except ExtractorError as e:
message = None
if isinstance(e.cause, HTTPError) and e.cause.status == 401:
@@ -177,6 +175,7 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
def _real_extract(self, url):
lang, video_id = self._match_valid_url(url).group('lang', 'id')
self._HEADERS['X-Target-Distribution'] = lang or 'fr'
video_base_url = self._PLAYER_BASE_URL + f'video/{video_id}/'
player = self._download_json(
video_base_url + 'configuration', video_id,
@@ -217,7 +216,6 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
links_data = self._download_json(
links_url, video_id, 'Downloading links JSON metadata', headers={
'X-Player-Token': authorization,
'X-Target-Distribution': lang,
**self._HEADERS,
}, query={
'freeWithAds': 'true',
@@ -256,6 +254,7 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
load_balancer_data = self._download_json(
load_balancer_url, video_id,
f'Downloading {format_id} {quality} JSON metadata',
headers=self._HEADERS,
fatal=False) or {}
m3u8_url = load_balancer_data.get('location')
if not m3u8_url:
@@ -276,7 +275,7 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
video = (self._download_json(
self._API_BASE_URL + f'video/{video_id}', video_id,
'Downloading additional video metadata', fatal=False) or {}).get('video') or {}
'Downloading additional video metadata', fatal=False, headers=self._HEADERS) or {}).get('video') or {}
show = video.get('show') or {}
return {
@@ -298,9 +297,9 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
class ADNSeasonIE(ADNBaseIE):
_VALID_URL = r'https?://(?:www\.)?(?:animation|anime)digitalnetwork\.(?P<lang>fr|de)/video/(?P<id>[^/?#]+)/?(?:$|[#?])'
_VALID_URL = r'https?://(?:www\.)?animationdigitalnetwork\.com/(?:(?P<lang>de)/)?video/(?P<id>\d+)[^/?#]*/?(?:$|[#?])'
_TESTS = [{
'url': 'https://animationdigitalnetwork.fr/video/tokyo-mew-mew-new',
'url': 'https://animationdigitalnetwork.com/video/911-tokyo-mew-mew-new',
'playlist_count': 12,
'info_dict': {
'id': '911',
@@ -311,24 +310,22 @@ class ADNSeasonIE(ADNBaseIE):
def _real_extract(self, url):
lang, video_show_slug = self._match_valid_url(url).group('lang', 'id')
self._HEADERS['X-Target-Distribution'] = lang or 'fr'
show = self._download_json(
f'{self._API_BASE_URL}show/{video_show_slug}/', video_show_slug,
'Downloading show JSON metadata', headers=self._HEADERS)['show']
show_id = str(show['id'])
episodes = self._download_json(
f'{self._API_BASE_URL}video/show/{show_id}', video_show_slug,
'Downloading episode list', headers={
'X-Target-Distribution': lang,
**self._HEADERS,
}, query={
'Downloading episode list', headers=self._HEADERS, query={
'order': 'asc',
'limit': '-1',
})
def entries():
for episode_id in traverse_obj(episodes, ('videos', ..., 'id', {str_or_none})):
yield self.url_result(
f'https://animationdigitalnetwork.{lang}/video/{video_show_slug}/{episode_id}',
ADNIE, episode_id)
yield self.url_result(join_nonempty(
'https://animationdigitalnetwork.com', lang, 'video',
video_show_slug, episode_id, delim='/'), ADNIE, episode_id)
return self.playlist_result(entries(), show_id, show.get('title'))

View File

@@ -1355,6 +1355,7 @@ MSO_INFO = {
class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor
_SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s'
_USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0'
_MODERN_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; rv:131.0) Gecko/20100101 Firefox/131.0'
_MVPD_CACHE = 'ap-mvpd'
_DOWNLOADING_LOGIN_PAGE = 'Downloading Provider Login Page'
@@ -1454,7 +1455,11 @@ class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should en
'no_iframe': 'false',
'domain_name': 'adobe.com',
'redirect_url': url,
})
}, headers={
# yt-dlp's default user-agent is usually too old for Comcast_SSO
# See: https://github.com/yt-dlp/yt-dlp/issues/10848
'User-Agent': self._MODERN_USER_AGENT,
} if mso_id == 'Comcast_SSO' else None)
elif not self._cookies_passed:
raise_mvpd_required()

View File

@@ -1,6 +1,7 @@
import functools
from .common import InfoExtractor
from ..networking import Request
from ..utils import (
ExtractorError,
OnDemandPagedList,
@@ -32,21 +33,21 @@ class AfreecaTVBaseIE(InfoExtractor):
}
response = self._download_json(
'https://login.afreecatv.com/app/LoginAction.php', None,
'https://login.sooplive.co.kr/app/LoginAction.php', None,
'Logging in', data=urlencode_postdata(login_form))
_ERRORS = {
-4: 'Your account has been suspended due to a violation of our terms and policies.',
-5: 'https://member.afreecatv.com/app/user_delete_progress.php',
-6: 'https://login.afreecatv.com/membership/changeMember.php',
-8: "Hello! AfreecaTV here.\nThe username you have entered belongs to \n an account that requires a legal guardian's consent. \nIf you wish to use our services without restriction, \nplease make sure to go through the necessary verification process.",
-9: 'https://member.afreecatv.com/app/pop_login_block.php',
-11: 'https://login.afreecatv.com/afreeca/second_login.php',
-12: 'https://member.afreecatv.com/app/user_security.php',
-5: 'https://member.sooplive.co.kr/app/user_delete_progress.php',
-6: 'https://login.sooplive.co.kr/membership/changeMember.php',
-8: "Hello! Soop here.\nThe username you have entered belongs to \n an account that requires a legal guardian's consent. \nIf you wish to use our services without restriction, \nplease make sure to go through the necessary verification process.",
-9: 'https://member.sooplive.co.kr/app/pop_login_block.php',
-11: 'https://login.sooplive.co.kr/afreeca/second_login.php',
-12: 'https://member.sooplive.co.kr/app/user_security.php',
0: 'The username does not exist or you have entered the wrong password.',
-1: 'The username does not exist or you have entered the wrong password.',
-3: 'You have entered your username/password incorrectly.',
-7: 'You cannot use your Global AfreecaTV account to access Korean AfreecaTV.',
-7: 'You cannot use your Global Soop account to access Korean Soop.',
-10: 'Sorry for the inconvenience. \nYour account has been blocked due to an unauthorized access. \nPlease contact our Help Center for assistance.',
-32008: 'You have failed to log in. Please contact our Help Center.',
}
@@ -58,71 +59,42 @@ class AfreecaTVBaseIE(InfoExtractor):
f'Unable to login: {self.IE_NAME} said: {error}',
expected=True)
def _call_api(self, endpoint, display_id, data=None, headers=None, query=None):
return self._download_json(Request(
f'https://api.m.sooplive.co.kr/{endpoint}',
data=data, headers=headers, query=query,
extensions={'legacy_ssl': True}), display_id,
'Downloading API JSON', 'Unable to download API JSON')
class AfreecaTVIE(AfreecaTVBaseIE):
IE_NAME = 'afreecatv'
IE_DESC = 'afreecatv.com'
_VALID_URL = r'''(?x)
https?://
(?:
(?:(?:live|afbbs|www)\.)?afreeca(?:tv)?\.com(?::\d+)?
(?:
/app/(?:index|read_ucc_bbs)\.cgi|
/player/[Pp]layer\.(?:swf|html)
)\?.*?\bnTitleNo=|
vod\.afreecatv\.com/(PLAYER/STATION|player)/
)
(?P<id>\d+)
'''
IE_NAME = 'soop'
IE_DESC = 'sooplive.co.kr'
_VALID_URL = r'https?://vod\.(?:sooplive\.co\.kr|afreecatv\.com)/(?:PLAYER/STATION|player)/(?P<id>\d+)/?(?:$|[?#&])'
_TESTS = [{
'url': 'http://live.afreecatv.com:8079/app/index.cgi?szType=read_ucc_bbs&szBjId=dailyapril&nStationNo=16711924&nBbsNo=18605867&nTitleNo=36164052&szSkin=',
'md5': 'f72c89fe7ecc14c1b5ce506c4996046e',
'url': 'https://vod.sooplive.co.kr/player/96753363',
'info_dict': {
'id': '36164052',
'id': '20230108_9FF5BEE1_244432674_1',
'ext': 'mp4',
'title': '데일리 에이프릴 요정들의 시상식!',
'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
'uploader': 'dailyapril',
'uploader_id': 'dailyapril',
'upload_date': '20160503',
'uploader_id': 'rlantnghks',
'uploader': '페이즈으',
'duration': 10840,
'thumbnail': r're:https?://videoimg\.sooplive\.co/.kr/.+',
'upload_date': '20230108',
'timestamp': 1673218805,
'title': '젠지 페이즈',
},
'skip': 'Video is gone',
}, {
'url': 'http://afbbs.afreecatv.com:8080/app/read_ucc_bbs.cgi?nStationNo=16711924&nTitleNo=36153164&szBjId=dailyapril&nBbsNo=18605867',
'info_dict': {
'id': '36153164',
'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'",
'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
'uploader': 'dailyapril',
'uploader_id': 'dailyapril',
'params': {
'skip_download': True,
},
'playlist_count': 2,
'playlist': [{
'md5': 'd8b7c174568da61d774ef0203159bf97',
'info_dict': {
'id': '36153164_1',
'ext': 'mp4',
'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'",
'upload_date': '20160502',
},
}, {
'md5': '58f2ce7f6044e34439ab2d50612ab02b',
'info_dict': {
'id': '36153164_2',
'ext': 'mp4',
'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'",
'upload_date': '20160502',
},
}],
'skip': 'Video is gone',
}, {
# non standard key
'url': 'http://vod.afreecatv.com/PLAYER/STATION/20515605',
'url': 'http://vod.sooplive.co.kr/PLAYER/STATION/20515605',
'info_dict': {
'id': '20170411_BE689A0E_190960999_1_2_h',
'ext': 'mp4',
'title': '혼자사는여자집',
'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
'thumbnail': r're:https?://(?:video|st)img\.sooplive\.co\.kr/.+',
'uploader': '♥이슬이',
'uploader_id': 'dasl8121',
'upload_date': '20170411',
@@ -134,12 +106,12 @@ class AfreecaTVIE(AfreecaTVBaseIE):
},
}, {
# adult content
'url': 'https://vod.afreecatv.com/player/97267690',
'url': 'https://vod.sooplive.co.kr/player/97267690',
'info_dict': {
'id': '20180327_27901457_202289533_1',
'ext': 'mp4',
'title': '[생]빨개요♥ (part 1)',
'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
'thumbnail': r're:https?://(?:video|st)img\.sooplive\.co\.kr/.+',
'uploader': '[SA]서아',
'uploader_id': 'bjdyrksu',
'upload_date': '20180327',
@@ -149,44 +121,25 @@ class AfreecaTVIE(AfreecaTVBaseIE):
'skip_download': True,
},
'skip': 'The VOD does not exist',
}, {
'url': 'http://www.afreecatv.com/player/Player.swf?szType=szBjId=djleegoon&nStationNo=11273158&nBbsNo=13161095&nTitleNo=36327652',
'only_matching': True,
}, {
'url': 'https://vod.afreecatv.com/player/96753363',
'info_dict': {
'id': '20230108_9FF5BEE1_244432674_1',
'ext': 'mp4',
'uploader_id': 'rlantnghks',
'uploader': '페이즈으',
'duration': 10840,
'thumbnail': r're:https?://videoimg\.afreecatv\.com/.+',
'upload_date': '20230108',
'timestamp': 1673218805,
'title': '젠지 페이즈',
},
'params': {
'skip_download': True,
},
}, {
# adult content
'url': 'https://vod.afreecatv.com/player/70395877',
'url': 'https://vod.sooplive.co.kr/player/70395877',
'only_matching': True,
}, {
# subscribers only
'url': 'https://vod.afreecatv.com/player/104647403',
'url': 'https://vod.sooplive.co.kr/player/104647403',
'only_matching': True,
}, {
# private
'url': 'https://vod.afreecatv.com/player/81669846',
'url': 'https://vod.sooplive.co.kr/player/81669846',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
data = self._download_json(
'https://api.m.afreecatv.com/station/video/a/view', video_id,
headers={'Referer': url}, data=urlencode_postdata({
data = self._call_api(
'station/video/a/view', video_id, headers={'Referer': url},
data=urlencode_postdata({
'nTitleNo': video_id,
'nApiLevel': 10,
}))['data']
@@ -201,7 +154,7 @@ class AfreecaTVIE(AfreecaTVBaseIE):
'title': ('title', {str}),
'uploader': ('writer_nick', {str}),
'uploader_id': ('bj_id', {str}),
'duration': ('total_file_duration', {functools.partial(int_or_none, scale=1000)}),
'duration': ('total_file_duration', {int_or_none(scale=1000)}),
'thumbnail': ('thumb', {url_or_none}),
})
@@ -225,7 +178,7 @@ class AfreecaTVIE(AfreecaTVBaseIE):
'title': f'{common_info.get("title") or "Untitled"} (part {file_num})',
'formats': formats,
**traverse_obj(file_element, {
'duration': ('duration', {functools.partial(int_or_none, scale=1000)}),
'duration': ('duration', {int_or_none(scale=1000)}),
'timestamp': ('file_start', {unified_timestamp}),
}),
})
@@ -253,12 +206,49 @@ class AfreecaTVIE(AfreecaTVBaseIE):
return self.playlist_result(entries, video_id, multi_video=True, **common_info)
class AfreecaTVLiveIE(AfreecaTVBaseIE):
IE_NAME = 'afreecatv:live'
IE_DESC = 'afreecatv.com livestreams'
_VALID_URL = r'https?://play\.afreeca(?:tv)?\.com/(?P<id>[^/]+)(?:/(?P<bno>\d+))?'
class AfreecaTVCatchStoryIE(AfreecaTVBaseIE):
IE_NAME = 'soop:catchstory'
IE_DESC = 'sooplive.co.kr catch story'
_VALID_URL = r'https?://vod\.(?:sooplive\.co\.kr|afreecatv\.com)/player/(?P<id>\d+)/catchstory'
_TESTS = [{
'url': 'https://play.afreecatv.com/pyh3646/237852185',
'url': 'https://vod.sooplive.co.kr/player/103247/catchstory',
'info_dict': {
'id': '103247',
},
'playlist_count': 2,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
data = self._call_api(
'catchstory/a/view', video_id, headers={'Referer': url},
query={'aStoryListIdx': '', 'nStoryIdx': video_id})
return self.playlist_result(self._entries(data), video_id)
@staticmethod
def _entries(data):
# 'files' is always a list with 1 element
yield from traverse_obj(data, (
'data', lambda _, v: v['story_type'] == 'catch',
'catch_list', lambda _, v: v['files'][0]['file'], {
'id': ('files', 0, 'file_info_key', {str}),
'url': ('files', 0, 'file', {url_or_none}),
'duration': ('files', 0, 'duration', {int_or_none(scale=1000)}),
'title': ('title', {str}),
'uploader': ('writer_nick', {str}),
'uploader_id': ('writer_id', {str}),
'thumbnail': ('thumb', {url_or_none}),
'timestamp': ('write_timestamp', {int_or_none}),
}))
class AfreecaTVLiveIE(AfreecaTVBaseIE):
IE_NAME = 'soop:live'
IE_DESC = 'sooplive.co.kr livestreams'
_VALID_URL = r'https?://play\.(?:sooplive\.co\.kr|afreecatv\.com)/(?P<id>[^/?#]+)(?:/(?P<bno>\d+))?'
_TESTS = [{
'url': 'https://play.sooplive.co.kr/pyh3646/237852185',
'info_dict': {
'id': '237852185',
'ext': 'mp4',
@@ -270,30 +260,30 @@ class AfreecaTVLiveIE(AfreecaTVBaseIE):
},
'skip': 'Livestream has ended',
}, {
'url': 'https://play.afreecatv.com/pyh3646/237852185',
'url': 'https://play.sooplive.co.kr/pyh3646/237852185',
'only_matching': True,
}, {
'url': 'https://play.afreecatv.com/pyh3646',
'url': 'https://play.sooplive.co.kr/pyh3646',
'only_matching': True,
}]
_LIVE_API_URL = 'https://live.afreecatv.com/afreeca/player_live_api.php'
_LIVE_API_URL = 'https://live.sooplive.co.kr/afreeca/player_live_api.php'
_WORKING_CDNS = [
'gcp_cdn', # live-global-cdn-v02.afreecatv.com
'gs_cdn_pc_app', # pc-app.stream.afreecatv.com
'gs_cdn_mobile_web', # mobile-web.stream.afreecatv.com
'gs_cdn_pc_web', # pc-web.stream.afreecatv.com
'gcp_cdn', # live-global-cdn-v02.sooplive.co.kr
'gs_cdn_pc_app', # pc-app.stream.sooplive.co.kr
'gs_cdn_mobile_web', # mobile-web.stream.sooplive.co.kr
'gs_cdn_pc_web', # pc-web.stream.sooplive.co.kr
]
_BAD_CDNS = [
'gs_cdn', # chromecast.afreeca.gscdn.com (cannot resolve)
'gs_cdn_chromecast', # chromecast.stream.afreecatv.com (HTTP Error 400)
'azure_cdn', # live-global-cdn-v01.afreecatv.com (cannot resolve)
'aws_cf', # live-global-cdn-v03.afreecatv.com (cannot resolve)
'kt_cdn', # kt.stream.afreecatv.com (HTTP Error 400)
'gs_cdn_chromecast', # chromecast.stream.sooplive.co.kr (HTTP Error 400)
'azure_cdn', # live-global-cdn-v01.sooplive.co.kr (cannot resolve)
'aws_cf', # live-global-cdn-v03.sooplive.co.kr (cannot resolve)
'kt_cdn', # kt.stream.sooplive.co.kr (HTTP Error 400)
]
def _extract_formats(self, channel_info, broadcast_no, aid):
stream_base_url = channel_info.get('RMD') or 'https://livestream-manager.afreecatv.com'
stream_base_url = channel_info.get('RMD') or 'https://livestream-manager.sooplive.co.kr'
# If user has not passed CDN IDs, try API-provided CDN ID followed by other working CDN IDs
default_cdn_ids = orderedSet([
@@ -313,7 +303,7 @@ class AfreecaTVLiveIE(AfreecaTVBaseIE):
try:
return self._extract_m3u8_formats(
m3u8_url, broadcast_no, 'mp4', m3u8_id='hls', query={'aid': aid},
headers={'Referer': 'https://play.afreecatv.com/'})
headers={'Referer': 'https://play.sooplive.co.kr/'})
except ExtractorError as e:
if attempt == len(cdn_ids):
raise
@@ -329,7 +319,13 @@ class AfreecaTVLiveIE(AfreecaTVBaseIE):
broadcaster_id = channel_info.get('BJID') or broadcaster_id
broadcast_no = channel_info.get('BNO') or broadcast_no
if not broadcast_no:
raise UserNotLive(video_id=broadcaster_id)
result = channel_info.get('RESULT')
if result == 0:
raise UserNotLive(video_id=broadcaster_id)
elif result == -6:
self.raise_login_required(
'This channel is streaming for subscribers only', method='password')
raise ExtractorError('Unable to extract broadcast number')
password = self.get_param('videopassword')
if channel_info.get('BPWD') == 'Y' and password is None:
@@ -358,7 +354,7 @@ class AfreecaTVLiveIE(AfreecaTVBaseIE):
formats = self._extract_formats(channel_info, broadcast_no, aid)
station_info = traverse_obj(self._download_json(
'https://st.afreecatv.com/api/get_station_status.php', broadcast_no,
'https://st.sooplive.co.kr/api/get_station_status.php', broadcast_no,
'Downloading channel metadata', 'Unable to download channel metadata',
query={'szBjId': broadcaster_id}, fatal=False), {dict}) or {}
@@ -374,11 +370,11 @@ class AfreecaTVLiveIE(AfreecaTVBaseIE):
}
class AfreecaTVUserIE(InfoExtractor):
IE_NAME = 'afreecatv:user'
_VALID_URL = r'https?://bj\.afreeca(?:tv)?\.com/(?P<id>[^/]+)/vods/?(?P<slug_type>[^/]+)?'
class AfreecaTVUserIE(AfreecaTVBaseIE):
IE_NAME = 'soop:user'
_VALID_URL = r'https?://ch\.(?:sooplive\.co\.kr|afreecatv\.com)/(?P<id>[^/?#]+)/vods/?(?P<slug_type>[^/?#]+)?'
_TESTS = [{
'url': 'https://bj.afreecatv.com/ryuryu24/vods/review',
'url': 'https://ch.sooplive.co.kr/ryuryu24/vods/review',
'info_dict': {
'_type': 'playlist',
'id': 'ryuryu24',
@@ -386,7 +382,7 @@ class AfreecaTVUserIE(InfoExtractor):
},
'playlist_count': 218,
}, {
'url': 'https://bj.afreecatv.com/parang1995/vods/highlight',
'url': 'https://ch.sooplive.co.kr/parang1995/vods/highlight',
'info_dict': {
'_type': 'playlist',
'id': 'parang1995',
@@ -394,7 +390,7 @@ class AfreecaTVUserIE(InfoExtractor):
},
'playlist_count': 997,
}, {
'url': 'https://bj.afreecatv.com/ryuryu24/vods',
'url': 'https://ch.sooplive.co.kr/ryuryu24/vods',
'info_dict': {
'_type': 'playlist',
'id': 'ryuryu24',
@@ -402,7 +398,7 @@ class AfreecaTVUserIE(InfoExtractor):
},
'playlist_count': 221,
}, {
'url': 'https://bj.afreecatv.com/ryuryu24/vods/balloonclip',
'url': 'https://ch.sooplive.co.kr/ryuryu24/vods/balloonclip',
'info_dict': {
'_type': 'playlist',
'id': 'ryuryu24',
@@ -414,12 +410,12 @@ class AfreecaTVUserIE(InfoExtractor):
def _fetch_page(self, user_id, user_type, page):
page += 1
info = self._download_json(f'https://bjapi.afreecatv.com/api/{user_id}/vods/{user_type}', user_id,
info = self._download_json(f'https://chapi.sooplive.co.kr/api/{user_id}/vods/{user_type}', user_id,
query={'page': page, 'per_page': self._PER_PAGE, 'orderby': 'reg_date'},
note=f'Downloading {user_type} video page {page}')
for item in info['data']:
yield self.url_result(
f'https://vod.afreecatv.com/player/{item["title_no"]}/', AfreecaTVIE, item['title_no'])
f'https://vod.sooplive.co.kr/player/{item["title_no"]}/', AfreecaTVIE, item['title_no'])
def _real_extract(self, url):
user_id, user_type = self._match_valid_url(url).group('id', 'slug_type')

View File

@@ -71,7 +71,7 @@ class AllstarBaseIE(InfoExtractor):
'thumbnails': (('clipImageThumb', 'clipImageSource'), {'url': {media_url_or_none}}),
'duration': ('clipLength', {int_or_none}),
'filesize': ('clipSizeBytes', {int_or_none}),
'timestamp': ('createdDate', {functools.partial(int_or_none, scale=1000)}),
'timestamp': ('createdDate', {int_or_none(scale=1000)}),
'uploader': ('username', {str}),
'uploader_id': ('user', '_id', {str}),
'view_count': ('views', {int_or_none}),

View File

@@ -33,24 +33,6 @@ class AnvatoIE(InfoExtractor):
_AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce' # from anvplayer.min.js
_TESTS = [{
# from https://www.nfl.com/videos/baker-mayfield-s-game-changing-plays-from-3-td-game-week-14
'url': 'anvato:GXvEgwyJeWem8KCYXfeoHWknwP48Mboj:899441',
'md5': '921919dab3cd0b849ff3d624831ae3e2',
'info_dict': {
'id': '899441',
'ext': 'mp4',
'title': 'Baker Mayfield\'s game-changing plays from 3-TD game Week 14',
'description': 'md5:85e05a3cc163f8c344340f220521136d',
'upload_date': '20201215',
'timestamp': 1608009755,
'thumbnail': r're:^https?://.*\.jpg',
'uploader': 'NFL',
'tags': ['Baltimore Ravens at Cleveland Browns (2020-REG-14)', 'Baker Mayfield', 'Game Highlights',
'Player Highlights', 'Cleveland Browns', 'league'],
'duration': 157,
'categories': ['Entertainment', 'Game', 'Highlights'],
},
}, {
# from https://ktla.com/news/99-year-old-woman-learns-to-fly-in-torrance-checks-off-bucket-list-dream/
'url': 'anvato:X8POa4zpGZMmeiq0wqiO8IP5rMqQM9VN:8032455',
'md5': '837718bcfb3a7778d022f857f7a9b19e',
@@ -241,31 +223,6 @@ class AnvatoIE(InfoExtractor):
'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582',
}
def _generate_nfl_token(self, anvack, mcp_id):
reroute = self._download_json(
'https://api.nfl.com/v1/reroute', mcp_id, data=b'grant_type=client_credentials',
headers={'X-Domain-Id': 100}, note='Fetching token info')
token_type = reroute.get('token_type') or 'Bearer'
auth_token = f'{token_type} {reroute["access_token"]}'
response = self._download_json(
'https://api.nfl.com/v3/shield/', mcp_id, data=json.dumps({
'query': '''{
viewer {
mediaToken(anvack: "%s", id: %s) {
token
}
}
}''' % (anvack, mcp_id), # noqa: UP031
}).encode(), headers={
'Authorization': auth_token,
'Content-Type': 'application/json',
}, note='Fetching NFL API token')
return traverse_obj(response, ('data', 'viewer', 'mediaToken', 'token'))
_TOKEN_GENERATORS = {
'GXvEgwyJeWem8KCYXfeoHWknwP48Mboj': _generate_nfl_token,
}
def _server_time(self, access_key, video_id):
return int_or_none(traverse_obj(self._download_json(
f'{self._API_BASE_URL}/server_time', video_id, query={'anvack': access_key},
@@ -290,8 +247,6 @@ class AnvatoIE(InfoExtractor):
}
if extracted_token is not None:
api['anvstk2'] = extracted_token
elif self._TOKEN_GENERATORS.get(access_key) is not None:
api['anvstk2'] = self._TOKEN_GENERATORS[access_key](self, access_key, video_id)
elif self._ANVACK_TABLE.get(access_key) is not None:
api['anvstk'] = md5_text(f'{access_key}|{anvrid}|{server_time}|{self._ANVACK_TABLE[access_key]}')
else:

View File

@@ -1,27 +1,42 @@
from .common import InfoExtractor
from ..utils import (
clean_html,
clean_podcast_url,
get_element_by_class,
int_or_none,
parse_iso8601,
try_get,
)
from ..utils.traversal import traverse_obj
class ApplePodcastsIE(InfoExtractor):
_VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P<id>\d+)'
_TESTS = [{
'url': 'https://podcasts.apple.com/us/podcast/ferreck-dawn-to-the-break-of-dawn-117/id1625658232?i=1000665010654',
'md5': '82cc219b8cc1dcf8bfc5a5e99b23b172',
'info_dict': {
'id': '1000665010654',
'ext': 'mp3',
'title': 'Ferreck Dawn - To The Break of Dawn 117',
'episode': 'Ferreck Dawn - To The Break of Dawn 117',
'description': 'md5:1fc571102f79dbd0a77bfd71ffda23bc',
'upload_date': '20240812',
'timestamp': 1723449600,
'duration': 3596,
'series': 'Ferreck Dawn - To The Break of Dawn',
'thumbnail': 're:.+[.](png|jpe?g|webp)',
},
}, {
'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
'md5': '41dc31cd650143e530d9423b6b5a344f',
'md5': 'baf8a6b8b8aa6062dbb4639ed73d0052',
'info_dict': {
'id': '1000482637777',
'ext': 'mp3',
'title': '207 - Whitney Webb Returns',
'episode': '207 - Whitney Webb Returns',
'episode_number': 207,
'description': 'md5:75ef4316031df7b41ced4e7b987f79c6',
'upload_date': '20200705',
'timestamp': 1593932400,
'duration': 6454,
'duration': 5369,
'series': 'The Tim Dillon Show',
'thumbnail': 're:.+[.](png|jpe?g|webp)',
},
@@ -39,47 +54,24 @@ class ApplePodcastsIE(InfoExtractor):
def _real_extract(self, url):
episode_id = self._match_id(url)
webpage = self._download_webpage(url, episode_id)
episode_data = {}
ember_data = {}
# new page type 2021-11
amp_data = self._parse_json(self._search_regex(
r'(?s)id="shoebox-media-api-cache-amp-podcasts"[^>]*>\s*({.+?})\s*<',
webpage, 'AMP data', default='{}'), episode_id, fatal=False) or {}
amp_data = try_get(amp_data,
lambda a: self._parse_json(
next(a[x] for x in iter(a) if episode_id in x),
episode_id),
dict) or {}
amp_data = amp_data.get('d') or []
episode_data = try_get(
amp_data,
lambda a: next(x for x in a
if x['type'] == 'podcast-episodes' and x['id'] == episode_id),
dict)
if not episode_data:
# try pre 2021-11 page type: TODO: consider deleting if no longer used
ember_data = self._parse_json(self._search_regex(
r'(?s)id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<',
webpage, 'ember data'), episode_id) or {}
ember_data = ember_data.get(episode_id) or ember_data
episode_data = try_get(ember_data, lambda x: x['data'], dict)
episode = episode_data['attributes']
description = episode.get('description') or {}
series = None
for inc in (amp_data or ember_data.get('included') or []):
if inc.get('type') == 'media/podcast':
series = try_get(inc, lambda x: x['attributes']['name'])
series = series or clean_html(get_element_by_class('podcast-header__identity', webpage))
server_data = self._search_json(
r'<script [^>]*\bid=["\']serialized-server-data["\'][^>]*>', webpage,
'server data', episode_id, contains_pattern=r'\[{(?s:.+)}\]')[0]['data']
model_data = traverse_obj(server_data, (
'headerButtonItems', lambda _, v: v['$kind'] == 'bookmark' and v['modelType'] == 'EpisodeOffer',
'model', {dict}, any))
return {
'id': episode_id,
'title': episode.get('name'),
'url': clean_podcast_url(episode['assetUrl']),
'description': description.get('standard') or description.get('short'),
'timestamp': parse_iso8601(episode.get('releaseDateTime')),
'duration': int_or_none(episode.get('durationInMilliseconds'), 1000),
'series': series,
**self._json_ld(
traverse_obj(server_data, ('seoData', 'schemaContent', {dict}))
or self._yield_json_ld(webpage, episode_id, fatal=False), episode_id, fatal=False),
**traverse_obj(model_data, {
'title': ('title', {str}),
'url': ('streamUrl', {clean_podcast_url}),
'timestamp': ('releaseDate', {parse_iso8601}),
'duration': ('duration', {int_or_none}),
}),
'thumbnail': self._og_search_thumbnail(webpage),
'vcodec': 'none',
}

View File

@@ -4,6 +4,7 @@ from .common import InfoExtractor
from ..utils import (
extract_attributes,
int_or_none,
join_nonempty,
parse_iso8601,
try_get,
)
@@ -136,7 +137,7 @@ class ArcPublishingIE(InfoExtractor):
else:
vbr = int_or_none(s.get('bitrate'))
formats.append({
'format_id': f'{stream_type}-{vbr}' if vbr else stream_type,
'format_id': join_nonempty(stream_type, vbr),
'vbr': vbr,
'width': int_or_none(s.get('width')),
'height': int_or_none(s.get('height')),

View File

@@ -231,7 +231,7 @@ class ARDIE(InfoExtractor):
class ARDBetaMediathekIE(InfoExtractor):
IE_NAME = 'ARDMediathek'
_VALID_URL = r'''(?x)https://
_VALID_URL = r'''(?x)https?://
(?:(?:beta|www)\.)?ardmediathek\.de/
(?:[^/]+/)?
(?:player|live|video)/
@@ -299,7 +299,7 @@ class ARDBetaMediathekIE(InfoExtractor):
'info_dict': {
'id': '94834686',
'ext': 'mp4',
'duration': 2700,
'duration': 2670,
'episode': '7 Tage ... unter harten Jungs',
'description': 'md5:0f215470dcd2b02f59f4bd10c963f072',
'upload_date': '20231005',
@@ -307,10 +307,28 @@ class ARDBetaMediathekIE(InfoExtractor):
'display_id': 'N2I2YmM5MzgtNWFlOS00ZGFlLTg2NzMtYzNjM2JlNjk4MDg3',
'series': '7 Tage ...',
'channel': 'HR',
'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:f6e6d5ffac41925c?w=960&ch=fa32ba69bc87989a',
'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:430c86d233afa42d?w=960&ch=fa32ba69bc87989a',
'title': '7 Tage ... unter harten Jungs',
'_old_archive_ids': ['ardbetamediathek N2I2YmM5MzgtNWFlOS00ZGFlLTg2NzMtYzNjM2JlNjk4MDg3'],
},
}, {
'url': 'https://www.ardmediathek.de/video/lokalzeit-aus-duesseldorf/lokalzeit-aus-duesseldorf-oder-31-10-2024/wdr-duesseldorf/Y3JpZDovL3dkci5kZS9CZWl0cmFnLXNvcGhvcmEtOWFkMTc0ZWMtMDA5ZS00ZDEwLWFjYjctMGNmNTdhNzVmNzUz',
'info_dict': {
'id': '13847165',
'chapters': 'count:8',
'ext': 'mp4',
'channel': 'WDR',
'display_id': 'Y3JpZDovL3dkci5kZS9CZWl0cmFnLXNvcGhvcmEtOWFkMTc0ZWMtMDA5ZS00ZDEwLWFjYjctMGNmNTdhNzVmNzUz',
'episode': 'Lokalzeit aus Düsseldorf | 31.10.2024',
'series': 'Lokalzeit aus Düsseldorf',
'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:f02ec9bd9b7bd5f6?w=960&ch=612491dcd5e09b0c',
'title': 'Lokalzeit aus Düsseldorf | 31.10.2024',
'upload_date': '20241031',
'timestamp': 1730399400,
'description': 'md5:12db30b3b706314efe3778b8df1a7058',
'duration': 1759,
'_old_archive_ids': ['ardbetamediathek Y3JpZDovL3dkci5kZS9CZWl0cmFnLXNvcGhvcmEtOWFkMTc0ZWMtMDA5ZS00ZDEwLWFjYjctMGNmNTdhNzVmNzUz'],
},
}, {
'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
'only_matching': True,
@@ -455,6 +473,12 @@ class ARDBetaMediathekIE(InfoExtractor):
'subtitles': subtitles,
'is_live': is_live,
'age_limit': age_limit,
**traverse_obj(media_data, {
'chapters': ('pluginData', 'jumpmarks@all', 'chapterArray', lambda _, v: int_or_none(v['chapterTime']), {
'start_time': ('chapterTime', {int_or_none}),
'title': ('chapterTitle', {str}),
}),
}),
**traverse_obj(media_data, ('meta', {
'title': 'title',
'description': 'synopsis',
@@ -470,7 +494,7 @@ class ARDBetaMediathekIE(InfoExtractor):
class ARDMediathekCollectionIE(InfoExtractor):
_VALID_URL = r'''(?x)https://
_VALID_URL = r'''(?x)https?://
(?:(?:beta|www)\.)?ardmediathek\.de/
(?:[^/?#]+/)?
(?P<playlist>sendung|serie|sammlung)/

View File

@@ -131,8 +131,8 @@ class ArkenaIE(InfoExtractor):
formats.extend(self._extract_f4m_formats(
href, video_id, f4m_id='hds', fatal=False))
elif mime_type == 'application/dash+xml':
formats.extend(self._extract_f4m_formats(
href, video_id, f4m_id='hds', fatal=False))
formats.extend(self._extract_mpd_formats(
href, video_id, mpd_id='dash', fatal=False))
elif mime_type == 'application/vnd.ms-sstr+xml':
formats.extend(self._extract_ism_formats(
href, video_id, ism_id='mss', fatal=False))

View File

@@ -101,9 +101,10 @@ class AsobiStageIE(InfoExtractor):
self._HEADERS['Authorization'] = f'Bearer {token}'
def _real_extract(self, url):
video_id, event, type_, slug = self._match_valid_url(url).group('id', 'event', 'type', 'slug')
webpage, urlh = self._download_webpage_handle(url, self._match_id(url))
video_id, event, type_, slug = self._match_valid_url(urlh.url).group('id', 'event', 'type', 'slug')
video_type = {'archive': 'archives', 'player': 'broadcasts'}[type_]
webpage = self._download_webpage(url, video_id)
event_data = traverse_obj(
self._search_nextjs_data(webpage, video_id, default={}),
('props', 'pageProps', 'eventCMSData', {

View File

@@ -33,14 +33,6 @@ class AtresPlayerIE(InfoExtractor):
]
_API_BASE = 'https://api.atresplayer.com/'
def _handle_error(self, e, code):
if isinstance(e.cause, HTTPError) and e.cause.status == code:
error = self._parse_json(e.cause.response.read(), None)
if error.get('error') == 'required_registered':
self.raise_login_required()
raise ExtractorError(error['error_description'], expected=True)
raise
def _perform_login(self, username, password):
self._request_webpage(
self._API_BASE + 'login', None, 'Downloading login page')
@@ -55,7 +47,9 @@ class AtresPlayerIE(InfoExtractor):
'password': password,
}))['targetUrl']
except ExtractorError as e:
self._handle_error(e, 400)
if isinstance(e.cause, HTTPError) and e.cause.status == 400:
raise ExtractorError('Invalid username and/or password', expected=True)
raise
self._request_webpage(target_url, None, 'Following Target URL')
@@ -66,7 +60,12 @@ class AtresPlayerIE(InfoExtractor):
episode = self._download_json(
self._API_BASE + 'client/v1/player/episode/' + video_id, video_id)
except ExtractorError as e:
self._handle_error(e, 403)
if isinstance(e.cause, HTTPError) and e.cause.status == 403:
error = self._parse_json(e.cause.response.read(), None)
if error.get('error') == 'required_registered':
self.raise_login_required()
raise ExtractorError(error['error_description'], expected=True)
raise
title = episode['titulo']

View File

@@ -4,9 +4,13 @@ import urllib.parse
from .common import InfoExtractor
from ..utils import (
InAdvancePagedList,
determine_ext,
format_field,
int_or_none,
join_nonempty,
traverse_obj,
unified_timestamp,
url_or_none,
)
@@ -30,6 +34,7 @@ class BanByeBaseIE(InfoExtractor):
class BanByeIE(BanByeBaseIE):
_VALID_URL = r'https?://(?:www\.)?banbye\.com/(?:en/)?watch/(?P<id>[\w-]+)'
_TESTS = [{
# ['src']['mp4']['levels'] direct mp4 urls only
'url': 'https://banbye.com/watch/v_ytfmvkVYLE8T',
'md5': '2f4ea15c5ca259a73d909b2cfd558eb5',
'info_dict': {
@@ -58,6 +63,7 @@ class BanByeIE(BanByeBaseIE):
},
'playlist_mincount': 9,
}, {
# ['src']['mp4']['levels'] direct mp4 urls only
'url': 'https://banbye.com/watch/v_kb6_o1Kyq-CD',
'info_dict': {
'id': 'v_kb6_o1Kyq-CD',
@@ -77,6 +83,48 @@ class BanByeIE(BanByeBaseIE):
'view_count': int,
'comment_count': int,
},
}, {
# ['src']['hls']['levels'] variant m3u8 urls only; master m3u8 is 404
'url': 'https://banbye.com/watch/v_a_gPFuC9LoW5',
'info_dict': {
'id': 'v_a_gPFuC9LoW5',
'ext': 'mp4',
'title': 'md5:183524056bebdfa245fd6d214f63c0fe',
'description': 'md5:943ac87287ca98d28d8b8797719827c6',
'uploader': 'wRealu24',
'channel_id': 'ch_wrealu24',
'channel_url': 'https://banbye.com/channel/ch_wrealu24',
'upload_date': '20231113',
'timestamp': 1699874062,
'view_count': int,
'like_count': int,
'dislike_count': int,
'comment_count': int,
'thumbnail': 'https://cdn.banbye.com/video/v_a_gPFuC9LoW5/96.webp',
'tags': ['jaszczur', 'sejm', 'lewica', 'polska', 'ukrainizacja', 'pierwszeposiedzeniesejmu'],
},
'expected_warnings': ['Failed to download m3u8'],
}, {
# ['src']['hls']['masterPlaylist'] m3u8 only
'url': 'https://banbye.com/watch/v_B0rsKWsr-aaa',
'info_dict': {
'id': 'v_B0rsKWsr-aaa',
'ext': 'mp4',
'title': 'md5:00b254164b82101b3f9e5326037447ed',
'description': 'md5:3fd8b48aa81954ba024bc60f5de6e167',
'uploader': 'PSTV Piotr Szlachtowicz ',
'channel_id': 'ch_KV9EVObkB9wB',
'channel_url': 'https://banbye.com/channel/ch_KV9EVObkB9wB',
'upload_date': '20240629',
'timestamp': 1719646816,
'duration': 2377,
'view_count': int,
'like_count': int,
'dislike_count': int,
'comment_count': int,
'thumbnail': 'https://cdn.banbye.com/video/v_B0rsKWsr-aaa/96.webp',
'tags': ['Biden', 'Trump', 'Wybory', 'USA'],
},
}]
def _real_extract(self, url):
@@ -91,11 +139,24 @@ class BanByeIE(BanByeBaseIE):
'id': f'{quality}p',
'url': f'{self._CDN_BASE}/video/{video_id}/{quality}.webp',
} for quality in [48, 96, 144, 240, 512, 1080]]
formats = [{
'format_id': f'http-{quality}p',
'quality': quality,
'url': f'{self._CDN_BASE}/video/{video_id}/{quality}.mp4',
} for quality in data['quality']]
formats = []
url_data = self._download_json(f'{self._API_BASE}/videos/{video_id}/url', video_id, data=b'')
if master_url := traverse_obj(url_data, ('src', 'hls', 'masterPlaylist', {url_or_none})):
formats = self._extract_m3u8_formats(master_url, video_id, 'mp4', m3u8_id='hls', fatal=False)
for format_id, format_url in traverse_obj(url_data, (
'src', ('mp4', 'hls'), 'levels', {dict.items}, lambda _, v: url_or_none(v[1]))):
ext = determine_ext(format_url)
is_hls = ext == 'm3u8'
formats.append({
'url': format_url,
'ext': 'mp4' if is_hls else ext,
'format_id': join_nonempty(is_hls and 'hls', format_id),
'protocol': 'm3u8_native' if is_hls else 'https',
'height': int_or_none(format_id),
})
self._remove_duplicate_formats(formats)
return {
'id': video_id,

View File

@@ -1,3 +1,4 @@
import json
import random
import re
import time
@@ -6,6 +7,7 @@ from .common import InfoExtractor
from ..utils import (
KNOWN_EXTENSIONS,
ExtractorError,
extract_attributes,
float_or_none,
int_or_none,
parse_filesize,
@@ -17,6 +19,7 @@ from ..utils import (
url_or_none,
urljoin,
)
from ..utils.traversal import find_element, traverse_obj
class BandcampIE(InfoExtractor):
@@ -40,8 +43,10 @@ class BandcampIE(InfoExtractor):
'uploader_url': 'https://youtube-dl.bandcamp.com',
'uploader_id': 'youtube-dl',
'thumbnail': 'https://f4.bcbits.com/img/a3216802731_5.jpg',
'artists': ['youtube-dl "\'/\\ä↭'],
'album_artists': ['youtube-dl "\'/\\ä↭'],
},
'_skip': 'There is a limit of 200 free downloads / month for the test song',
'skip': 'There is a limit of 200 free downloads / month for the test song',
}, {
# free download
'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
@@ -266,6 +271,18 @@ class BandcampAlbumIE(BandcampIE): # XXX: Do not subclass from concrete IE
'timestamp': 1311756226,
'upload_date': '20110727',
'uploader': 'Blazo',
'thumbnail': 'https://f4.bcbits.com/img/a1721150828_5.jpg',
'album_artists': ['Blazo'],
'uploader_url': 'https://blazo.bandcamp.com',
'release_date': '20110727',
'release_timestamp': 1311724800.0,
'track': 'Intro',
'uploader_id': 'blazo',
'track_number': 1,
'album': 'Jazz Format Mixtape vol.1',
'artists': ['Blazo'],
'duration': 19.335,
'track_id': '1353101989',
},
},
{
@@ -277,6 +294,18 @@ class BandcampAlbumIE(BandcampIE): # XXX: Do not subclass from concrete IE
'timestamp': 1311757238,
'upload_date': '20110727',
'uploader': 'Blazo',
'track': 'Kero One - Keep It Alive (Blazo remix)',
'release_date': '20110727',
'track_id': '38097443',
'track_number': 2,
'duration': 181.467,
'uploader_url': 'https://blazo.bandcamp.com',
'album': 'Jazz Format Mixtape vol.1',
'uploader_id': 'blazo',
'album_artists': ['Blazo'],
'artists': ['Blazo'],
'thumbnail': 'https://f4.bcbits.com/img/a1721150828_5.jpg',
'release_timestamp': 1311724800.0,
},
},
],
@@ -284,6 +313,7 @@ class BandcampAlbumIE(BandcampIE): # XXX: Do not subclass from concrete IE
'title': 'Jazz Format Mixtape vol.1',
'id': 'jazz-format-mixtape-vol-1',
'uploader_id': 'blazo',
'description': 'md5:38052a93217f3ffdc033cd5dbbce2989',
},
'params': {
'playlistend': 2,
@@ -358,10 +388,10 @@ class BandcampWeeklyIE(BandcampIE): # XXX: Do not subclass from concrete IE
_VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)'
_TESTS = [{
'url': 'https://bandcamp.com/?show=224',
'md5': 'b00df799c733cf7e0c567ed187dea0fd',
'md5': '61acc9a002bed93986b91168aa3ab433',
'info_dict': {
'id': '224',
'ext': 'opus',
'ext': 'mp3',
'title': 'BC Weekly April 4th 2017 - Magic Moments',
'description': 'md5:5d48150916e8e02d030623a48512c874',
'duration': 5829.77,
@@ -371,7 +401,7 @@ class BandcampWeeklyIE(BandcampIE): # XXX: Do not subclass from concrete IE
'episode_id': '224',
},
'params': {
'format': 'opus-lo',
'format': 'mp3-128',
},
}, {
'url': 'https://bandcamp.com/?blah/blah@&show=228',
@@ -459,7 +489,7 @@ class BandcampUserIE(InfoExtractor):
},
}, {
'url': 'https://coldworldofficial.bandcamp.com/music',
'playlist_mincount': 10,
'playlist_mincount': 7,
'info_dict': {
'id': 'coldworldofficial',
'title': 'Discography of coldworldofficial',
@@ -473,12 +503,19 @@ class BandcampUserIE(InfoExtractor):
},
}]
def _yield_items(self, webpage):
yield from (
re.findall(r'<li data-item-id=["\'][^>]+>\s*<a href=["\'](?![^"\'/]*?/merch)([^"\']+)', webpage)
or re.findall(r'<div[^>]+trackTitle["\'][^"\']+["\']([^"\']+)', webpage))
yield from traverse_obj(webpage, (
{find_element(id='music-grid', html=True)}, {extract_attributes},
'data-client-items', {json.loads}, ..., 'page_url', {str}))
def _real_extract(self, url):
uploader = self._match_id(url)
webpage = self._download_webpage(url, uploader)
discography_data = (re.findall(r'<li data-item-id=["\'][^>]+>\s*<a href=["\'](?![^"\'/]*?/merch)([^"\']+)', webpage)
or re.findall(r'<div[^>]+trackTitle["\'][^"\']+["\']([^"\']+)', webpage))
return self.playlist_from_matches(
discography_data, uploader, f'Discography of {uploader}', getter=lambda x: urljoin(url, x))
self._yield_items(webpage), uploader, f'Discography of {uploader}',
getter=urljoin(url))

View File

@@ -1284,9 +1284,9 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
**traverse_obj(model, {
'title': ('title', {str}),
'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}),
'description': ('synopses', ('long', 'medium', 'short'), {str}, {lambda x: x or None}, any),
'description': ('synopses', ('long', 'medium', 'short'), {str}, filter, any),
'duration': ('versions', 0, 'duration', {int}),
'timestamp': ('versions', 0, 'availableFrom', {functools.partial(int_or_none, scale=1000)}),
'timestamp': ('versions', 0, 'availableFrom', {int_or_none(scale=1000)}),
}),
}
@@ -1386,7 +1386,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
formats = traverse_obj(media_data, ('playlist', lambda _, v: url_or_none(v['url']), {
'url': ('url', {url_or_none}),
'ext': ('format', {str}),
'tbr': ('bitrate', {functools.partial(int_or_none, scale=1000)}),
'tbr': ('bitrate', {int_or_none(scale=1000)}),
}))
if formats:
entry = {
@@ -1398,7 +1398,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'title': ('title', {str}),
'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}),
'description': ('synopses', ('long', 'medium', 'short'), {str}, any),
'timestamp': ('firstPublished', {functools.partial(int_or_none, scale=1000)}),
'timestamp': ('firstPublished', {int_or_none(scale=1000)}),
}),
}
done = True
@@ -1428,7 +1428,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
if not entry.get('timestamp'):
entry['timestamp'] = traverse_obj(next_data, (
..., 'contents', is_type('timestamp'), 'model',
'timestamp', {functools.partial(int_or_none, scale=1000)}, any))
'timestamp', {int_or_none(scale=1000)}, any))
entries.append(entry)
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)

View File

@@ -0,0 +1,68 @@
import json
from .common import InfoExtractor
from ..utils import (
ExtractorError,
parse_iso8601,
traverse_obj,
)
class BeaconTvIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?beacon\.tv/content/(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://beacon.tv/content/welcome-to-beacon',
'md5': 'b3f5932d437f288e662f10f3bfc5bd04',
'info_dict': {
'id': 'welcome-to-beacon',
'ext': 'mp4',
'upload_date': '20240509',
'description': 'md5:ea2bd32e71acf3f9fca6937412cc3563',
'thumbnail': 'https://cdn.jwplayer.com/v2/media/I4CkkEvN/poster.jpg?width=720',
'title': 'Your home for Critical Role!',
'timestamp': 1715227200,
'duration': 105.494,
},
}, {
'url': 'https://beacon.tv/content/re-slayers-take-trailer',
'md5': 'd879b091485dbed2245094c8152afd89',
'info_dict': {
'id': 're-slayers-take-trailer',
'ext': 'mp4',
'title': 'The Re-Slayers Take | Official Trailer',
'timestamp': 1715189040,
'upload_date': '20240508',
'duration': 53.249,
'thumbnail': 'https://cdn.jwplayer.com/v2/media/PW5ApIw3/poster.jpg?width=720',
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
content_data = traverse_obj(self._search_nextjs_data(webpage, video_id), (
'props', 'pageProps', '__APOLLO_STATE__',
lambda k, v: k.startswith('Content:') and v['slug'] == video_id, any))
if not content_data:
raise ExtractorError('Failed to extract content data')
jwplayer_data = traverse_obj(content_data, (
(('contentVideo', 'video', 'videoData'),
('contentPodcast', 'podcast', 'audioData')), {json.loads}, {dict}, any))
if not jwplayer_data:
if content_data.get('contentType') not in ('videoPodcast', 'video', 'podcast'):
raise ExtractorError('Content is not a video/podcast', expected=True)
if traverse_obj(content_data, ('contentTier', '__ref')) != 'MemberTier:65b258d178f89be87b4dc0a4':
self.raise_login_required('This video/podcast is for members only')
raise ExtractorError('Failed to extract content')
return {
**self._parse_jwplayer_data(jwplayer_data, video_id),
**traverse_obj(content_data, {
'title': ('title', {str}),
'description': ('description', {str}),
'timestamp': ('publishedAt', {parse_iso8601}),
}),
}

View File

@@ -1,18 +1,33 @@
import re
from .common import InfoExtractor
from ..utils import extract_attributes
from ..utils import ExtractorError, extract_attributes
class BFMTVBaseIE(InfoExtractor):
_VALID_URL_BASE = r'https?://(?:www\.|rmc\.)?bfmtv\.com/'
_VALID_URL_TMPL = _VALID_URL_BASE + r'(?:[^/]+/)*[^/?&#]+_%s[A-Z]-(?P<id>\d{12})\.html'
_VIDEO_BLOCK_REGEX = r'(<div[^>]+class="video_block[^"]*"[^>]*>)'
_VIDEO_BLOCK_REGEX = r'(<div[^>]+class="video_block[^"]*"[^>]*>.*?</div>)'
_VIDEO_ELEMENT_REGEX = r'(<video-js[^>]+>)'
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'
def _brightcove_url_result(self, video_id, video_block):
account_id = video_block.get('accountid') or '876450612001'
player_id = video_block.get('playerid') or 'I2qBTln4u'
def _extract_video(self, video_block):
video_element = self._search_regex(
self._VIDEO_ELEMENT_REGEX, video_block, 'video element', default=None)
if video_element:
video_element_attrs = extract_attributes(video_element)
video_id = video_element_attrs.get('data-video-id')
if not video_id:
return
account_id = video_element_attrs.get('data-account') or '876450610001'
player_id = video_element_attrs.get('adjustplayer') or '19dszYXgm'
else:
video_block_attrs = extract_attributes(video_block)
video_id = video_block_attrs.get('videoid')
if not video_id:
return
account_id = video_block_attrs.get('accountid') or '876630703001'
player_id = video_block_attrs.get('playerid') or 'KbPwEbuHx'
return self.url_result(
self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id),
'BrightcoveNew', video_id)
@@ -40,23 +55,25 @@ class BFMTVIE(BFMTVBaseIE):
def _real_extract(self, url):
bfmtv_id = self._match_id(url)
webpage = self._download_webpage(url, bfmtv_id)
video_block = extract_attributes(self._search_regex(
video = self._extract_video(self._search_regex(
self._VIDEO_BLOCK_REGEX, webpage, 'video block'))
return self._brightcove_url_result(video_block['videoid'], video_block)
if not video:
raise ExtractorError('Failed to extract video')
return video
class BFMTVLiveIE(BFMTVIE): # XXX: Do not subclass from concrete IE
class BFMTVLiveIE(BFMTVBaseIE):
IE_NAME = 'bfmtv:live'
_VALID_URL = BFMTVBaseIE._VALID_URL_BASE + '(?P<id>(?:[^/]+/)?en-direct)'
_TESTS = [{
'url': 'https://www.bfmtv.com/en-direct/',
'info_dict': {
'id': '5615950982001',
'id': '6346069778112',
'ext': 'mp4',
'title': r're:^le direct BFMTV WEB \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
'title': r're:^Le Live BFM TV \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
'uploader_id': '876450610001',
'upload_date': '20220926',
'timestamp': 1664207191,
'upload_date': '20240202',
'timestamp': 1706887572,
'live_status': 'is_live',
'thumbnail': r're:https://.+/image\.jpg',
'tags': [],
@@ -69,6 +86,15 @@ class BFMTVLiveIE(BFMTVIE): # XXX: Do not subclass from concrete IE
'only_matching': True,
}]
def _real_extract(self, url):
bfmtv_id = self._match_id(url)
webpage = self._download_webpage(url, bfmtv_id)
video = self._extract_video(self._search_regex(
self._VIDEO_BLOCK_REGEX, webpage, 'video block'))
if not video:
raise ExtractorError('Failed to extract video')
return video
class BFMTVArticleIE(BFMTVBaseIE):
IE_NAME = 'bfmtv:article'
@@ -102,18 +128,16 @@ class BFMTVArticleIE(BFMTVBaseIE):
},
}]
def _entries(self, webpage):
for video_block_el in re.findall(self._VIDEO_BLOCK_REGEX, webpage):
video = self._extract_video(video_block_el)
if video:
yield video
def _real_extract(self, url):
bfmtv_id = self._match_id(url)
webpage = self._download_webpage(url, bfmtv_id)
entries = []
for video_block_el in re.findall(self._VIDEO_BLOCK_REGEX, webpage):
video_block = extract_attributes(video_block_el)
video_id = video_block.get('videoid')
if not video_id:
continue
entries.append(self._brightcove_url_result(video_id, video_block))
return self.playlist_result(
entries, bfmtv_id, self._og_search_title(webpage, fatal=False),
self._entries(webpage), bfmtv_id, self._og_search_title(webpage, fatal=False),
self._html_search_meta(['og:description', 'description'], webpage))

View File

@@ -1,4 +1,3 @@
import functools
from .common import InfoExtractor
from ..utils import (
@@ -50,7 +49,7 @@ class BibelTVBaseIE(InfoExtractor):
**traverse_obj(data, {
'title': 'title',
'description': 'description',
'duration': ('duration', {functools.partial(int_or_none, scale=1000)}),
'duration': ('duration', {int_or_none(scale=1000)}),
'timestamp': ('schedulingStart', {parse_iso8601}),
'season_number': 'seasonNumber',
'episode_number': 'episodeNumber',

View File

@@ -31,12 +31,12 @@ from ..utils import (
mimetype2ext,
parse_count,
parse_qs,
parse_resolution,
qualities,
smuggle_url,
srt_subtitles_timecode,
str_or_none,
traverse_obj,
try_call,
unified_timestamp,
unsmuggle_url,
url_or_none,
@@ -46,7 +46,25 @@ from ..utils import (
class BilibiliBaseIE(InfoExtractor):
_HEADERS = {'Referer': 'https://www.bilibili.com/'}
_FORMAT_ID_RE = re.compile(r'-(\d+)\.m4s\?')
_WBI_KEY_CACHE_TIMEOUT = 30 # exact expire timeout is unclear, use 30s for one session
_wbi_key_cache = {}
@property
def is_logged_in(self):
return bool(self._get_cookies('https://api.bilibili.com').get('SESSDATA'))
def _check_missing_formats(self, play_info, formats):
parsed_qualities = set(traverse_obj(formats, (..., 'quality')))
missing_formats = join_nonempty(*[
traverse_obj(fmt, 'new_description', 'display_desc', 'quality')
for fmt in traverse_obj(play_info, (
'support_formats', lambda _, v: v['quality'] not in parsed_qualities))], delim=', ')
if missing_formats:
self.to_screen(
f'Format(s) {missing_formats} are missing; you have to login or '
f'become a premium member to download them. {self._login_hint()}')
def extract_formats(self, play_info):
format_names = {
@@ -86,18 +104,75 @@ class BilibiliBaseIE(InfoExtractor):
'format': format_names.get(video.get('id')),
} for video in traverse_obj(play_info, ('dash', 'video', ...)))
missing_formats = format_names.keys() - set(traverse_obj(formats, (..., 'quality')))
if missing_formats:
self.to_screen(f'Format(s) {", ".join(format_names[i] for i in missing_formats)} are missing; '
f'you have to login or become premium member to download them. {self._login_hint()}')
if formats:
self._check_missing_formats(play_info, formats)
fragments = traverse_obj(play_info, ('durl', lambda _, v: url_or_none(v['url']), {
'url': ('url', {url_or_none}),
'duration': ('length', {float_or_none(scale=1000)}),
'filesize': ('size', {int_or_none}),
}))
if fragments:
formats.append({
'url': fragments[0]['url'],
'filesize': sum(traverse_obj(fragments, (..., 'filesize'))),
**({
'fragments': fragments,
'protocol': 'http_dash_segments',
} if len(fragments) > 1 else {}),
**traverse_obj(play_info, {
'quality': ('quality', {int_or_none}),
'format_id': ('quality', {str_or_none}),
'format_note': ('quality', {lambda x: format_names.get(x)}),
'duration': ('timelength', {float_or_none(scale=1000)}),
}),
**parse_resolution(format_names.get(play_info.get('quality'))),
})
return formats
def _download_playinfo(self, video_id, cid, headers=None):
def _get_wbi_key(self, video_id):
if time.time() < self._wbi_key_cache.get('ts', 0) + self._WBI_KEY_CACHE_TIMEOUT:
return self._wbi_key_cache['key']
session_data = self._download_json(
'https://api.bilibili.com/x/web-interface/nav', video_id, note='Downloading wbi sign')
lookup = ''.join(traverse_obj(session_data, (
'data', 'wbi_img', ('img_url', 'sub_url'),
{lambda x: x.rpartition('/')[2].partition('.')[0]})))
# from getMixinKey() in the vendor js
mixin_key_enc_tab = [
46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35, 27, 43, 5, 49,
33, 9, 42, 19, 29, 28, 14, 39, 12, 38, 41, 13, 37, 48, 7, 16, 24, 55, 40,
61, 26, 17, 0, 1, 60, 51, 30, 4, 22, 25, 54, 21, 56, 59, 6, 63, 57, 62, 11,
36, 20, 34, 44, 52,
]
self._wbi_key_cache.update({
'key': ''.join(lookup[i] for i in mixin_key_enc_tab)[:32],
'ts': time.time(),
})
return self._wbi_key_cache['key']
def _sign_wbi(self, params, video_id):
params['wts'] = round(time.time())
params = {
k: ''.join(filter(lambda char: char not in "!'()*", str(v)))
for k, v in sorted(params.items())
}
query = urllib.parse.urlencode(params)
params['w_rid'] = hashlib.md5(f'{query}{self._get_wbi_key(video_id)}'.encode()).hexdigest()
return params
def _download_playinfo(self, bvid, cid, headers=None, qn=None):
params = {'bvid': bvid, 'cid': cid, 'fnval': 4048}
if qn:
params['qn'] = qn
return self._download_json(
'https://api.bilibili.com/x/player/playurl', video_id,
query={'bvid': video_id, 'cid': cid, 'fnval': 4048},
note=f'Downloading video formats for cid {cid}', headers=headers)['data']
'https://api.bilibili.com/x/player/wbi/playurl', bvid,
query=self._sign_wbi(params, bvid), headers=headers,
note=f'Downloading video formats for cid {cid} {qn or ""}')['data']
def json2srt(self, json_data):
srt_data = ''
@@ -115,15 +190,15 @@ class BilibiliBaseIE(InfoExtractor):
}],
}
subtitle_info = traverse_obj(self._download_json(
video_info = self._download_json(
'https://api.bilibili.com/x/player/v2', video_id,
query={'aid': aid, 'cid': cid} if aid else {'bvid': video_id, 'cid': cid},
note=f'Extracting subtitle info {cid}'), ('data', 'subtitle'))
subs_list = traverse_obj(subtitle_info, ('subtitles', lambda _, v: v['subtitle_url'] and v['lan']))
if not subs_list and traverse_obj(subtitle_info, 'allow_submit'):
if not self._get_cookies('https://api.bilibili.com').get('SESSDATA'): # no login session cookie
self.report_warning(f'CC subtitles (if any) are only visible when logged in. {self._login_hint()}', only_once=True)
for s in subs_list:
note=f'Extracting subtitle info {cid}', headers=self._HEADERS)
if traverse_obj(video_info, ('data', 'need_login_subtitle')):
self.report_warning(
f'Subtitles are only available when logged in. {self._login_hint()}', only_once=True)
for s in traverse_obj(video_info, (
'data', 'subtitle', 'subtitles', lambda _, v: v['subtitle_url'] and v['lan'])):
subtitles.setdefault(s['lan'], []).append({
'ext': 'srt',
'data': self.json2srt(self._download_json(s['subtitle_url'], video_id)),
@@ -133,7 +208,7 @@ class BilibiliBaseIE(InfoExtractor):
def _get_chapters(self, aid, cid):
chapters = aid and cid and self._download_json(
'https://api.bilibili.com/x/player/v2', aid, query={'aid': aid, 'cid': cid},
note='Extracting chapters', fatal=False)
note='Extracting chapters', fatal=False, headers=self._HEADERS)
return traverse_obj(chapters, ('data', 'view_points', ..., {
'title': 'content',
'start_time': 'from',
@@ -203,15 +278,15 @@ class BilibiliBaseIE(InfoExtractor):
self._get_divisions(video_id, graph_version, edges, choice['edge_id'], cid_edges=cid_edges)
return cid_edges
def _get_interactive_entries(self, video_id, cid, metainfo):
def _get_interactive_entries(self, video_id, cid, metainfo, headers=None):
graph_version = traverse_obj(
self._download_json(
'https://api.bilibili.com/x/player/wbi/v2', video_id,
'Extracting graph version', query={'bvid': video_id, 'cid': cid}),
'Extracting graph version', query={'bvid': video_id, 'cid': cid}, headers=headers),
('data', 'interaction', 'graph_version', {int_or_none}))
cid_edges = self._get_divisions(video_id, graph_version, {1: {'cid': cid}}, 1)
for cid, edges in cid_edges.items():
play_info = self._download_playinfo(video_id, cid)
play_info = self._download_playinfo(video_id, cid, headers=headers)
yield {
**metainfo,
'id': f'{video_id}_{cid}',
@@ -224,7 +299,7 @@ class BilibiliBaseIE(InfoExtractor):
class BiliBiliIE(BilibiliBaseIE):
_VALID_URL = r'https?://(?:www\.)?bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P<id>[^/?#&]+)'
_VALID_URL = r'https?://(?:www\.)?bilibili\.com/(?:video/|festival/[^/?#]+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://www.bilibili.com/video/BV13x41117TL',
@@ -243,17 +318,17 @@ class BiliBiliIE(BilibiliBaseIE):
'timestamp': 1488353834,
'like_count': int,
'view_count': int,
'_old_archive_ids': ['bilibili 8903802_part1'],
},
}, {
'note': 'old av URL version',
'url': 'http://www.bilibili.com/video/av1074402/',
'info_dict': {
'thumbnail': r're:^https?://.*\.(jpg|jpeg)$',
'id': 'BV11x411K7CN',
'ext': 'mp4',
'title': '【金坷垃】金泡沫',
'uploader': '菊子桑',
'uploader_id': '156160',
'id': 'BV11x411K7CN',
'title': '【金坷垃】金泡沫',
'duration': 308.36,
'upload_date': '20140420',
'timestamp': 1397983878,
@@ -262,6 +337,8 @@ class BiliBiliIE(BilibiliBaseIE):
'comment_count': int,
'view_count': int,
'tags': list,
'thumbnail': r're:^https?://.*\.(jpg|jpeg)$',
'_old_archive_ids': ['bilibili 1074402_part1'],
},
'params': {'skip_download': True},
}, {
@@ -288,6 +365,7 @@ class BiliBiliIE(BilibiliBaseIE):
'view_count': int,
'description': 'md5:e3c401cf7bc363118d1783dd74068a68',
'duration': 90.314,
'_old_archive_ids': ['bilibili 498159642_part1'],
},
}],
}, {
@@ -308,28 +386,8 @@ class BiliBiliIE(BilibiliBaseIE):
'view_count': int,
'description': 'md5:e3c401cf7bc363118d1783dd74068a68',
'duration': 90.314,
'_old_archive_ids': ['bilibili 498159642_part1'],
},
}, {
'note': 'video has subtitles',
'url': 'https://www.bilibili.com/video/BV12N4y1M7rh',
'info_dict': {
'id': 'BV12N4y1M7rh',
'ext': 'mp4',
'title': 'md5:96e8bb42c2b432c0d4ce3434a61479c1',
'tags': list,
'description': 'md5:afde2b7ba9025c01d9e3dde10de221e4',
'duration': 313.557,
'upload_date': '20220709',
'uploader': '小夫太渴',
'timestamp': 1657347907,
'uploader_id': '1326814124',
'comment_count': int,
'view_count': int,
'like_count': int,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
'subtitles': 'count:2',
},
'params': {'listsubtitles': True},
}, {
'url': 'https://www.bilibili.com/video/av8903802/',
'info_dict': {
@@ -347,6 +405,7 @@ class BiliBiliIE(BilibiliBaseIE):
'comment_count': int,
'view_count': int,
'like_count': int,
'_old_archive_ids': ['bilibili 8903802_part1'],
},
'params': {
'skip_download': True,
@@ -370,6 +429,7 @@ class BiliBiliIE(BilibiliBaseIE):
'view_count': int,
'like_count': int,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
'_old_archive_ids': ['bilibili 463665680_part1'],
},
'params': {'skip_download': True},
}, {
@@ -388,8 +448,8 @@ class BiliBiliIE(BilibiliBaseIE):
'view_count': int,
'like_count': int,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
'_old_archive_ids': ['bilibili 893839363_part1'],
},
'params': {'skip_download': True},
}, {
'note': 'newer festival video',
'url': 'https://www.bilibili.com/festival/2023honkaiimpact3gala?bvid=BV1ay4y1d77f',
@@ -406,8 +466,57 @@ class BiliBiliIE(BilibiliBaseIE):
'view_count': int,
'like_count': int,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
'_old_archive_ids': ['bilibili 778246196_part1'],
},
}, {
'note': 'legacy flv/mp4 video',
'url': 'https://www.bilibili.com/video/BV1ms411Q7vw/?p=4',
'info_dict': {
'id': 'BV1ms411Q7vw_p4',
'title': '[搞笑]【动画】云南方言快乐生产线出品 p04 新烧包谷之漫游桃花岛',
'timestamp': 1458222815,
'upload_date': '20160317',
'description': '云南方言快乐生产线出品',
'duration': float,
'uploader': '一笑颠天',
'uploader_id': '3916081',
'view_count': int,
'comment_count': int,
'like_count': int,
'tags': list,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
'_old_archive_ids': ['bilibili 4120229_part4'],
},
'params': {'extractor_args': {'bilibili': {'prefer_multi_flv': ['32']}}},
'playlist_count': 19,
'playlist': [{
'info_dict': {
'id': 'BV1ms411Q7vw_p4_0',
'ext': 'flv',
'title': '[搞笑]【动画】云南方言快乐生产线出品 p04 新烧包谷之漫游桃花岛',
'duration': 399.102,
},
}],
}, {
'note': 'legacy mp4-only video',
'url': 'https://www.bilibili.com/video/BV1nx411u79K',
'info_dict': {
'id': 'BV1nx411u79K',
'ext': 'mp4',
'title': '【练习室】201603声乐练习《No Air》with VigoVan',
'timestamp': 1508893551,
'upload_date': '20171025',
'description': '@ZERO-G伯远\n声乐练习 《No Air》with Vigo Van',
'duration': 80.384,
'uploader': '伯远',
'uploader_id': '10584494',
'comment_count': int,
'view_count': int,
'like_count': int,
'tags': list,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
'_old_archive_ids': ['bilibili 15700301_part1'],
},
'params': {'skip_download': True},
}, {
'note': 'interactive/split-path video',
'url': 'https://www.bilibili.com/video/BV1af4y1H7ga/',
@@ -425,6 +534,7 @@ class BiliBiliIE(BilibiliBaseIE):
'view_count': int,
'like_count': int,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
'_old_archive_ids': ['bilibili 292734508_part1'],
},
'playlist_count': 33,
'playlist': [{
@@ -443,6 +553,7 @@ class BiliBiliIE(BilibiliBaseIE):
'view_count': int,
'like_count': int,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
'_old_archive_ids': ['bilibili 292734508_part1'],
},
}],
}, {
@@ -465,6 +576,29 @@ class BiliBiliIE(BilibiliBaseIE):
'upload_date': '20191021',
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
},
}, {
'note': 'video has subtitles, which requires login',
'url': 'https://www.bilibili.com/video/BV12N4y1M7rh',
'info_dict': {
'id': 'BV12N4y1M7rh',
'ext': 'mp4',
'title': 'md5:96e8bb42c2b432c0d4ce3434a61479c1',
'tags': list,
'description': 'md5:afde2b7ba9025c01d9e3dde10de221e4',
'duration': 313.557,
'upload_date': '20220709',
'uploader': '小夫太渴',
'timestamp': 1657347907,
'uploader_id': '1326814124',
'comment_count': int,
'view_count': int,
'like_count': int,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
'subtitles': 'count:2', # login required for CC subtitle
'_old_archive_ids': ['bilibili 898179753_part1'],
},
'params': {'listsubtitles': True},
'skip': 'login required for subtitle',
}, {
'url': 'https://www.bilibili.com/video/BV1jL41167ZG/',
'info_dict': {
@@ -489,6 +623,10 @@ class BiliBiliIE(BilibiliBaseIE):
'ext': 'mp4',
},
'skip': 'geo-restricted',
}, {
'note': 'has - in the last path segment of the url',
'url': 'https://www.bilibili.com/festival/bh3-7th?bvid=BV1tr4y1f7p2&',
'only_matching': True,
}]
def _real_extract(self, url):
@@ -498,8 +636,9 @@ class BiliBiliIE(BilibiliBaseIE):
if not self._match_valid_url(urlh.url):
return self.url_result(urlh.url)
initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id)
headers['Referer'] = url
initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id)
is_festival = 'videoData' not in initial_state
if is_festival:
video_data = initial_state['videoInfo']
@@ -548,7 +687,6 @@ class BiliBiliIE(BilibiliBaseIE):
aid = video_data.get('aid')
old_video_id = format_field(aid, None, f'%s_part{part_id or 1}')
cid = traverse_obj(video_data, ('pages', part_id - 1, 'cid')) if part_id else video_data.get('cid')
festival_info = {}
@@ -586,18 +724,65 @@ class BiliBiliIE(BilibiliBaseIE):
is_interactive = traverse_obj(video_data, ('rights', 'is_stein_gate'))
if is_interactive:
return self.playlist_result(
self._get_interactive_entries(video_id, cid, metainfo), **metainfo,
self._get_interactive_entries(video_id, cid, metainfo, headers=headers), **metainfo,
duration=traverse_obj(initial_state, ('videoData', 'duration', {int_or_none})),
__post_extractor=self.extract_comments(aid))
else:
return {
**metainfo,
'duration': float_or_none(play_info.get('timelength'), scale=1000),
'chapters': self._get_chapters(aid, cid),
'subtitles': self.extract_subtitles(video_id, cid),
'formats': self.extract_formats(play_info),
'__post_extractor': self.extract_comments(aid),
}
formats = self.extract_formats(play_info)
if not traverse_obj(play_info, ('dash')):
# we only have legacy formats and need additional work
has_qn = lambda x: x in traverse_obj(formats, (..., 'quality'))
for qn in traverse_obj(play_info, ('accept_quality', lambda _, v: not has_qn(v), {int})):
formats.extend(traverse_obj(
self.extract_formats(self._download_playinfo(video_id, cid, headers=headers, qn=qn)),
lambda _, v: not has_qn(v['quality'])))
self._check_missing_formats(play_info, formats)
flv_formats = traverse_obj(formats, lambda _, v: v['fragments'])
if flv_formats and len(flv_formats) < len(formats):
# Flv and mp4 are incompatible due to `multi_video` workaround, so drop one
if not self._configuration_arg('prefer_multi_flv'):
dropped_fmts = ', '.join(
f'{f.get("format_note")} ({f.get("format_id")})' for f in flv_formats)
formats = traverse_obj(formats, lambda _, v: not v.get('fragments'))
if dropped_fmts:
self.to_screen(
f'Dropping incompatible flv format(s) {dropped_fmts} since mp4 is available. '
'To extract flv, pass --extractor-args "bilibili:prefer_multi_flv"')
else:
formats = traverse_obj(
# XXX: Filtering by extractor-arg is for testing purposes
formats, lambda _, v: v['quality'] == int(self._configuration_arg('prefer_multi_flv')[0]),
) or [max(flv_formats, key=lambda x: x['quality'])]
if traverse_obj(formats, (0, 'fragments')):
# We have flv formats, which are individual short videos with their own timestamps and metainfo
# Binary concatenation corrupts their timestamps, so we need a `multi_video` workaround
return {
**metainfo,
'_type': 'multi_video',
'entries': [{
'id': f'{metainfo["id"]}_{idx}',
'title': metainfo['title'],
'http_headers': metainfo['http_headers'],
'formats': [{
**fragment,
'format_id': formats[0].get('format_id'),
}],
'subtitles': self.extract_subtitles(video_id, cid) if idx == 0 else None,
'__post_extractor': self.extract_comments(aid) if idx == 0 else None,
} for idx, fragment in enumerate(formats[0]['fragments'])],
'duration': float_or_none(play_info.get('timelength'), scale=1000),
}
else:
return {
**metainfo,
'formats': formats,
'duration': float_or_none(play_info.get('timelength'), scale=1000),
'chapters': self._get_chapters(aid, cid),
'subtitles': self.extract_subtitles(video_id, cid),
'__post_extractor': self.extract_comments(aid),
}
class BiliBiliBangumiIE(BilibiliBaseIE):
@@ -837,8 +1022,6 @@ class BiliBiliBangumiSeasonIE(BilibiliBaseIE):
class BilibiliCheeseBaseIE(BilibiliBaseIE):
_HEADERS = {'Referer': 'https://www.bilibili.com/'}
def _extract_episode(self, season_info, ep_id):
episode_info = traverse_obj(season_info, (
'episodes', lambda _, v: v['id'] == int(ep_id)), get_all=False)
@@ -968,7 +1151,7 @@ class BilibiliCheeseSeasonIE(BilibiliCheeseBaseIE):
}))
class BilibiliSpaceBaseIE(InfoExtractor):
class BilibiliSpaceBaseIE(BilibiliBaseIE):
def _extract_playlist(self, fetch_page, get_metadata, get_entries):
first_page = fetch_page(0)
metadata = get_metadata(first_page)
@@ -988,73 +1171,53 @@ class BilibiliSpaceVideoIE(BilibiliSpaceBaseIE):
'id': '3985676',
},
'playlist_mincount': 178,
'skip': 'login required',
}, {
'url': 'https://space.bilibili.com/313580179/video',
'info_dict': {
'id': '313580179',
},
'playlist_mincount': 92,
'skip': 'login required',
}]
def _extract_signature(self, playlist_id):
session_data = self._download_json('https://api.bilibili.com/x/web-interface/nav', playlist_id, fatal=False)
key_from_url = lambda x: x[x.rfind('/') + 1:].split('.')[0]
img_key = traverse_obj(
session_data, ('data', 'wbi_img', 'img_url', {key_from_url})) or '34478ba821254d9d93542680e3b86100'
sub_key = traverse_obj(
session_data, ('data', 'wbi_img', 'sub_url', {key_from_url})) or '7e16a90d190a4355a78fd00b32a38de6'
session_key = img_key + sub_key
signature_values = []
for position in (
46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35, 27, 43, 5, 49, 33, 9, 42, 19, 29, 28, 14, 39,
12, 38, 41, 13, 37, 48, 7, 16, 24, 55, 40, 61, 26, 17, 0, 1, 60, 51, 30, 4, 22, 25, 54, 21, 56, 59, 6, 63,
57, 62, 11, 36, 20, 34, 44, 52,
):
char_at_position = try_call(lambda: session_key[position])
if char_at_position:
signature_values.append(char_at_position)
return ''.join(signature_values)[:32]
def _real_extract(self, url):
playlist_id, is_video_url = self._match_valid_url(url).group('id', 'video')
if not is_video_url:
self.to_screen('A channel URL was given. Only the channel\'s videos will be downloaded. '
'To download audios, add a "/audio" to the URL')
signature = self._extract_signature(playlist_id)
def fetch_page(page_idx):
query = {
'keyword': '',
'mid': playlist_id,
'order': 'pubdate',
'order': traverse_obj(parse_qs(url), ('order', 0)) or 'pubdate',
'order_avoided': 'true',
'platform': 'web',
'pn': page_idx + 1,
'ps': 30,
'tid': 0,
'web_location': 1550101,
'wts': int(time.time()),
}
query['w_rid'] = hashlib.md5(f'{urllib.parse.urlencode(query)}{signature}'.encode()).hexdigest()
try:
response = self._download_json('https://api.bilibili.com/x/space/wbi/arc/search',
playlist_id, note=f'Downloading page {page_idx}', query=query,
headers={'referer': url})
response = self._download_json(
'https://api.bilibili.com/x/space/wbi/arc/search', playlist_id,
query=self._sign_wbi(query, playlist_id),
note=f'Downloading space page {page_idx}', headers={'Referer': url})
except ExtractorError as e:
if isinstance(e.cause, HTTPError) and e.cause.status == 412:
raise ExtractorError(
'Request is blocked by server (412), please add cookies, wait and try later.', expected=True)
raise
if response['code'] in (-352, -401):
status_code = response['code']
if status_code == -401:
raise ExtractorError(
f'Request is blocked by server ({-response["code"]}), '
'please add cookies, wait and try later.', expected=True)
'Request is blocked by server (401), please add cookies, wait and try later.', expected=True)
elif status_code == -352 and not self.is_logged_in:
self.raise_login_required('Request is rejected, you need to login to access playlist')
elif status_code != 0:
raise ExtractorError(f'Request failed ({status_code}): {response.get("message") or "Unknown error"}')
return response['data']
def get_metadata(page_data):
@@ -1280,7 +1443,10 @@ class BilibiliWatchlaterIE(BilibiliSpaceListBaseIE):
_VALID_URL = r'https?://(?:www\.)?bilibili\.com/watchlater/?(?:[?#]|$)'
_TESTS = [{
'url': 'https://www.bilibili.com/watchlater/#/list',
'info_dict': {'id': 'watchlater'},
'info_dict': {
'id': r're:\d+',
'title': '稍后再看',
},
'playlist_mincount': 0,
'skip': 'login required',
}]
@@ -1356,14 +1522,19 @@ class BilibiliPlaylistIE(BilibiliSpaceListBaseIE):
'skip': 'redirect url',
}, {
'url': 'https://www.bilibili.com/list/watchlater',
'info_dict': {'id': 'watchlater'},
'info_dict': {
'id': r're:2_\d+',
'title': '稍后再看',
'uploader': str,
'uploader_id': str,
},
'playlist_mincount': 0,
'skip': 'login required',
}, {
'url': 'https://www.bilibili.com/medialist/play/watchlater',
'info_dict': {'id': 'watchlater'},
'playlist_mincount': 0,
'skip': 'login required',
'skip': 'redirect url & login required',
}]
def _extract_medialist(self, query, list_id):
@@ -1414,7 +1585,7 @@ class BilibiliPlaylistIE(BilibiliSpaceListBaseIE):
'title': ('title', {str}),
'uploader': ('upper', 'name', {str}),
'uploader_id': ('upper', 'mid', {str_or_none}),
'timestamp': ('ctime', {int_or_none}),
'timestamp': ('ctime', {int_or_none}, filter),
'thumbnail': ('cover', {url_or_none}),
})),
}
@@ -1680,7 +1851,7 @@ class BiliBiliPlayerIE(InfoExtractor):
class BiliIntlBaseIE(InfoExtractor):
_API_URL = 'https://api.bilibili.tv/intl/gateway'
_NETRC_MACHINE = 'biliintl'
_HEADERS = {'Referer': 'https://www.bilibili.com/'}
_HEADERS = {'Referer': 'https://www.bilibili.tv/'}
def _call_api(self, endpoint, *args, **kwargs):
json = self._download_json(self._API_URL + endpoint, *args, **kwargs)
@@ -1808,7 +1979,8 @@ class BiliIntlBaseIE(InfoExtractor):
public_key = Cryptodome.RSA.importKey(key_data['key'])
password_hash = Cryptodome.PKCS1_v1_5.new(public_key).encrypt((key_data['hash'] + password).encode())
login_post = self._download_json(
'https://passport.bilibili.tv/x/intl/passport-login/web/login/password?lang=en-US', None, data=urlencode_postdata({
'https://passport.bilibili.tv/x/intl/passport-login/web/login/password?lang=en-US', None,
data=urlencode_postdata({
'username': username,
'password': base64.b64encode(password_hash).decode('ascii'),
'keep_me': 'true',
@@ -2140,7 +2312,8 @@ class BiliIntlSeriesIE(BiliIntlBaseIE):
def _real_extract(self, url):
series_id = self._match_id(url)
series_info = self._call_api(f'/web/v2/ogv/play/season_info?season_id={series_id}&platform=web', series_id).get('season') or {}
series_info = self._call_api(
f'/web/v2/ogv/play/season_info?season_id={series_id}&platform=web', series_id).get('season') or {}
return self.playlist_result(
self._entries(series_id), series_id, series_info.get('title'), series_info.get('description'),
categories=traverse_obj(series_info, ('styles', ..., 'title'), expected_type=str_or_none),

View File

@@ -24,7 +24,7 @@ from ..utils import (
class BitChuteIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P<id>[^/?#&]+)'
_VALID_URL = r'https?://(?:(?:www|old)\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P<id>[^/?#&]+)'
_EMBED_REGEX = [rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{_VALID_URL})']
_TESTS = [{
'url': 'https://www.bitchute.com/video/UGlrF9o9b-Q/',
@@ -91,6 +91,9 @@ class BitChuteIE(InfoExtractor):
}, {
'url': 'https://www.bitchute.com/torrent/Zee5BE49045h/szoMrox2JEI.webtorrent',
'only_matching': True,
}, {
'url': 'https://old.bitchute.com/video/UGlrF9o9b-Q/',
'only_matching': True,
}]
_GEO_BYPASS = False
@@ -132,7 +135,7 @@ class BitChuteIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(
f'https://www.bitchute.com/video/{video_id}', video_id, headers=self._HEADERS)
f'https://old.bitchute.com/video/{video_id}', video_id, headers=self._HEADERS)
self._raise_if_restricted(webpage)
publish_date = clean_html(get_element_by_class('video-publish-date', webpage))
@@ -171,13 +174,13 @@ class BitChuteIE(InfoExtractor):
class BitChuteChannelIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?P<type>channel|playlist)/(?P<id>[^/?#&]+)'
_VALID_URL = r'https?://(?:(?:www|old)\.)?bitchute\.com/(?P<type>channel|playlist)/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://www.bitchute.com/channel/bitchute/',
'info_dict': {
'id': 'bitchute',
'title': 'BitChute',
'description': 'md5:5329fb3866125afa9446835594a9b138',
'description': 'md5:2134c37d64fc3a4846787c402956adac',
},
'playlist': [
{
@@ -210,6 +213,9 @@ class BitChuteChannelIE(InfoExtractor):
'title': 'Bruce MacDonald and "The Light of Darkness"',
'description': 'md5:747724ef404eebdfc04277714f81863e',
},
}, {
'url': 'https://old.bitchute.com/playlist/wV9Imujxasw9/',
'only_matching': True,
}]
_TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7'
@@ -230,7 +236,7 @@ class BitChuteChannelIE(InfoExtractor):
@staticmethod
def _make_url(playlist_id, playlist_type):
return f'https://www.bitchute.com/{playlist_type}/{playlist_id}/'
return f'https://old.bitchute.com/{playlist_type}/{playlist_id}/'
def _fetch_page(self, playlist_id, playlist_type, page_num):
playlist_url = self._make_url(playlist_id, playlist_type)

388
yt_dlp/extractor/bluesky.py Normal file
View File

@@ -0,0 +1,388 @@
from .common import InfoExtractor
from ..utils import (
ExtractorError,
format_field,
int_or_none,
mimetype2ext,
orderedSet,
parse_iso8601,
truncate_string,
update_url_query,
url_basename,
url_or_none,
variadic,
)
from ..utils.traversal import traverse_obj
class BlueskyIE(InfoExtractor):
_VALID_URL = [
r'https?://(?:www\.)?(?:bsky\.app|main\.bsky\.dev)/profile/(?P<handle>[\w.:%-]+)/post/(?P<id>\w+)',
r'at://(?P<handle>[\w.:%-]+)/app\.bsky\.feed\.post/(?P<id>\w+)',
]
_TESTS = [{
'url': 'https://bsky.app/profile/blu3blue.bsky.social/post/3l4omssdl632g',
'md5': '375539c1930ab05d15585ed772ab54fd',
'info_dict': {
'id': '3l4omssdl632g',
'ext': 'mp4',
'uploader': 'Blu3Blu3Lilith',
'uploader_id': 'blu3blue.bsky.social',
'uploader_url': 'https://bsky.app/profile/blu3blue.bsky.social',
'channel_id': 'did:plc:pzdr5ylumf7vmvwasrpr5bf2',
'channel_url': 'https://bsky.app/profile/did:plc:pzdr5ylumf7vmvwasrpr5bf2',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'title': 'OMG WE HAVE VIDEOS NOW',
'description': 'OMG WE HAVE VIDEOS NOW',
'upload_date': '20240921',
'timestamp': 1726940605,
'like_count': int,
'repost_count': int,
'comment_count': int,
'tags': [],
},
}, {
'url': 'https://bsky.app/profile/bsky.app/post/3l3vgf77uco2g',
'md5': 'b9e344fdbce9f2852c668a97efefb105',
'info_dict': {
'id': '3l3vgf77uco2g',
'ext': 'mp4',
'uploader': 'Bluesky',
'uploader_id': 'bsky.app',
'uploader_url': 'https://bsky.app/profile/bsky.app',
'channel_id': 'did:plc:z72i7hdynmk6r22z27h6tvur',
'channel_url': 'https://bsky.app/profile/did:plc:z72i7hdynmk6r22z27h6tvur',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'title': 'Bluesky now has video! Update your app to versi...',
'alt_title': 'Bluesky video feature announcement',
'description': r're:(?s)Bluesky now has video! .{239}',
'upload_date': '20240911',
'timestamp': 1726074716,
'like_count': int,
'repost_count': int,
'comment_count': int,
'tags': [],
'subtitles': {
'en': 'mincount:1',
},
},
}, {
'url': 'https://main.bsky.dev/profile/souris.moe/post/3l4qhp7bcs52c',
'md5': '5f2df8c200b5633eb7fb2c984d29772f',
'info_dict': {
'id': '3l4qhp7bcs52c',
'ext': 'mp4',
'uploader': 'souris',
'uploader_id': 'souris.moe',
'uploader_url': 'https://bsky.app/profile/souris.moe',
'channel_id': 'did:plc:tj7g244gl5v6ai6cm4f4wlqp',
'channel_url': 'https://bsky.app/profile/did:plc:tj7g244gl5v6ai6cm4f4wlqp',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'title': 'Bluesky video #3l4qhp7bcs52c',
'upload_date': '20240922',
'timestamp': 1727003838,
'like_count': int,
'repost_count': int,
'comment_count': int,
'tags': [],
},
}, {
'url': 'https://bsky.app/profile/de1.pds.tentacle.expert/post/3l3w4tnezek2e',
'md5': '1af9c7fda061cf7593bbffca89e43d1c',
'info_dict': {
'id': '3l3w4tnezek2e',
'ext': 'mp4',
'uploader': 'clean',
'uploader_id': 'de1.pds.tentacle.expert',
'uploader_url': 'https://bsky.app/profile/de1.pds.tentacle.expert',
'channel_id': 'did:web:de1.tentacle.expert',
'channel_url': 'https://bsky.app/profile/did:web:de1.tentacle.expert',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'title': 'Bluesky video #3l3w4tnezek2e',
'upload_date': '20240911',
'timestamp': 1726098823,
'like_count': int,
'repost_count': int,
'comment_count': int,
'tags': [],
},
}, {
'url': 'https://bsky.app/profile/yunayuispink.bsky.social/post/3l7gqcfes742o',
'info_dict': {
'id': 'XxK3t_5V3ao',
'ext': 'mp4',
'uploader': 'yunayu',
'uploader_id': '@yunayuispink',
'uploader_url': 'https://www.youtube.com/@yunayuispink',
'channel': 'yunayu',
'channel_id': 'UCPLvXnHa7lTyNoR_dGsU14w',
'channel_url': 'https://www.youtube.com/channel/UCPLvXnHa7lTyNoR_dGsU14w',
'thumbnail': 'https://i.ytimg.com/vi_webp/XxK3t_5V3ao/maxresdefault.webp',
'description': r're:Have a good goodx10000day',
'title': '5min vs 5hours drawing',
'availability': 'public',
'live_status': 'not_live',
'playable_in_embed': True,
'upload_date': '20241026',
'timestamp': 1729967784,
'duration': 321,
'age_limit': 0,
'like_count': int,
'view_count': int,
'comment_count': int,
'channel_follower_count': int,
'categories': ['Entertainment'],
'tags': [],
},
'add_ie': ['Youtube'],
}, {
'url': 'https://bsky.app/profile/endshark.bsky.social/post/3jzxjkcemae2m',
'info_dict': {
'id': '222792849',
'ext': 'mp3',
'uploader': 'LASERBAT',
'uploader_id': 'laserbatx',
'uploader_url': 'https://laserbatx.bandcamp.com',
'artists': ['LASERBAT'],
'album_artists': ['LASERBAT'],
'album': 'Hari Nezumi [EP]',
'track': 'Forward to the End',
'title': 'LASERBAT - Forward to the End',
'thumbnail': 'https://f4.bcbits.com/img/a2507705510_5.jpg',
'duration': 228.571,
'track_id': '222792849',
'release_date': '20230423',
'upload_date': '20230423',
'timestamp': 1682276040.0,
'release_timestamp': 1682276040.0,
'track_number': 1,
},
'add_ie': ['Bandcamp'],
}, {
'url': 'https://bsky.app/profile/dannybhoix.bsky.social/post/3l6oe5mtr2c2j',
'md5': 'b9e344fdbce9f2852c668a97efefb105',
'info_dict': {
'id': '3l3vgf77uco2g',
'ext': 'mp4',
'uploader': 'Bluesky',
'uploader_id': 'bsky.app',
'uploader_url': 'https://bsky.app/profile/bsky.app',
'channel_id': 'did:plc:z72i7hdynmk6r22z27h6tvur',
'channel_url': 'https://bsky.app/profile/did:plc:z72i7hdynmk6r22z27h6tvur',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'title': 'Bluesky now has video! Update your app to versi...',
'alt_title': 'Bluesky video feature announcement',
'description': r're:(?s)Bluesky now has video! .{239}',
'upload_date': '20240911',
'timestamp': 1726074716,
'like_count': int,
'repost_count': int,
'comment_count': int,
'tags': [],
'subtitles': {
'en': 'mincount:1',
},
},
}, {
'url': 'https://bsky.app/profile/alt.bun.how/post/3l7rdfxhyds2f',
'md5': '8775118b235cf9fa6b5ad30f95cda75c',
'info_dict': {
'id': '3l7rdfxhyds2f',
'ext': 'mp4',
'uploader': 'cinnamon',
'uploader_id': 'alt.bun.how',
'uploader_url': 'https://bsky.app/profile/alt.bun.how',
'channel_id': 'did:plc:7x6rtuenkuvxq3zsvffp2ide',
'channel_url': 'https://bsky.app/profile/did:plc:7x6rtuenkuvxq3zsvffp2ide',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'title': 'crazy that i look like this tbh',
'description': 'crazy that i look like this tbh',
'upload_date': '20241030',
'timestamp': 1730332128,
'like_count': int,
'repost_count': int,
'comment_count': int,
'tags': ['sexual'],
'age_limit': 18,
},
}, {
'url': 'at://did:plc:ia76kvnndjutgedggx2ibrem/app.bsky.feed.post/3l6zrz6zyl2dr',
'md5': '71b0eb6d85d03145e6af6642c7fc6d78',
'info_dict': {
'id': '3l6zrz6zyl2dr',
'ext': 'mp4',
'uploader': 'mary🐇',
'uploader_id': 'mary.my.id',
'uploader_url': 'https://bsky.app/profile/mary.my.id',
'channel_id': 'did:plc:ia76kvnndjutgedggx2ibrem',
'channel_url': 'https://bsky.app/profile/did:plc:ia76kvnndjutgedggx2ibrem',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'title': 'Bluesky video #3l6zrz6zyl2dr',
'upload_date': '20241021',
'timestamp': 1729523172,
'like_count': int,
'repost_count': int,
'comment_count': int,
'tags': [],
},
}, {
'url': 'https://bsky.app/profile/purpleicetea.bsky.social/post/3l7gv55dc2o2w',
'info_dict': {
'id': '3l7gv55dc2o2w',
},
'playlist': [{
'info_dict': {
'id': '3l7gv55dc2o2w',
'ext': 'mp4',
'upload_date': '20241026',
'description': 'One of my favorite videos',
'comment_count': int,
'uploader_url': 'https://bsky.app/profile/purpleicetea.bsky.social',
'uploader': 'Purple.Ice.Tea',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'channel_url': 'https://bsky.app/profile/did:plc:bjh5ffwya5f53dfy47dezuwx',
'like_count': int,
'channel_id': 'did:plc:bjh5ffwya5f53dfy47dezuwx',
'repost_count': int,
'timestamp': 1729973202,
'tags': [],
'uploader_id': 'purpleicetea.bsky.social',
'title': 'One of my favorite videos',
},
}, {
'info_dict': {
'id': '3l77u64l7le2e',
'ext': 'mp4',
'title': 'hearing people on twitter say that bluesky isn\'...',
'like_count': int,
'uploader_id': 'thafnine.net',
'uploader_url': 'https://bsky.app/profile/thafnine.net',
'upload_date': '20241024',
'channel_url': 'https://bsky.app/profile/did:plc:6ttyq36rhiyed7wu3ws7dmqj',
'description': r're:(?s)hearing people on twitter say that bluesky .{93}',
'tags': [],
'alt_title': 'md5:9b1ee1937fb3d1a81e932f9ec14d560e',
'uploader': 'T9',
'channel_id': 'did:plc:6ttyq36rhiyed7wu3ws7dmqj',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'timestamp': 1729731642,
'comment_count': int,
'repost_count': int,
},
}],
}]
_BLOB_URL_TMPL = '{}/xrpc/com.atproto.sync.getBlob'
def _get_service_endpoint(self, did, video_id):
if did.startswith('did:web:'):
url = f'https://{did[8:]}/.well-known/did.json'
else:
url = f'https://plc.directory/{did}'
services = self._download_json(
url, video_id, 'Fetching service endpoint', 'Falling back to bsky.social', fatal=False)
return traverse_obj(
services, ('service', lambda _, x: x['type'] == 'AtprotoPersonalDataServer',
'serviceEndpoint', {url_or_none}, any)) or 'https://bsky.social'
def _real_extract(self, url):
handle, video_id = self._match_valid_url(url).group('handle', 'id')
post = self._download_json(
'https://public.api.bsky.app/xrpc/app.bsky.feed.getPostThread',
video_id, query={
'uri': f'at://{handle}/app.bsky.feed.post/{video_id}',
'depth': 0,
'parentHeight': 0,
})['thread']['post']
entries = []
# app.bsky.embed.video.view/app.bsky.embed.external.view
entries.extend(self._extract_videos(post, video_id))
# app.bsky.embed.recordWithMedia.view
entries.extend(self._extract_videos(
post, video_id, embed_path=('embed', 'media'), record_subpath=('embed', 'media')))
# app.bsky.embed.record.view
if nested_post := traverse_obj(post, ('embed', 'record', ('record', None), {dict}, any)):
entries.extend(self._extract_videos(
nested_post, video_id, embed_path=('embeds', 0), record_path='value'))
if not entries:
raise ExtractorError('No video could be found in this post', expected=True)
if len(entries) == 1:
return entries[0]
return self.playlist_result(entries, video_id)
@staticmethod
def _build_profile_url(path):
return format_field(path, None, 'https://bsky.app/profile/%s', default=None)
def _extract_videos(self, root, video_id, embed_path='embed', record_path='record', record_subpath='embed'):
embed_path = variadic(embed_path, (str, bytes, dict, set))
record_path = variadic(record_path, (str, bytes, dict, set))
record_subpath = variadic(record_subpath, (str, bytes, dict, set))
entries = []
if external_uri := traverse_obj(root, (
((*record_path, *record_subpath), embed_path), 'external', 'uri', {url_or_none}, any)):
entries.append(self.url_result(external_uri))
if playlist := traverse_obj(root, (*embed_path, 'playlist', {url_or_none})):
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
playlist, video_id, 'mp4', m3u8_id='hls', fatal=False)
else:
return entries
video_cid = traverse_obj(
root, (*embed_path, 'cid', {str}),
(*record_path, *record_subpath, 'video', 'ref', '$link', {str}))
did = traverse_obj(root, ('author', 'did', {str}))
if did and video_cid:
endpoint = self._get_service_endpoint(did, video_id)
formats.append({
'format_id': 'blob',
'url': update_url_query(
self._BLOB_URL_TMPL.format(endpoint), {'did': did, 'cid': video_cid}),
**traverse_obj(root, (*embed_path, 'aspectRatio', {
'width': ('width', {int_or_none}),
'height': ('height', {int_or_none}),
})),
**traverse_obj(root, (*record_path, *record_subpath, 'video', {
'filesize': ('size', {int_or_none}),
'ext': ('mimeType', {mimetype2ext}),
})),
})
for sub_data in traverse_obj(root, (
*record_path, *record_subpath, 'captions', lambda _, v: v['file']['ref']['$link'])):
subtitles.setdefault(sub_data.get('lang') or 'und', []).append({
'url': update_url_query(
self._BLOB_URL_TMPL.format(endpoint), {'did': did, 'cid': sub_data['file']['ref']['$link']}),
'ext': traverse_obj(sub_data, ('file', 'mimeType', {mimetype2ext})),
})
entries.append({
'id': video_id,
'formats': formats,
'subtitles': subtitles,
**traverse_obj(root, {
'id': ('uri', {url_basename}),
'thumbnail': (*embed_path, 'thumbnail', {url_or_none}),
'alt_title': (*embed_path, 'alt', {str}, filter),
'uploader': ('author', 'displayName', {str}),
'uploader_id': ('author', 'handle', {str}),
'uploader_url': ('author', 'handle', {self._build_profile_url}),
'channel_id': ('author', 'did', {str}),
'channel_url': ('author', 'did', {self._build_profile_url}),
'like_count': ('likeCount', {int_or_none}),
'repost_count': ('repostCount', {int_or_none}),
'comment_count': ('replyCount', {int_or_none}),
'timestamp': ('indexedAt', {parse_iso8601}),
'tags': ('labels', ..., 'val', {str}, all, {orderedSet}),
'age_limit': (
'labels', ..., 'val', {lambda x: 18 if x in ('sexual', 'porn', 'graphic-media') else None}, any),
'description': (*record_path, 'text', {str}, filter),
'title': (*record_path, 'text', {lambda x: x.replace('\n', ' ')}, {truncate_string(left=50)}),
}),
})
return entries

View File

@@ -12,7 +12,7 @@ from ..utils.traversal import traverse_obj
class BoxIE(InfoExtractor):
_VALID_URL = r'https?://(?:[^.]+\.)?app\.box\.com/s/(?P<shared_name>[^/?#]+)(?:/file/(?P<id>\d+))?'
_VALID_URL = r'https?://(?:[^.]+\.)?(?P<service>app|ent)\.box\.com/s/(?P<shared_name>[^/?#]+)(?:/file/(?P<id>\d+))?'
_TESTS = [{
'url': 'https://mlssoccer.app.box.com/s/0evd2o3e08l60lr4ygukepvnkord1o1x/file/510727257538',
'md5': '1f81b2fd3960f38a40a3b8823e5fcd43',
@@ -38,10 +38,22 @@ class BoxIE(InfoExtractor):
'uploader_id': '239068974',
},
'params': {'skip_download': 'dash fragment too small'},
}, {
'url': 'https://thejacksonlaboratory.ent.box.com/s/2x09dm6vcg6y28o0oox1so4l0t8wzt6l/file/1536173056065',
'info_dict': {
'id': '1536173056065',
'ext': 'mp4',
'uploader_id': '18523128264',
'uploader': 'Lexi Hennigan',
'title': 'iPSC Symposium recording part 1.mp4',
'timestamp': 1716228343,
'upload_date': '20240520',
},
'params': {'skip_download': 'dash fragment too small'},
}]
def _real_extract(self, url):
shared_name, file_id = self._match_valid_url(url).groups()
shared_name, file_id, service = self._match_valid_url(url).group('shared_name', 'id', 'service')
webpage = self._download_webpage(url, file_id or shared_name)
if not file_id:
@@ -57,14 +69,14 @@ class BoxIE(InfoExtractor):
request_token = self._search_json(
r'Box\.config\s*=', webpage, 'Box config', file_id)['requestToken']
access_token = self._download_json(
'https://app.box.com/app-api/enduserapp/elements/tokens', file_id,
f'https://{service}.box.com/app-api/enduserapp/elements/tokens', file_id,
'Downloading token JSON metadata',
data=json.dumps({'fileIDs': [file_id]}).encode(), headers={
'Content-Type': 'application/json',
'X-Request-Token': request_token,
'X-Box-EndUser-API': 'sharedName=' + shared_name,
})[file_id]['read']
shared_link = 'https://app.box.com/s/' + shared_name
shared_link = f'https://{service}.box.com/s/{shared_name}'
f = self._download_json(
'https://api.box.com/2.0/files/' + file_id, file_id,
'Downloading file JSON metadata', headers={

View File

@@ -1,35 +1,20 @@
import functools
import re
from .common import InfoExtractor
from ..utils import (
clean_html,
extract_attributes,
get_element_text_and_html_by_tag,
get_elements_by_class,
join_nonempty,
js_to_json,
mimetype2ext,
unified_strdate,
url_or_none,
urljoin,
variadic,
)
from ..utils.traversal import traverse_obj
def html_get_element(tag=None, cls=None):
assert tag or cls, 'One of tag or class is required'
if cls:
func = functools.partial(get_elements_by_class, cls, tag=tag)
else:
func = functools.partial(get_element_text_and_html_by_tag, tag)
def html_get_element_wrapper(html):
return variadic(func(html))[0]
return html_get_element_wrapper
from ..utils.traversal import (
find_element,
traverse_obj,
)
class BpbIE(InfoExtractor):
@@ -41,12 +26,12 @@ class BpbIE(InfoExtractor):
'info_dict': {
'id': '297',
'ext': 'mp4',
'creator': 'Kooperative Berlin',
'description': 'md5:f4f75885ba009d3e2b156247a8941ce6',
'release_date': '20160115',
'creators': ['Kooperative Berlin'],
'description': r're:Joachim Gauck, .*\n\nKamera: .*',
'release_date': '20150716',
'series': 'Interview auf dem Geschichtsforum 1989 | 2009',
'tags': ['Friedliche Revolution', 'Erinnerungskultur', 'Vergangenheitspolitik', 'DDR 1949 - 1990', 'Freiheitsrecht', 'BStU', 'Deutschland'],
'thumbnail': 'https://www.bpb.de/cache/images/7/297_teaser_16x9_1240.jpg?8839D',
'tags': [],
'thumbnail': r're:https?://www\.bpb\.de/cache/images/7/297_teaser_16x9_1240\.jpg.*',
'title': 'Joachim Gauck zu 1989 und die Erinnerung an die DDR',
'uploader': 'Bundeszentrale für politische Bildung',
},
@@ -55,11 +40,12 @@ class BpbIE(InfoExtractor):
'info_dict': {
'id': '522184',
'ext': 'mp4',
'creator': 'Institute for Strategic Dialogue Germany gGmbH (ISD)',
'creators': ['Institute for Strategic Dialogue Germany gGmbH (ISD)'],
'description': 'md5:f83c795ff8f825a69456a9e51fc15903',
'release_date': '20230621',
'tags': ['Desinformation', 'Ukraine', 'Russland', 'Geflüchtete'],
'thumbnail': 'https://www.bpb.de/cache/images/4/522184_teaser_16x9_1240.png?EABFB',
'series': 'Narrative über den Krieg Russlands gegen die Ukraine (NUK)',
'tags': [],
'thumbnail': r're:https://www\.bpb\.de/cache/images/4/522184_teaser_16x9_1240\.png.*',
'title': 'md5:9b01ccdbf58dbf9e5c9f6e771a803b1c',
'uploader': 'Bundeszentrale für politische Bildung',
},
@@ -68,11 +54,12 @@ class BpbIE(InfoExtractor):
'info_dict': {
'id': '518789',
'ext': 'mp4',
'creator': 'Institute for Strategic Dialogue Germany gGmbH (ISD)',
'creators': ['Institute for Strategic Dialogue Germany gGmbH (ISD)'],
'description': 'md5:85228aed433e84ff0ff9bc582abd4ea8',
'release_date': '20230302',
'tags': ['Desinformation', 'Ukraine', 'Russland', 'Geflüchtete'],
'thumbnail': 'https://www.bpb.de/cache/images/9/518789_teaser_16x9_1240.jpeg?56D0D',
'series': 'Narrative über den Krieg Russlands gegen die Ukraine (NUK)',
'tags': [],
'thumbnail': r're:https://www\.bpb\.de/cache/images/9/518789_teaser_16x9_1240\.jpeg.*',
'title': 'md5:3e956f264bb501f6383f10495a401da4',
'uploader': 'Bundeszentrale für politische Bildung',
},
@@ -84,12 +71,12 @@ class BpbIE(InfoExtractor):
'info_dict': {
'id': '315813',
'ext': 'mp3',
'creator': 'Axel Schröder',
'creators': ['Axel Schröder'],
'description': 'md5:eda9d1af34e5912efef5baf54fba4427',
'release_date': '20200921',
'series': 'Auf Endlagersuche. Der deutsche Weg zu einem sicheren Atommülllager',
'tags': ['Atomenergie', 'Endlager', 'hoch-radioaktiver Abfall', 'Endlagersuche', 'Atommüll', 'Atomendlager', 'Gorleben', 'Deutschland'],
'thumbnail': 'https://www.bpb.de/cache/images/3/315813_teaser_16x9_1240.png?92A94',
'thumbnail': r're:https://www\.bpb\.de/cache/images/3/315813_teaser_16x9_1240\.png.*',
'title': 'Folge 1: Eine Einführung',
'uploader': 'Bundeszentrale für politische Bildung',
},
@@ -98,12 +85,12 @@ class BpbIE(InfoExtractor):
'info_dict': {
'id': '517806',
'ext': 'mp3',
'creator': 'Bundeszentrale für politische Bildung',
'creators': ['Bundeszentrale für politische Bildung'],
'description': 'md5:594689600e919912aade0b2871cc3fed',
'release_date': '20230127',
'series': 'Vorträge des Fachtags "Modernisierer. Grenzgänger. Anstifter. Sechs Jahrzehnte \'Neue Rechte\'"',
'tags': ['Rechtsextremismus', 'Konservatismus', 'Konservativismus', 'neue Rechte', 'Rechtspopulismus', 'Schnellroda', 'Deutschland'],
'thumbnail': 'https://www.bpb.de/cache/images/6/517806_teaser_16x9_1240.png?7A7A0',
'thumbnail': r're:https://www\.bpb\.de/cache/images/6/517806_teaser_16x9_1240\.png.*',
'title': 'Die Weltanschauung der "Neuen Rechten"',
'uploader': 'Bundeszentrale für politische Bildung',
},
@@ -147,7 +134,7 @@ class BpbIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title_result = traverse_obj(webpage, ({html_get_element(cls='opening-header__title')}, {self._TITLE_RE.match}))
title_result = traverse_obj(webpage, ({find_element(cls='opening-header__title')}, {self._TITLE_RE.match}))
json_lds = list(self._yield_json_ld(webpage, video_id, fatal=False))
return {
@@ -156,15 +143,15 @@ class BpbIE(InfoExtractor):
# This metadata could be interpreted otherwise, but it fits "series" the most
'series': traverse_obj(title_result, ('series', {str.strip})) or None,
'description': join_nonempty(*traverse_obj(webpage, [(
{html_get_element(cls='opening-intro')},
[{html_get_element(tag='bpb-accordion-item')}, {html_get_element(cls='text-content')}],
{find_element(cls='opening-intro')},
[{find_element(tag='bpb-accordion-item')}, {find_element(cls='text-content')}],
), {clean_html}]), delim='\n\n') or None,
'creator': self._html_search_meta('author', webpage),
'creators': traverse_obj(self._html_search_meta('author', webpage), all),
'uploader': self._html_search_meta('publisher', webpage),
'release_date': unified_strdate(self._html_search_meta('date', webpage)),
'tags': traverse_obj(json_lds, (..., 'keywords', {lambda x: x.split(',')}, ...)),
**traverse_obj(self._parse_vue_attributes('bpb-player', webpage, video_id), {
'formats': (':sources', ..., {self._process_source}),
'thumbnail': ('poster', {lambda x: urljoin(url, x)}),
'thumbnail': ('poster', {urljoin(url)}),
}),
}

View File

@@ -145,10 +145,9 @@ class BravoTVIE(AdobePassIE):
tp_metadata = self._download_json(
update_url_query(tp_url, {'format': 'preview'}), video_id, fatal=False)
seconds_or_none = lambda x: float_or_none(x, 1000)
chapters = traverse_obj(tp_metadata, ('chapters', ..., {
'start_time': ('startTime', {seconds_or_none}),
'end_time': ('endTime', {seconds_or_none}),
'start_time': ('startTime', {float_or_none(scale=1000)}),
'end_time': ('endTime', {float_or_none(scale=1000)}),
}))
# prune pointless single chapters that span the entire duration from short videos
if len(chapters) == 1 and not traverse_obj(chapters, (0, 'end_time')):
@@ -168,8 +167,8 @@ class BravoTVIE(AdobePassIE):
**merge_dicts(traverse_obj(tp_metadata, {
'title': 'title',
'description': 'description',
'duration': ('duration', {seconds_or_none}),
'timestamp': ('pubDate', {seconds_or_none}),
'duration': ('duration', {float_or_none(scale=1000)}),
'timestamp': ('pubDate', {float_or_none(scale=1000)}),
'season_number': (('pl1$seasonNumber', 'nbcu$seasonNumber'), {int_or_none}),
'episode_number': (('pl1$episodeNumber', 'nbcu$episodeNumber'), {int_or_none}),
'series': (('pl1$show', 'nbcu$show'), (None, ...), {str}),

View File

@@ -18,6 +18,7 @@ from ..utils import (
fix_xml_ampersands,
float_or_none,
int_or_none,
join_nonempty,
js_to_json,
mimetype2ext,
parse_iso8601,
@@ -386,7 +387,7 @@ class BrightcoveLegacyIE(InfoExtractor):
@classmethod
def _make_brightcove_url(cls, params):
return update_url_query(
'http://c.brightcove.com/services/viewer/htmlFederated', params)
'https://c.brightcove.com/services/viewer/htmlFederated', params)
@classmethod
def _extract_brightcove_url(cls, webpage):
@@ -470,7 +471,7 @@ class BrightcoveLegacyIE(InfoExtractor):
if referer:
headers['Referer'] = referer
player_page = self._download_webpage(
'http://link.brightcove.com/services/player/bcpid' + player_id[0],
'https://link.brightcove.com/services/player/bcpid' + player_id[0],
video_id, headers=headers, fatal=False)
if player_page:
player_key = self._search_regex(
@@ -480,7 +481,7 @@ class BrightcoveLegacyIE(InfoExtractor):
enc_pub_id = player_key.split(',')[1].replace('~', '=')
publisher_id = struct.unpack('>Q', base64.urlsafe_b64decode(enc_pub_id))[0]
if publisher_id:
brightcove_new_url = f'http://players.brightcove.net/{publisher_id}/default_default/index.html?videoId={video_id}'
brightcove_new_url = f'https://players.brightcove.net/{publisher_id}/default_default/index.html?videoId={video_id}'
if referer:
brightcove_new_url = smuggle_url(brightcove_new_url, {'referrer': referer})
return self.url_result(brightcove_new_url, BrightcoveNewIE.ie_key(), video_id)
@@ -538,12 +539,7 @@ class BrightcoveNewBaseIE(AdobePassIE):
})
def build_format_id(kind):
format_id = kind
if tbr:
format_id += f'-{int(tbr)}k'
if height:
format_id += f'-{height}p'
return format_id
return join_nonempty(kind, tbr and f'{int(tbr)}k', height and f'{height}p')
if src or streaming_src:
f.update({
@@ -801,7 +797,7 @@ class BrightcoveNewIE(BrightcoveNewBaseIE):
# Look for iframe embeds [1]
for _, url in re.findall(
r'<iframe[^>]+src=(["\'])((?:https?:)?//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage):
entries.append(url if url.startswith('http') else 'http:' + url)
entries.append(url if url.startswith(('http:', 'https:')) else 'https:' + url)
# Look for <video> tags [2] and embed_in_page embeds [3]
# [2] looks like:
@@ -830,7 +826,7 @@ class BrightcoveNewIE(BrightcoveNewBaseIE):
player_id = player_id or attrs.get('data-player') or 'default'
embed = embed or attrs.get('data-embed') or 'default'
bc_url = f'http://players.brightcove.net/{account_id}/{player_id}_{embed}/index.html?videoId={video_id}'
bc_url = f'https://players.brightcove.net/{account_id}/{player_id}_{embed}/index.html?videoId={video_id}'
# Some brightcove videos may be embedded with video tag only and
# without script tag or any mentioning of brightcove at all. Such
@@ -867,7 +863,7 @@ class BrightcoveNewIE(BrightcoveNewBaseIE):
store_pk = lambda x: self.cache.store('brightcove', policy_key_id, x)
def extract_policy_key():
base_url = f'http://players.brightcove.net/{account_id}/{player_id}_{embed}/'
base_url = f'https://players.brightcove.net/{account_id}/{player_id}_{embed}/'
config = self._download_json(
base_url + 'config.json', video_id, fatal=False) or {}
policy_key = try_get(

View File

@@ -8,11 +8,13 @@ from ..utils import (
bug_reports_message,
clean_html,
format_field,
get_element_text_and_html_by_tag,
int_or_none,
url_or_none,
)
from ..utils.traversal import traverse_obj
from ..utils.traversal import (
find_element,
traverse_obj,
)
class BundestagIE(InfoExtractor):
@@ -115,9 +117,8 @@ class BundestagIE(InfoExtractor):
note='Downloading metadata overlay', fatal=False,
), {
'title': (
{functools.partial(get_element_text_and_html_by_tag, 'h3')}, 0,
{functools.partial(re.sub, r'<span[^>]*>[^<]+</span>', '')}, {clean_html}),
'description': ({functools.partial(get_element_text_and_html_by_tag, 'p')}, 0, {clean_html}),
{find_element(tag='h3')}, {functools.partial(re.sub, r'<span[^>]*>[^<]+</span>', '')}, {clean_html}),
'description': ({find_element(tag='p')}, {clean_html}),
}))
return result

View File

@@ -53,7 +53,7 @@ class CaffeineTVIE(InfoExtractor):
'like_count': ('like_count', {int_or_none}),
'view_count': ('view_count', {int_or_none}),
'comment_count': ('comment_count', {int_or_none}),
'tags': ('tags', ..., {str}, {lambda x: x or None}),
'tags': ('tags', ..., {str}, filter),
'uploader': ('user', 'name', {str}),
'uploader_id': (((None, 'user'), 'username'), {str}, any),
'is_live': ('is_live', {bool}),
@@ -62,7 +62,7 @@ class CaffeineTVIE(InfoExtractor):
'title': ('broadcast_title', {str}),
'duration': ('content_duration', {int_or_none}),
'timestamp': ('broadcast_start_time', {parse_iso8601}),
'thumbnail': ('preview_image_path', {lambda x: urljoin(url, x)}),
'thumbnail': ('preview_image_path', {urljoin(url)}),
}),
'age_limit': {
# assume Apple Store ratings: https://en.wikipedia.org/wiki/Mobile_software_content_rating_system

View File

@@ -3,7 +3,7 @@ from ..utils import float_or_none, int_or_none, make_archive_id, traverse_obj
class CallinIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?callin\.com/(episode)/(?P<id>[-a-zA-Z]+)'
_VALID_URL = r'https?://(?:www\.)?callin\.com/episode/(?P<id>[-a-zA-Z]+)'
_TESTS = [{
'url': 'https://www.callin.com/episode/the-title-ix-regime-and-the-long-march-through-EBfXYSrsjc',
'info_dict': {

View File

@@ -1,22 +1,28 @@
import base64
import functools
import json
import re
import time
import urllib.parse
import xml.etree.ElementTree
from .common import InfoExtractor
from ..networking import HEADRequest
from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
join_nonempty,
js_to_json,
mimetype2ext,
orderedSet,
parse_iso8601,
replace_extension,
smuggle_url,
strip_or_none,
traverse_obj,
try_get,
update_url,
url_basename,
url_or_none,
)
@@ -149,6 +155,7 @@ class CBCIE(InfoExtractor):
class CBCPlayerIE(InfoExtractor):
IE_NAME = 'cbc.ca:player'
_VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/(?:video/)?|i/caffeine/syndicate/\?mediaId=))(?P<id>(?:\d\.)?\d+)'
_GEO_COUNTRIES = ['CA']
_TESTS = [{
'url': 'http://www.cbc.ca/player/play/2683190193',
'md5': '64d25f841ddf4ddb28a235338af32e2c',
@@ -172,21 +179,20 @@ class CBCPlayerIE(InfoExtractor):
'description': 'md5:dd3b692f0a139b0369943150bd1c46a9',
'timestamp': 1425704400,
'upload_date': '20150307',
'uploader': 'CBCC-NEW',
'thumbnail': 'http://thumbnails.cbc.ca/maven_legacy/thumbnails/sonali-karnick-220.jpg',
'thumbnail': 'https://i.cbc.ca/ais/1.2985700,1717262248558/full/max/0/default.jpg',
'chapters': [],
'duration': 494.811,
'categories': ['AudioMobile/All in a Weekend Montreal'],
'tags': 'count:8',
'categories': ['All in a Weekend Montreal'],
'tags': 'count:11',
'location': 'Quebec',
'series': 'All in a Weekend Montreal',
'season': 'Season 2015',
'season_number': 2015,
'media_type': 'Excerpt',
'genres': ['Other'],
},
}, {
'url': 'http://www.cbc.ca/i/caffeine/syndicate/?mediaId=2164402062',
'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6',
'info_dict': {
'id': '2164402062',
'ext': 'mp4',
@@ -194,107 +200,168 @@ class CBCPlayerIE(InfoExtractor):
'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.',
'timestamp': 1320410746,
'upload_date': '20111104',
'uploader': 'CBCC-NEW',
'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/277/67/cancer_852x480_2164412612.jpg',
'thumbnail': 'https://i.cbc.ca/ais/1.1711287,1717139372111/full/max/0/default.jpg',
'chapters': [],
'duration': 186.867,
'series': 'CBC News: Windsor at 6:00',
'categories': ['News/Canada/Windsor'],
'categories': ['Windsor'],
'location': 'Windsor',
'tags': ['cancer'],
'creators': ['Allison Johnson'],
'tags': ['Cancer', 'News/Canada/Windsor', 'Windsor'],
'media_type': 'Excerpt',
'genres': ['News'],
},
'params': {'skip_download': 'm3u8'},
}, {
# Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/
'url': 'https://www.cbc.ca/player/play/1.2985700',
'md5': 'e5e708c34ae6fca156aafe17c43e8b75',
'info_dict': {
'id': '2657631896',
'id': '1.2985700',
'ext': 'mp3',
'title': 'CBC Montreal is organizing its first ever community hackathon!',
'description': 'The modern technology we tend to depend on so heavily, is never without it\'s share of hiccups and headaches. Next weekend - CBC Montreal will be getting members of the public for its first Hackathon.',
'timestamp': 1425704400,
'upload_date': '20150307',
'uploader': 'CBCC-NEW',
'thumbnail': 'http://thumbnails.cbc.ca/maven_legacy/thumbnails/sonali-karnick-220.jpg',
'thumbnail': 'https://i.cbc.ca/ais/1.2985700,1717262248558/full/max/0/default.jpg',
'chapters': [],
'duration': 494.811,
'categories': ['AudioMobile/All in a Weekend Montreal'],
'tags': 'count:8',
'categories': ['All in a Weekend Montreal'],
'tags': 'count:11',
'location': 'Quebec',
'series': 'All in a Weekend Montreal',
'season': 'Season 2015',
'season_number': 2015,
'media_type': 'Excerpt',
'genres': ['Other'],
},
}, {
'url': 'https://www.cbc.ca/player/play/1.1711287',
'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6',
'info_dict': {
'id': '2164402062',
'id': '1.1711287',
'ext': 'mp4',
'title': 'Cancer survivor four times over',
'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.',
'timestamp': 1320410746,
'upload_date': '20111104',
'uploader': 'CBCC-NEW',
'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/277/67/cancer_852x480_2164412612.jpg',
'thumbnail': 'https://i.cbc.ca/ais/1.1711287,1717139372111/full/max/0/default.jpg',
'chapters': [],
'duration': 186.867,
'series': 'CBC News: Windsor at 6:00',
'categories': ['News/Canada/Windsor'],
'categories': ['Windsor'],
'location': 'Windsor',
'tags': ['cancer'],
'creators': ['Allison Johnson'],
'tags': ['Cancer', 'News/Canada/Windsor', 'Windsor'],
'media_type': 'Excerpt',
'genres': ['News'],
},
'params': {'skip_download': 'm3u8'},
}, {
# Has subtitles
# These broadcasts expire after ~1 month, can find new test URL here:
# https://www.cbc.ca/player/news/TV%20Shows/The%20National/Latest%20Broadcast
'url': 'https://www.cbc.ca/player/play/1.7159484',
'md5': '6ed6cd0fc2ef568d2297ba68a763d455',
'url': 'https://www.cbc.ca/player/play/video/9.6424403',
'md5': '8025909eaffcf0adf59922904def9a5e',
'info_dict': {
'id': '2324213316001',
'id': '9.6424403',
'ext': 'mp4',
'title': 'The National | School boards sue social media giants',
'description': 'md5:4b4db69322fa32186c3ce426da07402c',
'timestamp': 1711681200,
'duration': 2743.400,
'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]},
'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/607/559/thumbnail.jpeg',
'uploader': 'CBCC-NEW',
'title': 'The National | N.W.T. wildfire emergency',
'description': 'md5:ada33d36d1df69347ed575905bfd496c',
'timestamp': 1718589600,
'duration': 2692.833,
'subtitles': {
'en-US': [{
'name': 'English Captions',
'url': 'https://cbchls.akamaized.net/delivery/news-shows/2024/06/17/NAT_JUN16-00-55-00/NAT_JUN16_cc.vtt',
}],
},
'thumbnail': 'https://i.cbc.ca/ais/6272b5c6-5e78-4c05-915d-0e36672e33d1,1714756287822/full/max/0/default.jpg',
'chapters': 'count:5',
'upload_date': '20240329',
'categories': 'count:4',
'upload_date': '20240617',
'categories': ['News', 'The National', 'The National Latest Broadcasts'],
'series': 'The National - Full Show',
'tags': 'count:1',
'creators': ['News'],
'tags': ['The National'],
'location': 'Canada',
'media_type': 'Full Program',
'genres': ['News'],
},
}, {
'url': 'https://www.cbc.ca/player/play/video/1.7194274',
'md5': '188b96cf6bdcb2540e178a6caa957128',
'info_dict': {
'id': '2334524995812',
'id': '1.7194274',
'ext': 'mp4',
'title': '#TheMoment a rare white spirit moose was spotted in Alberta',
'description': 'md5:18ae269a2d0265c5b0bbe4b2e1ac61a3',
'timestamp': 1714788791,
'duration': 77.678,
'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]},
'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/201/543/THE_MOMENT.jpg',
'uploader': 'CBCC-NEW',
'chapters': 'count:0',
'upload_date': '20240504',
'thumbnail': 'https://i.cbc.ca/ais/1.7194274,1717224990425/full/max/0/default.jpg',
'chapters': [],
'categories': 'count:3',
'series': 'The National',
'tags': 'count:15',
'creators': ['encoder'],
'tags': 'count:17',
'location': 'Canada',
'media_type': 'Excerpt',
'upload_date': '20240504',
'genres': ['News'],
},
}, {
'url': 'https://www.cbc.ca/player/play/video/9.6427282',
'info_dict': {
'id': '9.6427282',
'ext': 'mp4',
'title': 'Men\'s Soccer - Argentina vs Morocco',
'description': 'Argentina faces Morocco on the football pitch at Saint Etienne Stadium.',
'series': 'CBC Sports',
'media_type': 'Event Coverage',
'thumbnail': 'https://i.cbc.ca/ais/a4c5c0c2-99fa-4bd3-8061-5a63879c1b33,1718828053500/full/max/0/default.jpg',
'timestamp': 1721825400.0,
'upload_date': '20240724',
'duration': 10568.0,
'chapters': [],
'genres': [],
'tags': ['2024 Paris Olympic Games'],
'categories': ['Olympics Summer Soccer', 'Summer Olympics Replays', 'Summer Olympics Soccer Replays'],
'location': 'Canada',
},
'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://www.cbc.ca/player/play/video/9.6459530',
'md5': '6c1bb76693ab321a2e99c347a1d5ecbc',
'info_dict': {
'id': '9.6459530',
'ext': 'mp4',
'title': 'Parts of Jasper incinerated as wildfire rages',
'description': 'md5:6f1caa8d128ad3f629257ef5fecf0962',
'series': 'The National',
'media_type': 'Excerpt',
'thumbnail': 'https://i.cbc.ca/ais/507c0086-31a2-494d-96e4-bffb1048d045,1721953984375/full/max/0/default.jpg',
'timestamp': 1721964091.012,
'upload_date': '20240726',
'duration': 952.285,
'chapters': [],
'genres': [],
'tags': 'count:23',
'categories': ['News (FAST)', 'News', 'The National', 'TV News Shows', 'The National '],
},
}, {
'url': 'https://www.cbc.ca/player/play/video/9.6420651',
'md5': '71a850c2c6ee5e912de169f5311bb533',
'info_dict': {
'id': '9.6420651',
'ext': 'mp4',
'title': 'Is it a breath of fresh air? Measuring air quality in Edmonton',
'description': 'md5:3922b92cc8b69212d739bd9dd095b1c3',
'series': 'CBC News Edmonton',
'media_type': 'Excerpt',
'thumbnail': 'https://i.cbc.ca/ais/73c4ab9c-7ad4-46ee-bb9b-020fdc01c745,1718214547576/full/max/0/default.jpg',
'timestamp': 1718220065.768,
'upload_date': '20240612',
'duration': 286.086,
'chapters': [],
'genres': ['News'],
'categories': ['News', 'Edmonton'],
'tags': 'count:7',
'location': 'Edmonton',
},
}, {
'url': 'cbcplayer:1.7159484',
@@ -307,23 +374,113 @@ class CBCPlayerIE(InfoExtractor):
'only_matching': True,
}]
def _parse_param(self, asset_data, name):
return traverse_obj(asset_data, ('params', lambda _, v: v['name'] == name, 'value', {str}, any))
def _real_extract(self, url):
video_id = self._match_id(url)
if '.' in video_id:
webpage = self._download_webpage(f'https://www.cbc.ca/player/play/{video_id}', video_id)
video_id = self._search_json(
r'window\.__INITIAL_STATE__\s*=', webpage,
'initial state', video_id)['video']['currentClip']['mediaId']
webpage = self._download_webpage(f'https://www.cbc.ca/player/play/{video_id}', video_id)
data = self._search_json(
r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id)['video']['currentClip']
assets = traverse_obj(
data, ('media', 'assets', lambda _, v: url_or_none(v['key']) and v['type']))
if not assets and (media_id := traverse_obj(data, ('mediaId', {str}))):
# XXX: Deprecated; CBC is migrating off of ThePlatform
return {
'_type': 'url_transparent',
'ie_key': 'ThePlatform',
'url': smuggle_url(
f'http://link.theplatform.com/s/ExhSPC/media/guid/2655402169/{media_id}?mbr=true&formats=MPEG4,FLV,MP3', {
'force_smil_url': True,
}),
'id': media_id,
'_format_sort_fields': ('res', 'proto'), # Prioritize direct http formats over HLS
}
is_live = traverse_obj(data, ('media', 'streamType', {str})) == 'Live'
formats, subtitles = [], {}
for sub in traverse_obj(data, ('media', 'textTracks', lambda _, v: url_or_none(v['src']))):
subtitles.setdefault(sub.get('language') or 'und', []).append({
'url': sub['src'],
'name': sub.get('label'),
})
for asset in assets:
asset_key = asset['key']
asset_type = asset['type']
if asset_type != 'medianet':
self.report_warning(f'Skipping unsupported asset type "{asset_type}": {asset_key}')
continue
asset_data = self._download_json(asset_key, video_id, f'Downloading {asset_type} JSON')
ext = mimetype2ext(self._parse_param(asset_data, 'contentType'))
if ext == 'm3u8':
fmts, subs = self._extract_m3u8_formats_and_subtitles(
asset_data['url'], video_id, 'mp4', m3u8_id='hls', live=is_live)
formats.extend(fmts)
# Avoid slow/error-prone webvtt-over-m3u8 if direct https vtt is available
if not subtitles:
self._merge_subtitles(subs, target=subtitles)
if is_live or not fmts:
continue
# Check for direct https mp4 format
best_video_fmt = traverse_obj(fmts, (
lambda _, v: v.get('vcodec') != 'none' and v['tbr'], all,
{functools.partial(sorted, key=lambda x: x['tbr'])}, -1, {dict})) or {}
base_url = self._search_regex(
r'(https?://[^?#]+?/)hdntl=', best_video_fmt.get('url'), 'base url', default=None)
if not base_url or '/live/' in base_url:
continue
mp4_url = base_url + replace_extension(url_basename(best_video_fmt['url']), 'mp4')
if self._request_webpage(
HEADRequest(mp4_url), video_id, 'Checking for https format',
errnote=False, fatal=False):
formats.append({
**best_video_fmt,
'url': mp4_url,
'format_id': 'https-mp4',
'protocol': 'https',
'manifest_url': None,
'acodec': None,
})
else:
formats.append({
'url': asset_data['url'],
'ext': ext,
'vcodec': 'none' if self._parse_param(asset_data, 'mediaType') == 'audio' else None,
})
chapters = traverse_obj(data, (
'media', 'chapters', lambda _, v: float(v['startTime']) is not None, {
'start_time': ('startTime', {float_or_none(scale=1000)}),
'end_time': ('endTime', {float_or_none(scale=1000)}),
'title': ('name', {str}),
}))
# Filter out pointless single chapters with start_time==0 and no end_time
if len(chapters) == 1 and not (chapters[0].get('start_time') or chapters[0].get('end_time')):
chapters = []
return {
'_type': 'url_transparent',
'ie_key': 'ThePlatform',
'url': smuggle_url(
f'http://link.theplatform.com/s/ExhSPC/media/guid/2655402169/{video_id}?mbr=true&formats=MPEG4,FLV,MP3', {
'force_smil_url': True,
}),
**traverse_obj(data, {
'title': ('title', {str}),
'description': ('description', {str.strip}),
'thumbnail': ('image', 'url', {url_or_none}, {update_url(query=None)}),
'timestamp': ('publishedAt', {float_or_none(scale=1000)}),
'media_type': ('media', 'clipType', {str}),
'series': ('showName', {str}),
'season_number': ('media', 'season', {int_or_none}),
'duration': ('media', 'duration', {float_or_none}, {lambda x: None if is_live else x}),
'location': ('media', 'region', {str}),
'tags': ('tags', ..., 'name', {str}),
'genres': ('media', 'genre', all),
'categories': ('categories', ..., 'name', {str}),
}),
'id': video_id,
'_format_sort_fields': ('res', 'proto'), # Prioritize direct http formats over HLS
'formats': formats,
'subtitles': subtitles,
'chapters': chapters,
'is_live': is_live,
}
@@ -365,14 +522,13 @@ class CBCGemIE(InfoExtractor):
_TESTS = [{
# This is a normal, public, TV show video
'url': 'https://gem.cbc.ca/media/schitts-creek/s06e01',
'md5': '93dbb31c74a8e45b378cf13bd3f6f11e',
'info_dict': {
'id': 'schitts-creek/s06e01',
'ext': 'mp4',
'title': 'Smoke Signals',
'description': 'md5:929868d20021c924020641769eb3e7f1',
'thumbnail': 'https://images.radio-canada.ca/v1/synps-cbc/episode/perso/cbc_schitts_creek_season_06e01_thumbnail_v01.jpg?im=Resize=(Size)',
'duration': 1314,
'thumbnail': r're:https://images\.radio-canada\.ca/[^#?]+/cbc_schitts_creek_season_06e01_thumbnail_v01\.jpg',
'duration': 1324,
'categories': ['comedy'],
'series': 'Schitt\'s Creek',
'season': 'Season 6',
@@ -380,19 +536,21 @@ class CBCGemIE(InfoExtractor):
'episode': 'Smoke Signals',
'episode_number': 1,
'episode_id': 'schitts-creek/s06e01',
'upload_date': '20210618',
'timestamp': 1623988800,
'release_date': '20200107',
'release_timestamp': 1578427200,
},
'params': {'format': 'bv'},
'skip': 'Geo-restricted to Canada',
}, {
# This video requires an account in the browser, but works fine in yt-dlp
'url': 'https://gem.cbc.ca/media/schitts-creek/s01e01',
'md5': '297a9600f554f2258aed01514226a697',
'info_dict': {
'id': 'schitts-creek/s01e01',
'ext': 'mp4',
'title': 'The Cup Runneth Over',
'description': 'md5:9bca14ea49ab808097530eb05a29e797',
'thumbnail': 'https://images.radio-canada.ca/v1/synps-cbc/episode/perso/cbc_schitts_creek_season_01e01_thumbnail_v01.jpg?im=Resize=(Size)',
'thumbnail': r're:https://images\.radio-canada\.ca/[^#?]+/cbc_schitts_creek_season_01e01_thumbnail_v01\.jpg',
'series': 'Schitt\'s Creek',
'season_number': 1,
'season': 'Season 1',
@@ -401,9 +559,12 @@ class CBCGemIE(InfoExtractor):
'episode_id': 'schitts-creek/s01e01',
'duration': 1309,
'categories': ['comedy'],
'upload_date': '20210617',
'timestamp': 1623902400,
'release_date': '20151124',
'release_timestamp': 1448323200,
},
'params': {'format': 'bv'},
'skip': 'Geo-restricted to Canada',
}, {
'url': 'https://gem.cbc.ca/nadiyas-family-favourites/s01e01',
'only_matching': True,
@@ -455,10 +616,8 @@ class CBCGemIE(InfoExtractor):
def claims_token_expired(self):
exp = self._get_claims_token_expiry()
if exp - time.time() < 10:
# It will expire in less than 10 seconds, or has already expired
return True
return False
# It will expire in less than 10 seconds, or has already expired
return exp - time.time() < 10
def claims_token_valid(self):
return self._claims_token is not None and not self.claims_token_expired()
@@ -474,38 +633,6 @@ class CBCGemIE(InfoExtractor):
return
self._claims_token = self.cache.load(self._NETRC_MACHINE, 'claims_token')
def _find_secret_formats(self, formats, video_id):
""" Find a valid video url and convert it to the secret variant """
base_format = next((f for f in formats if f.get('vcodec') != 'none'), None)
if not base_format:
return
base_url = re.sub(r'(Manifest\(.*?),filter=[\w-]+(.*?\))', r'\1\2', base_format['url'])
url = re.sub(r'(Manifest\(.*?),format=[\w-]+(.*?\))', r'\1\2', base_url)
secret_xml = self._download_xml(url, video_id, note='Downloading secret XML', fatal=False)
if not isinstance(secret_xml, xml.etree.ElementTree.Element):
return
for child in secret_xml:
if child.attrib.get('Type') != 'video':
continue
for video_quality in child:
bitrate = int_or_none(video_quality.attrib.get('Bitrate'))
if not bitrate or 'Index' not in video_quality.attrib:
continue
height = int_or_none(video_quality.attrib.get('MaxHeight'))
yield {
**base_format,
'format_id': join_nonempty('sec', height),
# Note: \g<1> is necessary instead of \1 since bitrate is a number
'url': re.sub(r'(QualityLevels\()\d+(\))', fr'\g<1>{bitrate}\2', base_url),
'width': int_or_none(video_quality.attrib.get('MaxWidth')),
'tbr': bitrate / 1000.0,
'height': height,
}
def _real_extract(self, url):
video_id = self._match_id(url)
video_info = self._download_json(
@@ -519,7 +646,6 @@ class CBCGemIE(InfoExtractor):
else:
headers = {}
m3u8_info = self._download_json(video_info['playSession']['url'], video_id, headers=headers)
m3u8_url = m3u8_info.get('url')
if m3u8_info.get('errorCode') == 1:
self.raise_geo_restricted(countries=['CA'])
@@ -528,9 +654,9 @@ class CBCGemIE(InfoExtractor):
elif m3u8_info.get('errorCode') != 0:
raise ExtractorError(f'{self.IE_NAME} said: {m3u8_info.get("errorCode")} - {m3u8_info.get("message")}')
formats = self._extract_m3u8_formats(m3u8_url, video_id, m3u8_id='hls')
formats = self._extract_m3u8_formats(
m3u8_info['url'], video_id, 'mp4', m3u8_id='hls', query={'manifestType': ''})
self._remove_duplicate_formats(formats)
formats.extend(self._find_secret_formats(formats, video_id))
for fmt in formats:
if fmt.get('vcodec') == 'none':
@@ -546,20 +672,21 @@ class CBCGemIE(InfoExtractor):
return {
'id': video_id,
'title': video_info['title'],
'description': video_info.get('description'),
'thumbnail': video_info.get('image'),
'series': video_info.get('series'),
'season_number': video_info.get('season'),
'season': f'Season {video_info.get("season")}',
'episode_number': video_info.get('episode'),
'episode': video_info.get('title'),
'episode_id': video_id,
'duration': video_info.get('duration'),
'categories': [video_info.get('category')],
'formats': formats,
'release_timestamp': video_info.get('airDate'),
'timestamp': video_info.get('availableDate'),
**traverse_obj(video_info, {
'title': ('title', {str}),
'episode': ('title', {str}),
'description': ('description', {str}),
'thumbnail': ('image', {url_or_none}),
'series': ('series', {str}),
'season_number': ('season', {int_or_none}),
'episode_number': ('episode', {int_or_none}),
'duration': ('duration', {int_or_none}),
'categories': ('category', {str}, all),
'release_timestamp': ('airDate', {int_or_none(scale=1000)}),
'timestamp': ('availableDate', {int_or_none(scale=1000)}),
}),
}
@@ -649,11 +776,11 @@ class CBCGemLiveIE(InfoExtractor):
'title': 'Ottawa',
'description': 'The live TV channel and local programming from Ottawa',
'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/CBC_OTT_VMS/Live_Channel_Static_Images/Ottawa_2880x1620.jpg',
'is_live': True,
'live_status': 'is_live',
'id': 'AyqZwxRqh8EH',
'ext': 'mp4',
'timestamp': 1492106160,
'upload_date': '20170413',
'release_timestamp': 1492106160,
'release_date': '20170413',
'uploader': 'CBCC-NEW',
},
'skip': 'Live might have ended',
@@ -682,49 +809,84 @@ class CBCGemLiveIE(InfoExtractor):
'description': 'March 24, 2023 | President Bidens Ottawa visit ends with big pledges from both countries. Plus, Gwyneth Paltrow testifies in her ski collision trial.',
'live_status': 'is_live',
'thumbnail': r're:https://images.gem.cbc.ca/v1/cbc-gem/live/.*',
'timestamp': 1679706000,
'upload_date': '20230325',
'release_timestamp': 1679706000,
'release_date': '20230325',
},
'params': {'skip_download': True},
'skip': 'Live might have ended',
},
{ # event replay (medianetlive)
'url': 'https://gem.cbc.ca/live-event/42314',
'md5': '297a9600f554f2258aed01514226a697',
'info_dict': {
'id': '42314',
'ext': 'mp4',
'live_status': 'was_live',
'title': 'Women\'s Soccer - Canada vs New Zealand',
'description': 'md5:36200e5f1a70982277b5a6ecea86155d',
'thumbnail': r're:https://.+default\.jpg',
'release_timestamp': 1721917200,
'release_date': '20240725',
},
'params': {'skip_download': True},
'skip': 'Replay might no longer be available',
},
{ # event replay (medianetlive)
'url': 'https://gem.cbc.ca/live-event/43273',
'only_matching': True,
},
]
_GEO_COUNTRIES = ['CA']
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
video_info = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['data']
# Two types of metadata JSON
# Three types of video_info JSON: info in root, freeTv stream/item, event replay
if not video_info.get('formattedIdMedia'):
video_info = traverse_obj(
video_info, (('freeTv', ('streams', ...)), 'items', lambda _, v: v['key'] == video_id, {dict}),
get_all=False, default={})
if traverse_obj(video_info, ('event', 'key')) == video_id:
video_info = video_info['event']
else:
video_info = traverse_obj(video_info, (
('freeTv', ('streams', ...)), 'items',
lambda _, v: v['key'].partition('-')[0] == video_id, any)) or {}
video_stream_id = video_info.get('formattedIdMedia')
if not video_stream_id:
raise ExtractorError('Couldn\'t find video metadata, maybe this livestream is now offline', expected=True)
raise ExtractorError(
'Couldn\'t find video metadata, maybe this livestream is now offline', expected=True)
stream_data = self._download_json(
'https://services.radio-canada.ca/media/validation/v2/', video_id, query={
'appCode': 'mpx',
'connectionType': 'hd',
'deviceType': 'ipad',
'idMedia': video_stream_id,
'multibitrate': 'true',
'output': 'json',
'tech': 'hls',
'manifestType': 'desktop',
})
live_status = 'was_live' if video_info.get('isVodEnabled') else 'is_live'
release_timestamp = traverse_obj(video_info, ('airDate', {parse_iso8601}))
if live_status == 'is_live' and release_timestamp and release_timestamp > time.time():
formats = []
live_status = 'is_upcoming'
self.raise_no_formats('This livestream has not yet started', expected=True)
else:
stream_data = self._download_json(
'https://services.radio-canada.ca/media/validation/v2/', video_id, query={
'appCode': 'medianetlive',
'connectionType': 'hd',
'deviceType': 'ipad',
'idMedia': video_stream_id,
'multibitrate': 'true',
'output': 'json',
'tech': 'hls',
'manifestType': 'desktop',
})
formats = self._extract_m3u8_formats(
stream_data['url'], video_id, 'mp4', live=live_status == 'is_live')
return {
'id': video_id,
'formats': self._extract_m3u8_formats(stream_data['url'], video_id, 'mp4', live=True),
'is_live': True,
'formats': formats,
'live_status': live_status,
'release_timestamp': release_timestamp,
**traverse_obj(video_info, {
'title': 'title',
'description': 'description',
'title': ('title', {str}),
'description': ('description', {str}),
'thumbnail': ('images', 'card', 'url'),
'timestamp': ('airDate', {parse_iso8601}),
}),
}

View File

@@ -1,6 +1,5 @@
import base64
import re
import urllib.error
import urllib.parse
import zlib
@@ -97,7 +96,7 @@ class CBSNewsBaseIE(InfoExtractor):
**traverse_obj(item, {
'title': (None, ('fulltitle', 'title')),
'description': 'dek',
'timestamp': ('timestamp', {lambda x: float_or_none(x, 1000)}),
'timestamp': ('timestamp', {float_or_none(scale=1000)}),
'duration': ('duration', {float_or_none}),
'subtitles': ('captions', {get_subtitles}),
'thumbnail': ('images', ('hd', 'sd'), {url_or_none}),

View File

@@ -12,53 +12,86 @@ from ..utils import (
class CCMAIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?ccma\.cat/(?:[^/]+/)*?(?P<type>video|audio)/(?P<id>\d+)'
IE_DESC = '3Cat, TV3 and Catalunya Ràdio'
_VALID_URL = r'https?://(?:www\.)?3cat\.cat/(?:3cat|tv3/sx3)/[^/?#]+/(?P<type>video|audio)/(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.ccma.cat/tv3/alacarta/lespot-de-la-marato-de-tv3/lespot-de-la-marato-de-tv3/video/5630208/',
# ccma.cat/tv3/alacarta/ URLs redirect to 3cat.cat/3cat/
'url': 'https://www.3cat.cat/3cat/lespot-de-la-marato-de-tv3/video/5630208/',
'md5': '7296ca43977c8ea4469e719c609b0871',
'info_dict': {
'id': '5630208',
'ext': 'mp4',
'title': 'L\'espot de La Marató de TV3',
'title': 'L\'espot de La Marató 2016: Ictus i les lesions medul·lars i cerebrals traumàtiques',
'description': 'md5:f12987f320e2f6e988e9908e4fe97765',
'timestamp': 1478608140,
'upload_date': '20161108',
'age_limit': 0,
'alt_title': 'EsportMarató2016WEB_PerPublicar',
'duration': 79,
'thumbnail': 'https://img.3cat.cat/multimedia/jpg/4/6/1478536106664.jpg',
'series': 'Dedicada a l\'ictus i les lesions medul·lars i cerebrals traumàtiques',
'categories': ['Divulgació'],
},
}, {
'url': 'http://www.ccma.cat/catradio/alacarta/programa/el-consell-de-savis-analitza-el-derbi/audio/943685/',
# ccma.cat/catradio/alacarta/ URLs redirect to 3cat.cat/3cat/
'url': 'https://www.3cat.cat/3cat/el-consell-de-savis-analitza-el-derbi/audio/943685/',
'md5': 'fa3e38f269329a278271276330261425',
'info_dict': {
'id': '943685',
'ext': 'mp3',
'title': 'El Consell de Savis analitza el derbi',
'description': 'md5:e2a3648145f3241cb9c6b4b624033e53',
'upload_date': '20170512',
'timestamp': 1494622500,
'upload_date': '20161217',
'timestamp': 1482011700,
'vcodec': 'none',
'categories': ['Esports'],
'series': 'Tot gira',
'duration': 821,
'thumbnail': 'https://img.3cat.cat/multimedia/jpg/8/9/1482002602598.jpg',
},
}, {
'url': 'http://www.ccma.cat/tv3/alacarta/crims/crims-josep-tallada-lespereu-me-capitol-1/video/6031387/',
'md5': 'b43c3d3486f430f3032b5b160d80cbc3',
'url': 'https://www.3cat.cat/3cat/crims-josep-tallada-lespereu-me-part-1/video/6031387/',
'md5': '27493513d08a3e5605814aee9bb778d2',
'info_dict': {
'id': '6031387',
'ext': 'mp4',
'title': 'Crims - Josep Talleda, l\'"Espereu-me" (capítol 1)',
'title': 'T1xC5 - Josep Talleda, l\'"Espereu-me" (part 1)',
'description': 'md5:7cbdafb640da9d0d2c0f62bad1e74e60',
'timestamp': 1582577700,
'timestamp': 1582577919,
'upload_date': '20200224',
'subtitles': 'mincount:4',
'age_limit': 16,
'subtitles': 'mincount:1',
'age_limit': 13,
'series': 'Crims',
'thumbnail': 'https://img.3cat.cat/multimedia/jpg/1/9/1582564376991.jpg',
'duration': 3203,
'categories': ['Divulgació'],
'alt_title': 'Crims - 5 - Josep Talleda, l\'"Espereu-me" (1a part) - Josep Talleda, l\'"Espereu-me" (part 1)',
'episode_number': 5,
'episode': 'Episode 5',
},
}, {
'url': 'https://www.3cat.cat/tv3/sx3/una-mosca-volava-per-la-llum/video/5759227/',
'info_dict': {
'id': '5759227',
'ext': 'mp4',
'title': 'Una mosca volava per la llum',
'alt_title': '17Z004Ç UNA MOSCA VOLAVA PER LA LLUM',
'description': 'md5:9ab64276944b0825336f4147f13f7854',
'series': 'Mic',
'upload_date': '20180411',
'timestamp': 1523440105,
'duration': 160,
'age_limit': 0,
'thumbnail': 'https://img.3cat.cat/multimedia/jpg/6/1/1524071667216.jpg',
'categories': ['Música'],
},
}]
def _real_extract(self, url):
media_type, media_id = self._match_valid_url(url).groups()
media_type, media_id = self._match_valid_url(url).group('type', 'id')
media = self._download_json(
'http://dinamics.ccma.cat/pvideo/media.jsp', media_id, query={
'http://api-media.3cat.cat/pvideo/media.jsp', media_id, query={
'media': media_type,
'idint': media_id,
'format': 'dm',

View File

@@ -12,6 +12,7 @@ from .common import InfoExtractor
from ..compat import compat_ord
from ..utils import (
ExtractorError,
OnDemandPagedList,
float_or_none,
int_or_none,
merge_dicts,
@@ -351,3 +352,50 @@ class CDAIE(InfoExtractor):
extract_format(webpage, resolution)
return merge_dicts(info_dict, info)
class CDAFolderIE(InfoExtractor):
_MAX_PAGE_SIZE = 36
_VALID_URL = r'https?://(?:www\.)?cda\.pl/(?P<channel>\w+)/folder/(?P<id>\d+)'
_TESTS = [
{
'url': 'https://www.cda.pl/domino264/folder/31188385',
'info_dict': {
'id': '31188385',
'title': 'SERIA DRUGA',
},
'playlist_mincount': 13,
},
{
'url': 'https://www.cda.pl/smiechawaTV/folder/2664592/vfilm',
'info_dict': {
'id': '2664592',
'title': 'VideoDowcipy - wszystkie odcinki',
},
'playlist_mincount': 71,
},
{
'url': 'https://www.cda.pl/DeliciousBeauty/folder/19129979/vfilm',
'info_dict': {
'id': '19129979',
'title': 'TESTY KOSMETYKÓW',
},
'playlist_mincount': 139,
}]
def _real_extract(self, url):
folder_id, channel = self._match_valid_url(url).group('id', 'channel')
webpage = self._download_webpage(url, folder_id)
def extract_page_entries(page):
webpage = self._download_webpage(
f'https://www.cda.pl/{channel}/folder/{folder_id}/vfilm/{page + 1}', folder_id,
f'Downloading page {page + 1}', expected_status=404)
items = re.findall(r'<a[^>]+href="/video/([0-9a-z]+)"', webpage)
for video_id in items:
yield self.url_result(f'https://www.cda.pl/video/{video_id}', CDAIE, video_id)
return self.playlist_result(
OnDemandPagedList(extract_page_entries, self._MAX_PAGE_SIZE),
folder_id, self._og_search_title(webpage))

View File

@@ -1,63 +1,50 @@
from .common import InfoExtractor
from ..utils import traverse_obj
from .vidyard import VidyardBaseIE, VidyardIE
from ..utils import ExtractorError, make_archive_id, url_basename
class CellebriteIE(InfoExtractor):
class CellebriteIE(VidyardBaseIE):
_VALID_URL = r'https?://cellebrite\.com/(?:\w+)?/(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://cellebrite.com/en/collect-data-from-android-devices-with-cellebrite-ufed/',
'info_dict': {
'id': '16025876',
'id': 'ZqmUss3dQfEMGpauambPuH',
'display_id': '16025876',
'ext': 'mp4',
'description': 'md5:174571cb97083fd1d457d75c684f4e2b',
'thumbnail': 'https://cellebrite.com/wp-content/uploads/2021/05/Chat-Capture-1024x559.png',
'title': 'Ask the Expert: Chat Capture - Collect Data from Android Devices in Cellebrite UFED',
'duration': 455,
'tags': [],
'description': 'md5:dee48fe12bbae5c01fe6a053f7676da4',
'thumbnail': 'https://cellebrite.com/wp-content/uploads/2021/05/Chat-Capture-1024x559.png',
'duration': 455.979,
'_old_archive_ids': ['cellebrite 16025876'],
},
}, {
'url': 'https://cellebrite.com/en/how-to-lawfully-collect-the-maximum-amount-of-data-from-android-devices/',
'info_dict': {
'id': '29018255',
'id': 'QV1U8a2yzcxigw7VFnqKyg',
'display_id': '29018255',
'ext': 'mp4',
'duration': 134,
'tags': [],
'description': 'md5:e9a3d124c7287b0b07bad2547061cacf',
'title': 'How to Lawfully Collect the Maximum Amount of Data From Android Devices',
'description': 'md5:0e943a9ac14c374d5d74faed634d773c',
'thumbnail': 'https://cellebrite.com/wp-content/uploads/2022/07/How-to-Lawfully-Collect-the-Maximum-Amount-of-Data-From-Android-Devices.png',
'title': 'Android Extractions Explained',
'duration': 134.315,
'_old_archive_ids': ['cellebrite 29018255'],
},
}]
def _get_formats_and_subtitles(self, json_data, display_id):
formats = [{'url': url} for url in traverse_obj(json_data, ('mp4', ..., 'url')) or []]
subtitles = {}
for url in traverse_obj(json_data, ('hls', ..., 'url')) or []:
fmt, sub = self._extract_m3u8_formats_and_subtitles(
url, display_id, ext='mp4', headers={'Referer': 'https://play.vidyard.com/'})
formats.extend(fmt)
self._merge_subtitles(sub, target=subtitles)
return formats, subtitles
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
slug = self._match_id(url)
webpage = self._download_webpage(url, slug)
vidyard_url = next(VidyardIE._extract_embed_urls(url, webpage), None)
if not vidyard_url:
raise ExtractorError('No Vidyard video embeds found on page')
player_uuid = self._search_regex(
r'<img\s[^>]*\bdata-uuid\s*=\s*"([^"\?]+)', webpage, 'player UUID')
json_data = self._download_json(
f'https://play.vidyard.com/player/{player_uuid}.json', display_id)['payload']['chapters'][0]
video_id = url_basename(vidyard_url)
info = self._process_video_json(self._fetch_video_json(video_id)['chapters'][0], video_id)
if info.get('display_id'):
info['_old_archive_ids'] = [make_archive_id(self, info['display_id'])]
if thumbnail := self._og_search_thumbnail(webpage, default=None):
info.setdefault('thumbnails', []).append({'url': thumbnail})
formats, subtitles = self._get_formats_and_subtitles(json_data['sources'], display_id)
return {
'id': str(json_data['videoId']),
'title': json_data.get('name') or self._og_search_title(webpage),
'formats': formats,
'subtitles': subtitles,
'description': json_data.get('description') or self._og_search_description(webpage),
'duration': json_data.get('seconds'),
'tags': json_data.get('tags'),
'thumbnail': self._og_search_thumbnail(webpage),
'http_headers': {'Referer': 'https://play.vidyard.com/'},
'description': self._og_search_description(webpage, default=None),
**info,
}

View File

@@ -1,5 +1,3 @@
import functools
from .common import InfoExtractor
from ..utils import (
UserNotLive,
@@ -36,7 +34,7 @@ class CHZZKLiveIE(InfoExtractor):
def _real_extract(self, url):
channel_id = self._match_id(url)
live_detail = self._download_json(
f'https://api.chzzk.naver.com/service/v2/channels/{channel_id}/live-detail', channel_id,
f'https://api.chzzk.naver.com/service/v3/channels/{channel_id}/live-detail', channel_id,
note='Downloading channel info', errnote='Unable to download channel info')['content']
if live_detail.get('status') == 'CLOSE':
@@ -77,7 +75,7 @@ class CHZZKLiveIE(InfoExtractor):
'thumbnails': thumbnails,
**traverse_obj(live_detail, {
'title': ('liveTitle', {str}),
'timestamp': ('openDate', {functools.partial(parse_iso8601, delimiter=' ')}),
'timestamp': ('openDate', {parse_iso8601(delimiter=' ')}),
'concurrent_view_count': ('concurrentUserCount', {int_or_none}),
'view_count': ('accumulateCount', {int_or_none}),
'channel': ('channel', 'channelName', {str}),
@@ -106,30 +104,77 @@ class CHZZKVideoIE(InfoExtractor):
'upload_date': '20231219',
'view_count': int,
},
'skip': 'Replay video is expired',
}, {
# Manually uploaded video
'url': 'https://chzzk.naver.com/video/1980',
'info_dict': {
'id': '1980',
'ext': 'mp4',
'title': '※시청주의※한번보면 잊기 힘든 영상',
'channel': '라디유radiyu',
'channel_id': '68f895c59a1043bc5019b5e08c83a5c5',
'channel_is_verified': False,
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 95,
'timestamp': 1703102631.722,
'upload_date': '20231220',
'view_count': int,
},
}, {
# Partner channel replay video
'url': 'https://chzzk.naver.com/video/2458',
'info_dict': {
'id': '2458',
'ext': 'mp4',
'title': '첫 방송',
'channel': '강지',
'channel_id': 'b5ed5db484d04faf4d150aedd362f34b',
'channel_is_verified': True,
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 4433,
'timestamp': 1703307460.214,
'upload_date': '20231223',
'view_count': int,
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
video_meta = self._download_json(
f'https://api.chzzk.naver.com/service/v2/videos/{video_id}', video_id,
f'https://api.chzzk.naver.com/service/v3/videos/{video_id}', video_id,
note='Downloading video info', errnote='Unable to download video info')['content']
formats, subtitles = self._extract_mpd_formats_and_subtitles(
f'https://apis.naver.com/neonplayer/vodplay/v1/playback/{video_meta["videoId"]}', video_id,
query={
'key': video_meta['inKey'],
'env': 'real',
'lc': 'en_US',
'cpl': 'en_US',
}, note='Downloading video playback', errnote='Unable to download video playback')
live_status = 'was_live' if video_meta.get('liveOpenDate') else 'not_live'
video_status = video_meta.get('vodStatus')
if video_status == 'UPLOAD':
playback = self._parse_json(video_meta['liveRewindPlaybackJson'], video_id)
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
playback['media'][0]['path'], video_id, 'mp4', m3u8_id='hls')
elif video_status == 'ABR_HLS':
formats, subtitles = self._extract_mpd_formats_and_subtitles(
f'https://apis.naver.com/neonplayer/vodplay/v1/playback/{video_meta["videoId"]}',
video_id, query={
'key': video_meta['inKey'],
'env': 'real',
'lc': 'en_US',
'cpl': 'en_US',
})
else:
self.raise_no_formats(
f'Unknown video status detected: "{video_status}"', expected=True, video_id=video_id)
formats, subtitles = [], {}
live_status = 'post_live' if live_status == 'was_live' else None
return {
'id': video_id,
'formats': formats,
'subtitles': subtitles,
'live_status': live_status,
**traverse_obj(video_meta, {
'title': ('videoTitle', {str}),
'thumbnail': ('thumbnailImageUrl', {url_or_none}),
'timestamp': ('publishDateAt', {functools.partial(float_or_none, scale=1000)}),
'timestamp': ('publishDateAt', {float_or_none(scale=1000)}),
'view_count': ('readCount', {int_or_none}),
'duration': ('duration', {int_or_none}),
'channel': ('channel', 'channelName', {str}),

View File

@@ -3,6 +3,7 @@ import re
from .common import InfoExtractor
from ..utils import (
filter_dict,
float_or_none,
int_or_none,
parse_age_limit,
smuggle_url,
@@ -85,7 +86,7 @@ class CineverseIE(CineverseBaseIE):
'title': 'title',
'id': ('details', 'item_id'),
'description': ('details', 'description'),
'duration': ('duration', {lambda x: x / 1000}),
'duration': ('duration', {float_or_none(scale=1000)}),
'cast': ('details', 'cast', {lambda x: x.split(', ')}),
'modified_timestamp': ('details', 'updated_by', 0, 'update_time', 'time', {int_or_none}),
'season_number': ('details', 'season', {int_or_none}),

View File

@@ -6,11 +6,11 @@ from .common import InfoExtractor
class CloudflareStreamIE(InfoExtractor):
_SUBDOMAIN_RE = r'(?:(?:watch|iframe|customer-\w+)\.)?'
_DOMAIN_RE = r'(?:cloudflarestream\.com|(?:videodelivery|bytehighway)\.net)'
_EMBED_RE = rf'embed\.{_DOMAIN_RE}/embed/[^/]+\.js\?.*?\bvideo='
_ID_RE = r'[\da-f]{32}|[\w-]+\.[\w-]+\.[\w-]+'
_EMBED_RE = rf'(?:embed\.|{_SUBDOMAIN_RE}){_DOMAIN_RE}/embed/[^/?#]+\.js\?(?:[^#]+&)?video='
_ID_RE = r'[\da-f]{32}|eyJ[\w-]+\.[\w-]+\.[\w-]+'
_VALID_URL = rf'https?://(?:{_SUBDOMAIN_RE}{_DOMAIN_RE}/|{_EMBED_RE})(?P<id>{_ID_RE})'
_EMBED_REGEX = [
rf'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//{_EMBED_RE}(?:{_ID_RE}).*?)\1',
rf'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//{_EMBED_RE}(?:{_ID_RE})(?:(?!\1).)*)\1',
rf'<iframe[^>]+\bsrc=["\'](?P<url>https?://{_SUBDOMAIN_RE}{_DOMAIN_RE}/[\da-f]{{32}})',
]
_TESTS = [{
@@ -24,6 +24,14 @@ class CloudflareStreamIE(InfoExtractor):
'params': {
'skip_download': 'm3u8',
},
}, {
'url': 'https://watch.cloudflarestream.com/embed/sdk-iframe-integration.fla9.latest.js?video=0e8e040aec776862e1d632a699edf59e',
'info_dict': {
'id': '0e8e040aec776862e1d632a699edf59e',
'ext': 'mp4',
'title': '0e8e040aec776862e1d632a699edf59e',
'thumbnail': 'https://videodelivery.net/0e8e040aec776862e1d632a699edf59e/thumbnails/thumbnail.jpg',
},
}, {
'url': 'https://watch.cloudflarestream.com/9df17203414fd1db3e3ed74abbe936c1',
'only_matching': True,
@@ -36,6 +44,9 @@ class CloudflareStreamIE(InfoExtractor):
}, {
'url': 'https://customer-aw5py76sw8wyqzmh.cloudflarestream.com/2463f6d3e06fa29710a337f5f5389fd8/iframe',
'only_matching': True,
}, {
'url': 'https://watch.cloudflarestream.com/eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJraWQiOiJmYTA0YjViMzQ2NDkwYTM5NWJiNzQ1NWFhZTA2YzYwZSIsInN1YiI6Ijg4ZDQxMDhhMzY0MjA3M2VhYmFhZjg3ZGExODJkMjYzIiwiZXhwIjoxNjAwNjA5MzE5fQ.xkRJwLGkt0nZ%5F0BlPiwU7iW4pqb4lKkznbKfAhGg0tGcxSS6ZBA3lcTUwu7W%2DyCFbnAl%2Dhqk3Fn%5FqeQS%5FQydP27qTHpB9iIFFsMtk1tqzGZV5v4yrYDnwLSKzEKvVd6QwJnfABtxH2JdpSNuWlMUiVXFxGWgjOw6QeTNDDklTQYXV%5FNLV7sErSn5CeOPeRRkdXb%2D8ip%5FVOcfk1nDsFoOo4fctFtGP0wYMyY5ae8nhhatydHwevuvJCcEvEfh%2D4qjq9mCZOodevmtSQ4YWmggf4BxtWnDWYrGW8Otp6oqezrR8oY4%2DbKdV6PaqBj49aJdcls6xK7PmM8%5Fvjy3xfm0Mg',
'only_matching': True,
}]
_WEBPAGE_TESTS = [{
'url': 'https://upride.cc/incident/shoulder-pass-at-light/',

View File

@@ -1,3 +1,5 @@
import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
@@ -35,6 +37,20 @@ class CloudyCDNIE(InfoExtractor):
'duration': 1205,
'upload_date': '20221130',
},
}, {
# Video-only m3u8 formats need manual fixup
'url': 'https://embed.cloudycdn.services/ltv/media/08j_d24-6000-074',
'md5': 'fc472e40f6e6238446509be411c920e2',
'info_dict': {
'id': '08j_d24-6000-074',
'ext': 'mp4',
'upload_date': '20240620',
'duration': 1673,
'title': 'D24-6000-074-cetstud',
'timestamp': 1718902233,
'thumbnail': 'https://store.cloudycdn.services/tmsp00060/assets/media/788392/placeholder1718903938.jpg',
},
'params': {'format': 'bv'},
}]
_WEBPAGE_TESTS = [{
'url': 'https://www.tavaklase.lv/video/es-esmu-mina-um-2/',
@@ -63,6 +79,9 @@ class CloudyCDNIE(InfoExtractor):
formats, subtitles = [], {}
for m3u8_url in traverse_obj(data, ('source', 'sources', ..., 'src', {url_or_none})):
fmts, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, fatal=False)
for fmt in fmts:
if re.search(r'chunklist_b\d+_vo_', fmt['url']):
fmt['acodec'] = 'none'
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)

View File

@@ -1,146 +1,225 @@
import json
import re
from .common import InfoExtractor
from .turner import TurnerBaseIE
from ..utils import merge_dicts, try_call, url_basename
from ..utils import (
clean_html,
extract_attributes,
int_or_none,
merge_dicts,
parse_duration,
parse_iso8601,
parse_resolution,
try_call,
update_url,
url_or_none,
)
from ..utils.traversal import find_elements, traverse_obj
class CNNIE(TurnerBaseIE):
_VALID_URL = r'''(?x)https?://(?:(?P<sub_domain>edition|www|money)\.)?cnn\.com/(?:video/(?:data/.+?|\?)/)?videos?/
(?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))'''
class CNNIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:edition|www|money|cnnespanol)\.)?cnn\.com/(?!audio/)(?P<display_id>[^?#]+?)(?:[?#]|$|/index\.html)'
_TESTS = [{
'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn',
'md5': '3e6121ea48df7e2259fe73a0628605c4',
'url': 'https://www.cnn.com/2024/05/31/sport/video/jadon-sancho-borussia-dortmund-champions-league-exclusive-spt-intl',
'info_dict': {
'id': 'sports/2013/06/09/nadal-1-on-1.cnn',
'id': 'med0e97ad0d154f56e29aa96e57192a14226734b6b',
'display_id': '2024/05/31/sport/video/jadon-sancho-borussia-dortmund-champions-league-exclusive-spt-intl',
'ext': 'mp4',
'title': 'Nadal wins 8th French Open title',
'description': 'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.',
'duration': 135,
'upload_date': '20130609',
'upload_date': '20240531',
'description': 'md5:844bcdb0629e1877a7a466c913f4c19c',
'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/gettyimages-2151936122.jpg?c=original',
'duration': 373.0,
'timestamp': 1717148586,
'title': 'Borussia Dortmund star Jadon Sancho seeks Wembley redemption after 2020 Euros hurt',
'modified_date': '20240531',
'modified_timestamp': 1717150140,
},
'expected_warnings': ['Failed to download m3u8 information'],
}, {
'url': 'http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29',
'md5': 'b5cc60c60a3477d185af8f19a2a26f4e',
'url': 'https://edition.cnn.com/2024/06/11/politics/video/inmates-vote-jail-nevada-murray-dnt-ac360-digvid',
'info_dict': {
'id': 'us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology',
'id': 'me522945c4709b299e5cb8657900a7a21ad3b559f9',
'display_id': '2024/06/11/politics/video/inmates-vote-jail-nevada-murray-dnt-ac360-digvid',
'ext': 'mp4',
'title': "Student's epic speech stuns new freshmen",
'description': 'A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from "2001: A Space Odyssey."',
'upload_date': '20130821',
'description': 'md5:e0120fe5da9ad8259fd707c1cbb64a60',
'title': 'Heres how some inmates in closely divided state are now able to vote from jail',
'timestamp': 1718158269,
'upload_date': '20240612',
'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/still-20701554-13565-571-still.jpg?c=original',
'duration': 202.0,
'modified_date': '20240612',
'modified_timestamp': 1718158509,
},
'expected_warnings': ['Failed to download m3u8 information'],
}, {
'url': 'http://www.cnn.com/video/data/2.0/video/living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln.html',
'md5': 'f14d02ebd264df951feb2400e2c25a1b',
'url': 'https://edition.cnn.com/2024/06/11/style/king-charles-portrait-vandalized/index.html',
'info_dict': {
'id': 'living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln',
'id': 'mef5f52b9e1fe28b1ad192afcbc9206ae984894b68',
'display_id': '2024/06/11/style/king-charles-portrait-vandalized',
'ext': 'mp4',
'title': 'Nashville Ep. 1: Hand crafted skateboards',
'description': 'md5:e7223a503315c9f150acac52e76de086',
'upload_date': '20141222',
'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/still-20701257-8846-816-still.jpg?c=original',
'description': 'md5:19f78338ccec533db0fa8a4511012dae',
'title': 'Video shows King Charles\' portrait being vandalized by activists',
'timestamp': 1718113852,
'upload_date': '20240611',
'duration': 51.0,
'modified_timestamp': 1718116193,
'modified_date': '20240611',
},
'expected_warnings': ['Failed to download m3u8 information'],
}, {
'url': 'http://money.cnn.com/video/news/2016/08/19/netflix-stunning-stats.cnnmoney/index.html',
'md5': '52a515dc1b0f001cd82e4ceda32be9d1',
'url': 'https://edition.cnn.com/videos/media/2022/12/05/robin-meade-final-sign-off-broadcast-hln-mxp-contd-vpx.hln',
'info_dict': {
'id': '/video/news/2016/08/19/netflix-stunning-stats.cnnmoney',
'id': 'mefba13799201b084ea3b1d0f7ca820ae94d4bb5b2',
'display_id': 'videos/media/2022/12/05/robin-meade-final-sign-off-broadcast-hln-mxp-contd-vpx.hln',
'ext': 'mp4',
'title': '5 stunning stats about Netflix',
'description': 'Did you know that Netflix has more than 80 million members? Here are five facts about the online video distributor that you probably didn\'t know.',
'upload_date': '20160819',
'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/221205163510-robin-meade-sign-off.jpg?c=original',
'duration': 158.0,
'title': 'Robin Meade signs off after HLN\'s last broadcast',
'description': 'md5:cff3c62d18d2fbc6c5c75cb029b7353b',
'upload_date': '20221205',
'timestamp': 1670284296,
'modified_timestamp': 1670332404,
'modified_date': '20221206',
},
'params': {
# m3u8 download
'skip_download': True,
'params': {'format': 'direct'},
}, {
'url': 'https://cnnespanol.cnn.com/video/ataque-misil-israel-beirut-libano-octubre-trax',
'info_dict': {
'id': 'me484a43722642aa00627b812fe928f2e99c6e2997',
'ext': 'mp4',
'display_id': 'video/ataque-misil-israel-beirut-libano-octubre-trax',
'timestamp': 1729501452,
'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/ataqeubeirut-1.jpg?c=original',
'description': 'md5:256ee7137d161f776cda429654135e52',
'upload_date': '20241021',
'duration': 31.0,
'title': 'VIDEO | Israel lanza un nuevo ataque sobre Beirut',
'modified_date': '20241021',
'modified_timestamp': 1729501530,
},
}, {
'url': 'http://cnn.com/video/?/video/politics/2015/03/27/pkg-arizona-senator-church-attendance-mandatory.ktvk',
'only_matching': True,
}, {
'url': 'http://cnn.com/video/?/video/us/2015/04/06/dnt-baker-refuses-anti-gay-order.wkmg',
'only_matching': True,
}, {
'url': 'http://edition.cnn.com/videos/arts/2016/04/21/olympic-games-cultural-a-z-brazil.cnn',
'only_matching': True,
'url': 'https://edition.cnn.com/2024/10/16/politics/kamala-harris-fox-news-interview/index.html',
'info_dict': {
'id': '2024/10/16/politics/kamala-harris-fox-news-interview',
},
'playlist_count': 2,
'playlist': [{
'md5': '073ffab87b8bef97c9913e71cc18ef9e',
'info_dict': {
'id': 'me19d548fdd54df0924087039283128ef473ab397d',
'ext': 'mp4',
'title': '\'I\'m not finished\': Harris interview with Fox News gets heated',
'display_id': 'kamala-harris-fox-news-interview-ebof-digvid',
'description': 'md5:e7dd3d1a04df916062230b60ca419a0a',
'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/harris-20241016234916617.jpg?c=original',
'duration': 173.0,
'timestamp': 1729122182,
'upload_date': '20241016',
'modified_timestamp': 1729194706,
'modified_date': '20241017',
},
'params': {'format': 'direct'},
}, {
'md5': '11604ab4af83b650826753f1ccb8ecff',
'info_dict': {
'id': 'med04507d8ca3da827001f63d22af321ec29c7d97b',
'ext': 'mp4',
'title': '\'Wise\': Buttigieg on Harris\' handling of interview question about gender transition surgery',
'display_id': 'pete-buttigieg-harris-fox-newssrc-digvid',
'description': 'md5:602a8a7e853ed5e574acd3159428c98e',
'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/buttigieg-20241017040412074.jpg?c=original',
'duration': 145.0,
'timestamp': 1729137765,
'upload_date': '20241017',
'modified_timestamp': 1729138184,
'modified_date': '20241017',
},
'params': {'format': 'direct'},
}],
}]
_CONFIG = {
# http://edition.cnn.com/.element/apps/cvp/3.0/cfg/spider/cnn/expansion/config.xml
'edition': {
'data_src': 'http://edition.cnn.com/video/data/3.0/video/%s/index.xml',
'media_src': 'http://pmd.cdn.turner.com/cnn/big',
},
# http://money.cnn.com/.element/apps/cvp2/cfg/config.xml
'money': {
'data_src': 'http://money.cnn.com/video/data/4.0/video/%s.xml',
'media_src': 'http://ht3.cdn.turner.com/money/big',
},
}
def _extract_timestamp(self, video_data):
# TODO: fix timestamp extraction
return None
def _real_extract(self, url):
sub_domain, path, page_title = self._match_valid_url(url).groups()
if sub_domain not in ('money', 'edition'):
sub_domain = 'edition'
config = self._CONFIG[sub_domain]
return self._extract_cvp_info(
config['data_src'] % path, page_title, {
'default': {
'media_src': config['media_src'],
},
'f4m': {
'host': 'cnn-vh.akamaihd.net',
},
display_id = self._match_valid_url(url).group('display_id')
webpage = self._download_webpage(url, display_id)
app_id = traverse_obj(
self._search_json(r'window\.env\s*=', webpage, 'window env', display_id, default={}),
('TOP_AUTH_SERVICE_APP_ID', {str}))
entries = []
for player_data in traverse_obj(webpage, (
{find_elements(tag='div', attr='data-component-name', value='video-player', html=True)},
..., {extract_attributes}, all, lambda _, v: v['data-media-id'])):
media_id = player_data['data-media-id']
parent_uri = player_data.get('data-video-resource-parent-uri')
formats, subtitles = [], {}
video_data = {}
if parent_uri:
video_data = self._download_json(
'https://fave.api.cnn.io/v1/video', media_id, fatal=False,
query={
'id': media_id,
'stellarUri': parent_uri,
})
for direct_url in traverse_obj(video_data, ('files', ..., 'fileUri', {url_or_none})):
resolution, bitrate = None, None
if mobj := re.search(r'-(?P<res>\d+x\d+)_(?P<tbr>\d+)k\.mp4', direct_url):
resolution, bitrate = mobj.group('res', 'tbr')
formats.append({
'url': direct_url,
'format_id': 'direct',
'quality': 1,
'tbr': int_or_none(bitrate),
**parse_resolution(resolution),
})
for sub_data in traverse_obj(video_data, (
'closedCaptions', 'types', lambda _, v: url_or_none(v['track']['url']), 'track')):
subtitles.setdefault(sub_data.get('lang') or 'en', []).append({
'url': sub_data['url'],
'name': sub_data.get('label'),
})
if app_id:
media_data = self._download_json(
f'https://medium.ngtv.io/v2/media/{media_id}/desktop', media_id, fatal=False,
query={'appId': app_id})
m3u8_url = traverse_obj(media_data, (
'media', 'desktop', 'unprotected', 'unencrypted', 'url', {url_or_none}))
if m3u8_url:
fmts, subs = self._extract_m3u8_formats_and_subtitles(
m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
entries.append({
**traverse_obj(player_data, {
'title': ('data-headline', {clean_html}),
'description': ('data-description', {clean_html}),
'duration': ('data-duration', {parse_duration}),
'timestamp': ('data-publish-date', {parse_iso8601}),
'thumbnail': (
'data-poster-image-override', {json.loads}, 'big', 'uri', {url_or_none},
{update_url(query='c=original')}),
'display_id': 'data-video-slug',
}),
**traverse_obj(video_data, {
'timestamp': ('dateCreated', 'uts', {int_or_none(scale=1000)}),
'description': ('description', {clean_html}),
'title': ('headline', {str}),
'modified_timestamp': ('lastModified', 'uts', {int_or_none(scale=1000)}),
'duration': ('trt', {int_or_none}),
}),
'id': media_id,
'formats': formats,
'subtitles': subtitles,
})
if len(entries) == 1:
return {
**entries[0],
'display_id': display_id,
}
class CNNBlogsIE(InfoExtractor):
_VALID_URL = r'https?://[^\.]+\.blogs\.cnn\.com/.+'
_TEST = {
'url': 'http://reliablesources.blogs.cnn.com/2014/02/09/criminalizing-journalism/',
'md5': '3e56f97b0b6ffb4b79f4ea0749551084',
'info_dict': {
'id': 'bestoftv/2014/02/09/criminalizing-journalism.cnn',
'ext': 'mp4',
'title': 'Criminalizing journalism?',
'description': 'Glenn Greenwald responds to comments made this week on Capitol Hill that journalists could be criminal accessories.',
'upload_date': '20140209',
},
'expected_warnings': ['Failed to download m3u8 information'],
'add_ie': ['CNN'],
}
def _real_extract(self, url):
webpage = self._download_webpage(url, url_basename(url))
cnn_url = self._html_search_regex(r'data-url="(.+?)"', webpage, 'cnn url')
return self.url_result(cnn_url, CNNIE.ie_key())
class CNNArticleIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!videos?/)'
_TEST = {
'url': 'http://www.cnn.com/2014/12/21/politics/obama-north-koreas-hack-not-war-but-cyber-vandalism/',
'md5': '689034c2a3d9c6dc4aa72d65a81efd01',
'info_dict': {
'id': 'bestoftv/2014/12/21/ip-north-korea-obama.cnn',
'ext': 'mp4',
'title': 'Obama: Cyberattack not an act of war',
'description': 'md5:0a802a40d2376f60e6b04c8d5bcebc4b',
'upload_date': '20141221',
},
'expected_warnings': ['Failed to download m3u8 information'],
'add_ie': ['CNN'],
}
def _real_extract(self, url):
webpage = self._download_webpage(url, url_basename(url))
cnn_url = self._html_search_regex(r"video:\s*'([^']+)'", webpage, 'cnn url')
return self.url_result('http://cnn.com/video/?/video/' + cnn_url, CNNIE.ie_key())
return self.playlist_result(entries, display_id)
class CNNIndonesiaIE(InfoExtractor):

View File

@@ -35,6 +35,7 @@ from ..networking import HEADRequest, Request
from ..networking.exceptions import (
HTTPError,
IncompleteRead,
TransportError,
network_exceptions,
)
from ..networking.impersonate import ImpersonateTarget
@@ -46,6 +47,7 @@ from ..utils import (
FormatSorter,
GeoRestrictedError,
GeoUtils,
ISO639Utils,
LenientJSONDecoder,
Popen,
RegexNotFoundError,
@@ -234,7 +236,14 @@ class InfoExtractor:
'maybe' if the format may have DRM and has to be tested before download.
* extra_param_to_segment_url A query string to append to each
fragment's URL, or to update each existing query string
with. Only applied by the native HLS/DASH downloaders.
with. If it is an HLS stream with an AES-128 decryption key,
the query paramaters will be passed to the key URI as well,
unless there is an `extra_param_to_key_url` given,
or unless an external key URI is provided via `hls_aes`.
Only applied by the native HLS/DASH downloaders.
* extra_param_to_key_url A query string to append to the URL
of the format's HLS AES-128 decryption key.
Only applied by the native HLS downloader.
* hls_aes A dictionary of HLS AES-128 decryption information
used by the native HLS downloader to override the
values in the media playlist when an '#EXT-X-KEY' tag
@@ -325,7 +334,7 @@ class InfoExtractor:
like_count: Number of positive ratings of the video
dislike_count: Number of negative ratings of the video
repost_count: Number of reposts of the video
average_rating: Average rating give by users, the scale used depends on the webpage
average_rating: Average rating given by users, the scale used depends on the webpage
comment_count: Number of comments on the video
comments: A list of comments, each with one or more of the following
properties (all but one of text or html optional):
@@ -512,7 +521,7 @@ class InfoExtractor:
or _extract_from_webpage as necessary. While these are normally classmethods,
_extract_from_webpage is allowed to be an instance method.
_extract_from_webpage may raise self.StopExtraction() to stop further
_extract_from_webpage may raise self.StopExtraction to stop further
processing of the webpage and obtain exclusive rights to it. This is useful
when the extractor cannot reliably be matched using just the URL,
e.g. invidious/peertube instances
@@ -565,13 +574,13 @@ class InfoExtractor:
def _login_hint(self, method=NO_DEFAULT, netrc=None):
password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
cookies_hint = 'See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies'
return {
None: '',
'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
'any': f'Use --cookies, --cookies-from-browser, {password_hint}. {cookies_hint}',
'password': f'Use {password_hint}',
'cookies': (
'Use --cookies-from-browser or --cookies for the authentication. '
'See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies'),
'cookies': f'Use --cookies-from-browser or --cookies for the authentication. {cookies_hint}',
'session_cookies': f'Use --cookies for the authentication (--cookies-from-browser might not work). {cookies_hint}',
}[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
def __init__(self, downloader=None):
@@ -958,6 +967,9 @@ class InfoExtractor:
return False
content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal,
encoding=encoding, data=data)
if content is False:
assert not fatal
return False
return (content, urlh)
@staticmethod
@@ -1032,7 +1044,15 @@ class InfoExtractor:
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True,
prefix=None, encoding=None, data=None):
webpage_bytes = urlh.read()
try:
webpage_bytes = urlh.read()
except TransportError as err:
errmsg = f'{video_id}: Error reading response: {err.msg}'
if fatal:
raise ExtractorError(errmsg, cause=err)
self.report_warning(errmsg)
return False
if prefix is not None:
webpage_bytes = prefix + webpage_bytes
if self.get_param('dump_intermediate_pages', False):
@@ -1389,6 +1409,13 @@ class InfoExtractor:
return None, None
self.write_debug(f'Using netrc for {netrc_machine} authentication')
# compat: <=py3.10: netrc cannot parse tokens as empty strings, will return `""` instead
# Ref: https://github.com/yt-dlp/yt-dlp/issues/11413
# https://github.com/python/cpython/commit/15409c720be0503131713e3d3abc1acd0da07378
if sys.version_info < (3, 11):
return tuple(x if x != '""' else '' for x in info[::2])
return info[0], info[2]
def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
@@ -1551,7 +1578,9 @@ class InfoExtractor:
if default is not NO_DEFAULT:
fatal = False
for mobj in re.finditer(JSON_LD_RE, html):
json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
json_ld_item = self._parse_json(
mobj.group('json_ld'), video_id, fatal=fatal,
errnote=False if default is not NO_DEFAULT else None)
for json_ld in variadic(json_ld_item):
if isinstance(json_ld, dict):
yield json_ld
@@ -1691,7 +1720,7 @@ class InfoExtractor:
rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
if rating is not None:
info['average_rating'] = rating
if is_type(e, 'TVEpisode', 'Episode'):
if is_type(e, 'TVEpisode', 'Episode', 'PodcastEpisode'):
episode_name = unescapeHTML(e.get('name'))
info.update({
'episode': episode_name,
@@ -2058,7 +2087,7 @@ class InfoExtractor:
has_drm = HlsFD._has_drm(m3u8_doc)
def format_url(url):
return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
return url if re.match(r'https?://', url) else urllib.parse.urljoin(m3u8_url, url)
if self.get_param('hls_split_discontinuity', False):
def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
@@ -2215,6 +2244,11 @@ class InfoExtractor:
'quality': quality,
'has_drm': has_drm,
}
# YouTube-specific
if yt_audio_content_id := last_stream_inf.get('YT-EXT-AUDIO-CONTENT-ID'):
f['language'] = yt_audio_content_id.split('.')[0]
resolution = last_stream_inf.get('RESOLUTION')
if resolution:
mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
@@ -2788,11 +2822,11 @@ class InfoExtractor:
base_url_e = element.find(_add_ns('BaseURL'))
if try_call(lambda: base_url_e.text) is not None:
base_url = base_url_e.text + base_url
if re.match(r'^https?://', base_url):
if re.match(r'https?://', base_url):
break
if mpd_base_url and base_url.startswith('/'):
base_url = urllib.parse.urljoin(mpd_base_url, base_url)
elif mpd_base_url and not re.match(r'^https?://', base_url):
elif mpd_base_url and not re.match(r'https?://', base_url):
if not mpd_base_url.endswith('/'):
mpd_base_url += '/'
base_url = mpd_base_url + base_url
@@ -2882,7 +2916,7 @@ class InfoExtractor:
}
def location_key(location):
return 'url' if re.match(r'^https?://', location) else 'path'
return 'url' if re.match(r'https?://', location) else 'path'
if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
@@ -3047,7 +3081,11 @@ class InfoExtractor:
url_pattern = stream.attrib['Url']
stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
stream_name = stream.get('Name')
stream_language = stream.get('Language', 'und')
# IsmFD expects ISO 639 Set 2 language codes (3-character length)
# See: https://github.com/yt-dlp/yt-dlp/issues/11356
stream_language = stream.get('Language') or 'und'
if len(stream_language) != 3:
stream_language = ISO639Utils.short2long(stream_language) or 'und'
for track in stream.findall('QualityLevel'):
KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
@@ -3138,7 +3176,7 @@ class InfoExtractor:
})
return formats, subtitles
def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None, _headers=None):
def absolute_url(item_url):
return urljoin(base_url, item_url)
@@ -3162,11 +3200,11 @@ class InfoExtractor:
formats = self._extract_m3u8_formats(
full_url, video_id, ext='mp4',
entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
preference=preference, quality=quality, fatal=False)
preference=preference, quality=quality, fatal=False, headers=_headers)
elif ext == 'mpd':
is_plain_url = False
formats = self._extract_mpd_formats(
full_url, video_id, mpd_id=mpd_id, fatal=False)
full_url, video_id, mpd_id=mpd_id, fatal=False, headers=_headers)
else:
is_plain_url = True
formats = [{
@@ -3260,6 +3298,8 @@ class InfoExtractor:
})
for f in media_info['formats']:
f.setdefault('http_headers', {})['Referer'] = base_url
if _headers:
f['http_headers'].update(_headers)
if media_info['formats'] or media_info['subtitles']:
entries.append(media_info)
return entries
@@ -3475,7 +3515,7 @@ class InfoExtractor:
continue
urls.add(source_url)
source_type = source.get('type') or ''
ext = mimetype2ext(source_type) or determine_ext(source_url)
ext = determine_ext(source_url, default_ext=mimetype2ext(source_type))
if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
formats.extend(self._extract_m3u8_formats(
source_url, video_id, 'mp4', entry_protocol='m3u8_native',

View File

@@ -12,6 +12,7 @@ from ..utils import (
parse_iso8601,
strip_or_none,
try_get,
urljoin,
)
@@ -112,8 +113,7 @@ class CondeNastIE(InfoExtractor):
m_paths = re.finditer(
r'(?s)<p class="cne-thumb-title">.*?<a href="(/watch/.+?)["\?]', webpage)
paths = orderedSet(m.group(1) for m in m_paths)
build_url = lambda path: urllib.parse.urljoin(base_url, path)
entries = [self.url_result(build_url(path), 'CondeNast') for path in paths]
entries = [self.url_result(urljoin(base_url, path), 'CondeNast') for path in paths]
return self.playlist_result(entries, playlist_title=title)
def _extract_video_params(self, webpage, display_id):

View File

@@ -456,7 +456,7 @@ class CrunchyrollBetaIE(CrunchyrollCmsBaseIE):
}),
}),
**traverse_obj(metadata, {
'duration': ('duration_ms', {lambda x: float_or_none(x, 1000)}),
'duration': ('duration_ms', {float_or_none(scale=1000)}),
'timestamp': ('upload_date', {parse_iso8601}),
'series': ('series_title', {str}),
'series_id': ('series_id', {str}),
@@ -484,7 +484,7 @@ class CrunchyrollBetaIE(CrunchyrollCmsBaseIE):
}),
}),
**traverse_obj(metadata, {
'duration': ('duration_ms', {lambda x: float_or_none(x, 1000)}),
'duration': ('duration_ms', {float_or_none(scale=1000)}),
'age_limit': ('maturity_ratings', -1, {parse_age_limit}),
}),
}

View File

@@ -6,12 +6,37 @@ from ..utils import (
parse_iso8601,
smuggle_url,
str_or_none,
update_url_query,
)
class CWTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?cw(?:tv(?:pr)?|seed)\.com/(?:shows/)?(?:[^/]+/)+[^?]*\?.*\b(?:play|watch)=(?P<id>[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})'
_TESTS = [{
'url': 'https://www.cwtv.com/shows/all-american-homecoming/ready-or-not/?play=d848488f-f62a-40fd-af1f-6440b1821aab',
'info_dict': {
'id': 'd848488f-f62a-40fd-af1f-6440b1821aab',
'ext': 'mp4',
'title': 'Ready Or Not',
'description': 'Simone is concerned about changes taking place at Bringston; JR makes a decision about his future.',
'thumbnail': r're:^https?://.*\.jpe?g$',
'duration': 2547,
'timestamp': 1720519200,
'uploader': 'CWTV',
'chapters': 'count:6',
'series': 'All American: Homecoming',
'season_number': 3,
'episode_number': 1,
'age_limit': 0,
'upload_date': '20240709',
'season': 'Season 3',
'episode': 'Episode 1',
},
'params': {
# m3u8 download
'skip_download': True,
},
}, {
'url': 'http://cwtv.com/shows/arrow/legends-of-yesterday/?play=6b15e985-9345-4f60-baf8-56e96be57c63',
'info_dict': {
'id': '6b15e985-9345-4f60-baf8-56e96be57c63',
@@ -69,13 +94,14 @@ class CWTVIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
data = self._download_json(
'http://images.cwtv.com/feed/mobileapp/video-meta/apiversion_8/guid_' + video_id,
video_id)
f'https://images.cwtv.com/feed/mobileapp/video-meta/apiversion_12/guid_{video_id}', video_id)
if data.get('result') != 'ok':
raise ExtractorError(data['msg'], expected=True)
video_data = data['video']
title = video_data['title']
mpx_url = video_data.get('mpx_url') or f'http://link.theplatform.com/s/cwtv/media/guid/2703454149/{video_id}?formats=M3U'
mpx_url = update_url_query(
video_data.get('mpx_url') or f'https://link.theplatform.com/s/cwtv/media/guid/2703454149/{video_id}',
{'formats': 'M3U+none'})
season = str_or_none(video_data.get('season'))
episode = str_or_none(video_data.get('episode'))

View File

@@ -2,6 +2,7 @@ from .common import InfoExtractor
from ..utils import (
determine_protocol,
int_or_none,
join_nonempty,
try_get,
unescapeHTML,
)
@@ -52,7 +53,7 @@ class DailyMailIE(InfoExtractor):
is_hls = container == 'M2TS'
protocol = 'm3u8_native' if is_hls else determine_protocol({'url': rendition_url})
formats.append({
'format_id': ('hls' if is_hls else protocol) + (f'-{tbr}' if tbr else ''),
'format_id': join_nonempty('hls' if is_hls else protocol, tbr),
'url': rendition_url,
'width': int_or_none(rendition.get('frameWidth')),
'height': int_or_none(rendition.get('frameHeight')),

View File

@@ -10,11 +10,14 @@ from ..utils import (
OnDemandPagedList,
age_restricted,
clean_html,
extract_attributes,
int_or_none,
traverse_obj,
try_get,
unescapeHTML,
unsmuggle_url,
update_url,
url_or_none,
urlencode_postdata,
)
@@ -98,12 +101,20 @@ class DailymotionBaseInfoExtractor(InfoExtractor):
class DailymotionIE(DailymotionBaseInfoExtractor):
_VALID_URL = r'''(?ix)
https?://
(?:
dai\.ly/|
(?:
(?:(?:www|touch|geo)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:(?:embed|swf|\#)/)|player(?:/\w+)?\.html\?)?video|swf)|
(?:www\.)?lequipe\.fr/video
(?:(?:www|touch|geo)\.)?dailymotion\.[a-z]{2,3}|
(?:www\.)?lequipe\.fr
)/
(?:
swf/(?!video)|
(?:(?:crawler|embed|swf)/)?video/|
player(?:/[\da-z]+)?\.html\?(?:video|(?P<is_playlist>playlist))=
)
[/=](?P<id>[^/?_&]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))?
'''
)
(?P<id>[^/?_&#]+)(?:[\w-]*\?playlist=(?P<playlist_id>x[0-9a-z]+))?
'''
IE_NAME = 'dailymotion'
_EMBED_REGEX = [r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1']
_TESTS = [{
@@ -123,7 +134,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
'view_count': int,
'like_count': int,
'tags': ['hollywood', 'celeb', 'celebrity', 'movies', 'red carpet'],
'thumbnail': r're:https://(?:s[12]\.)dmcdn\.net/v/K456B1aXqIx58LKWQ/x1080',
'thumbnail': r're:https://(?:s[12]\.)dmcdn\.net/v/K456B1cmt4ZcZ9KiM/x1080',
},
}, {
'url': 'https://geo.dailymotion.com/player.html?video=x89eyek&mute=true',
@@ -142,7 +153,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
'view_count': int,
'like_count': int,
'tags': ['en_quete_d_esprit'],
'thumbnail': r're:https://(?:s[12]\.)dmcdn\.net/v/Tncwi1YNg_RUl7ueu/x1080',
'thumbnail': r're:https://(?:s[12]\.)dmcdn\.net/v/Tncwi1clTH6StrxMP/x1080',
},
}, {
'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames',
@@ -217,6 +228,66 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
}, {
'url': 'https://geo.dailymotion.com/player/xakln.html?video=x8mjju4&customConfig%5BcustomParams%5D=%2Ffr-fr%2Ftennis%2Fwimbledon-mens-singles%2Farticles-video',
'only_matching': True,
}, { # playlist-only
'url': 'https://geo.dailymotion.com/player/xf7zn.html?playlist=x7wdsj',
'only_matching': True,
}, {
'url': 'https://geo.dailymotion.com/player/xmyye.html?video=x93blhi',
'only_matching': True,
}, {
'url': 'https://www.dailymotion.com/crawler/video/x8u4owg',
'only_matching': True,
}, {
'url': 'https://www.dailymotion.com/embed/video/x8u4owg',
'only_matching': True,
}, {
'url': 'https://dai.ly/x94cnnk',
'only_matching': True,
}]
_WEBPAGE_TESTS = [{
# https://geo.dailymotion.com/player/xmyye.html?video=x93blhi
'url': 'https://www.financialounge.com/video/2024/08/01/borse-europee-in-rosso-dopo-la-fed-a-milano-volano-mediobanca-e-tim-edizione-del-1-agosto/',
'info_dict': {
'id': 'x93blhi',
'ext': 'mp4',
'title': 'OnAir - 01/08/24',
'description': '',
'duration': 217,
'timestamp': 1722505658,
'upload_date': '20240801',
'uploader': 'Financialounge',
'uploader_id': 'x2vtgmm',
'age_limit': 0,
'tags': [],
'view_count': int,
'like_count': int,
},
}, {
# https://geo.dailymotion.com/player/xf7zn.html?playlist=x7wdsj
'url': 'https://www.cycleworld.com/blogs/ask-kevin/ducati-continues-to-evolve-with-v4/',
'info_dict': {
'id': 'x7wdsj',
},
'playlist_mincount': 50,
}, {
# https://www.dailymotion.com/crawler/video/x8u4owg
'url': 'https://www.leparisien.fr/environnement/video-le-veloto-la-voiture-a-pedales-qui-aimerait-se-faire-une-place-sur-les-routes-09-03-2024-KCYMCPM4WFHJXMSKBUI66UNFPU.php',
'info_dict': {
'id': 'x8u4owg',
'ext': 'mp4',
'like_count': int,
'uploader': 'Le Parisien',
'thumbnail': 'https://www.leparisien.fr/resizer/ho_GwveeYftNkLwg_cEta--5Bv4=/1200x675/cloudfront-eu-central-1.images.arcpublishing.com/leparisien/BFXJNEBN75EUNHGYJLORUC3TX4.jpg',
'upload_date': '20240309',
'view_count': int,
'timestamp': 1709997866,
'age_limit': 0,
'uploader_id': 'x32f7b',
'title': 'VIDÉO. Le «\xa0véloto\xa0», la voiture à pédales qui aimerait se faire une place sur les routes',
'duration': 428.0,
'description': 'À bord du « véloto », lalternative à la voiture pour la campagne',
'tags': ['biclou', 'vélo', 'véloto', 'campagne', 'voiture', 'environnement', 'véhicules intermédiaires'],
},
}]
_GEO_BYPASS = False
_COMMON_MEDIA_FIELDS = '''description
@@ -232,16 +303,35 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
for mobj in re.finditer(
r'(?s)DM\.player\([^,]+,\s*{.*?video[\'"]?\s*:\s*["\']?(?P<id>[0-9a-zA-Z]+).+?}\s*\);', webpage):
yield from 'https://www.dailymotion.com/embed/video/' + mobj.group('id')
for mobj in re.finditer(
r'(?s)<script [^>]*\bsrc=(["\'])(?:https?:)?//[\w-]+\.dailymotion\.com/player/(?:(?!\1).)+\1[^>]*>', webpage):
attrs = extract_attributes(mobj.group(0))
player_url = url_or_none(attrs.get('src'))
if not player_url:
continue
player_url = player_url.replace('.js', '.html')
if player_url.startswith('//'):
player_url = f'https:{player_url}'
if video_id := attrs.get('data-video'):
query_string = f'video={video_id}'
elif playlist_id := attrs.get('data-playlist'):
query_string = f'playlist={playlist_id}'
else:
continue
yield update_url(player_url, query=query_string)
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url)
video_id, playlist_id = self._match_valid_url(url).groups()
video_id, is_playlist, playlist_id = self._match_valid_url(url).group('id', 'is_playlist', 'playlist_id')
if playlist_id:
if self._yes_playlist(playlist_id, video_id):
return self.url_result(
'http://www.dailymotion.com/playlist/' + playlist_id,
'DailymotionPlaylist', playlist_id)
if is_playlist: # We matched the playlist query param as video_id
playlist_id = video_id
video_id = None
if self._yes_playlist(playlist_id, video_id):
return self.url_result(
f'http://www.dailymotion.com/playlist/{playlist_id}',
'DailymotionPlaylist', playlist_id)
password = self.get_param('videopassword')
media = self._call_api(
@@ -282,6 +372,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
title = metadata['title']
is_live = media.get('isOnAir')
formats = []
subtitles = {}
for quality, media_list in metadata['qualities'].items():
for m in media_list:
media_url = m.get('url')
@@ -289,8 +381,10 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
if not media_url or media_type == 'application/vnd.lumberjack.manifest':
continue
if media_type == 'application/x-mpegURL':
formats.extend(self._extract_m3u8_formats(
media_url, video_id, 'mp4', live=is_live, m3u8_id='hls', fatal=False))
fmt, subs = self._extract_m3u8_formats_and_subtitles(
media_url, video_id, 'mp4', live=is_live, m3u8_id='hls', fatal=False)
formats.extend(fmt)
self._merge_subtitles(subs, target=subtitles)
else:
f = {
'url': media_url,
@@ -310,20 +404,18 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
if not f.get('fps') and f['format_id'].endswith('@60'):
f['fps'] = 60
subtitles = {}
subtitles_data = try_get(metadata, lambda x: x['subtitles']['data'], dict) or {}
for subtitle_lang, subtitle in subtitles_data.items():
subtitles[subtitle_lang] = [{
'url': subtitle_url,
} for subtitle_url in subtitle.get('urls', [])]
thumbnails = []
for height, poster_url in metadata.get('posters', {}).items():
thumbnails.append({
'height': int_or_none(height),
'id': height,
'url': poster_url,
})
thumbnails = traverse_obj(metadata, (
('posters', 'thumbnails'), {dict.items}, lambda _, v: url_or_none(v[1]), {
'height': (0, {int_or_none}),
'id': (0, {str}),
'url': 1,
}))
owner = metadata.get('owner') or {}
stats = media.get('stats') or {}
@@ -447,7 +539,7 @@ class DailymotionSearchIE(DailymotionPlaylistBaseIE):
class DailymotionUserIE(DailymotionPlaylistBaseIE):
IE_NAME = 'dailymotion:user'
_VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist|search)/)(?:(?:old/)?user/)?(?P<id>[^/?#]+)'
_VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist|search|crawler)/)(?:(?:old/)?user/)?(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'https://www.dailymotion.com/user/nqtv',
'info_dict': {

View File

@@ -40,7 +40,7 @@ class DangalPlayBaseIE(InfoExtractor):
'id': ('content_id', {str}),
'title': ('display_title', {str}),
'episode': ('title', {str}),
'series': ('show_name', {str}, {lambda x: x or None}),
'series': ('show_name', {str}, filter),
'series_id': ('catalog_id', {str}),
'duration': ('duration', {int_or_none}),
'release_timestamp': ('release_date_uts', {int_or_none}),

View File

@@ -1,17 +1,20 @@
from .common import InfoExtractor
from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
parse_resolution,
traverse_obj,
parse_codecs,
try_get,
url_or_none,
urlencode_postdata,
)
from ..utils.traversal import traverse_obj
class DigitalConcertHallIE(InfoExtractor):
IE_DESC = 'DigitalConcertHall extractor'
_VALID_URL = r'https?://(?:www\.)?digitalconcerthall\.com/(?P<language>[a-z]+)/(?P<type>film|concert)/(?P<id>[0-9]+)'
_VALID_URL = r'https?://(?:www\.)?digitalconcerthall\.com/(?P<language>[a-z]+)/(?P<type>film|concert|work)/(?P<id>[0-9]+)-?(?P<part>[0-9]+)?'
_OAUTH_URL = 'https://api.digitalconcerthall.com/v2/oauth2/token'
_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15'
_ACCESS_TOKEN = None
_NETRC_MACHINE = 'digitalconcerthall'
_TESTS = [{
@@ -26,7 +29,8 @@ class DigitalConcertHallIE(InfoExtractor):
'upload_date': '20210624',
'timestamp': 1624548600,
'duration': 2798,
'album_artist': 'Members of the Berliner Philharmoniker / Simon Rössler',
'album_artists': ['Members of the Berliner Philharmoniker', 'Simon Rössler'],
'composers': ['Kurt Weill'],
},
'params': {'skip_download': 'm3u8'},
}, {
@@ -34,8 +38,9 @@ class DigitalConcertHallIE(InfoExtractor):
'url': 'https://www.digitalconcerthall.com/en/concert/53785',
'info_dict': {
'id': '53785',
'album_artist': 'Berliner Philharmoniker / Kirill Petrenko',
'album_artists': ['Berliner Philharmoniker', 'Kirill Petrenko'],
'title': 'Kirill Petrenko conducts Mendelssohn and Shostakovich',
'thumbnail': r're:^https?://images.digitalconcerthall.com/cms/thumbnails.*\.jpg$',
},
'params': {'skip_download': 'm3u8'},
'playlist_count': 3,
@@ -49,39 +54,59 @@ class DigitalConcertHallIE(InfoExtractor):
'thumbnail': r're:^https?://images.digitalconcerthall.com/cms/thumbnails.*\.jpg$',
'upload_date': '20220714',
'timestamp': 1657785600,
'album_artist': 'Frank Peter Zimmermann / Benedikt von Bernstorff / Jakob von Bernstorff',
'album_artists': ['Frank Peter Zimmermann', 'Benedikt von Bernstorff', 'Jakob von Bernstorff'],
},
'params': {'skip_download': 'm3u8'},
}, {
'note': 'Concert with several works and an interview',
'url': 'https://www.digitalconcerthall.com/en/work/53785-1',
'info_dict': {
'id': '53785',
'album_artists': ['Berliner Philharmoniker', 'Kirill Petrenko'],
'title': 'Kirill Petrenko conducts Mendelssohn and Shostakovich',
'thumbnail': r're:^https?://images.digitalconcerthall.com/cms/thumbnails.*\.jpg$',
},
'params': {'skip_download': 'm3u8'},
'playlist_count': 1,
}]
def _perform_login(self, username, password):
token_response = self._download_json(
login_token = self._download_json(
self._OAUTH_URL,
None, 'Obtaining token', errnote='Unable to obtain token', data=urlencode_postdata({
'affiliate': 'none',
'grant_type': 'device',
'device_vendor': 'unknown',
# device_model 'Safari' gets split streams of 4K/HEVC video and lossless/FLAC audio
'device_model': 'unknown' if self._configuration_arg('prefer_combined_hls') else 'Safari',
'app_id': 'dch.webapp',
'app_version': '1.0.0',
'app_distributor': 'berlinphil',
'app_version': '1.84.0',
'client_secret': '2ySLN+2Fwb',
}), headers={
'Content-Type': 'application/x-www-form-urlencoded',
})
self._ACCESS_TOKEN = token_response['access_token']
'Accept': 'application/json',
'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
'User-Agent': self._USER_AGENT,
})['access_token']
try:
self._download_json(
login_response = self._download_json(
self._OAUTH_URL,
None, note='Logging in', errnote='Unable to login', data=urlencode_postdata({
'grant_type': 'password',
'username': username,
'password': password,
}), headers={
'Content-Type': 'application/x-www-form-urlencoded',
'Accept': 'application/json',
'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
'Referer': 'https://www.digitalconcerthall.com',
'Authorization': f'Bearer {self._ACCESS_TOKEN}',
'Authorization': f'Bearer {login_token}',
'User-Agent': self._USER_AGENT,
})
except ExtractorError:
self.raise_login_required(msg='Login info incorrect')
except ExtractorError as error:
if isinstance(error.cause, HTTPError) and error.cause.status == 401:
raise ExtractorError('Invalid username or password', expected=True)
raise
self._ACCESS_TOKEN = login_response['access_token']
def _real_initialize(self):
if not self._ACCESS_TOKEN:
@@ -95,17 +120,20 @@ class DigitalConcertHallIE(InfoExtractor):
'Accept': 'application/json',
'Authorization': f'Bearer {self._ACCESS_TOKEN}',
'Accept-Language': language,
'User-Agent': self._USER_AGENT,
})
m3u8_url = traverse_obj(
stream_info, ('channel', lambda k, _: k.startswith('vod_mixed'), 'stream', 0, 'url'), get_all=False)
formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', 'm3u8_native', fatal=False)
formats = []
for m3u8_url in traverse_obj(stream_info, ('channel', ..., 'stream', ..., 'url', {url_or_none})):
formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
for fmt in formats:
if fmt.get('format_note') and fmt.get('vcodec') == 'none':
fmt.update(parse_codecs(fmt['format_note']))
yield {
'id': video_id,
'title': item.get('title'),
'composer': item.get('name_composer'),
'url': m3u8_url,
'formats': formats,
'duration': item.get('duration_total'),
'timestamp': traverse_obj(item, ('date', 'published')),
@@ -119,31 +147,34 @@ class DigitalConcertHallIE(InfoExtractor):
}
def _real_extract(self, url):
language, type_, video_id = self._match_valid_url(url).group('language', 'type', 'id')
language, type_, video_id, part = self._match_valid_url(url).group('language', 'type', 'id', 'part')
if not language:
language = 'en'
thumbnail_url = self._html_search_regex(
r'(https?://images\.digitalconcerthall\.com/cms/thumbnails/.*\.jpg)',
self._download_webpage(url, video_id), 'thumbnail')
thumbnails = [{
'url': thumbnail_url,
**parse_resolution(thumbnail_url),
}]
api_type = 'concert' if type_ == 'work' else type_
vid_info = self._download_json(
f'https://api.digitalconcerthall.com/v2/{type_}/{video_id}', video_id, headers={
f'https://api.digitalconcerthall.com/v2/{api_type}/{video_id}', video_id, headers={
'Accept': 'application/json',
'Accept-Language': language,
'User-Agent': self._USER_AGENT,
'Authorization': f'Bearer {self._ACCESS_TOKEN}',
})
album_artist = ' / '.join(traverse_obj(vid_info, ('_links', 'artist', ..., 'name')) or '')
videos = [vid_info] if type_ == 'film' else traverse_obj(vid_info, ('_embedded', ..., ...))
if type_ == 'work':
videos = [videos[int(part) - 1]]
album_artists = traverse_obj(vid_info, ('_links', 'artist', ..., 'name', {str}))
thumbnail = traverse_obj(vid_info, (
'image', ..., {self._proto_relative_url}, {url_or_none},
{lambda x: x.format(width=0, height=0)}, any)) # NB: 0x0 is the original size
return {
'_type': 'playlist',
'id': video_id,
'title': vid_info.get('title'),
'entries': self._entries(videos, language, thumbnails=thumbnails, album_artist=album_artist, type_=type_),
'thumbnails': thumbnails,
'album_artist': album_artist,
'entries': self._entries(
videos, language, type_, thumbnail=thumbnail, album_artists=album_artists),
'thumbnail': thumbnail,
'album_artists': album_artists,
}

View File

@@ -1,115 +0,0 @@
import random
import string
import urllib.parse
from .discoverygo import DiscoveryGoBaseIE
from ..networking.exceptions import HTTPError
from ..utils import ExtractorError
class DiscoveryIE(DiscoveryGoBaseIE):
_VALID_URL = r'''(?x)https?://
(?P<site>
go\.discovery|
www\.
(?:
investigationdiscovery|
discoverylife|
animalplanet|
ahctv|
destinationamerica|
sciencechannel|
tlc
)|
watch\.
(?:
hgtv|
foodnetwork|
travelchannel|
diynetwork|
cookingchanneltv|
motortrend
)
)\.com/tv-shows/(?P<show_slug>[^/]+)/(?:video|full-episode)s/(?P<id>[^./?#]+)'''
_TESTS = [{
'url': 'https://go.discovery.com/tv-shows/cash-cab/videos/riding-with-matthew-perry',
'info_dict': {
'id': '5a2f35ce6b66d17a5026e29e',
'ext': 'mp4',
'title': 'Riding with Matthew Perry',
'description': 'md5:a34333153e79bc4526019a5129e7f878',
'duration': 84,
},
'params': {
'skip_download': True, # requires ffmpeg
},
}, {
'url': 'https://www.investigationdiscovery.com/tv-shows/final-vision/full-episodes/final-vision',
'only_matching': True,
}, {
'url': 'https://go.discovery.com/tv-shows/alaskan-bush-people/videos/follow-your-own-road',
'only_matching': True,
}, {
# using `show_slug` is important to get the correct video data
'url': 'https://www.sciencechannel.com/tv-shows/mythbusters-on-science/full-episodes/christmas-special',
'only_matching': True,
}]
_GEO_COUNTRIES = ['US']
_GEO_BYPASS = False
_API_BASE_URL = 'https://api.discovery.com/v1/'
def _real_extract(self, url):
site, show_slug, display_id = self._match_valid_url(url).groups()
access_token = None
cookies = self._get_cookies(url)
# prefer Affiliate Auth Token over Anonymous Auth Token
auth_storage_cookie = cookies.get('eosAf') or cookies.get('eosAn')
if auth_storage_cookie and auth_storage_cookie.value:
auth_storage = self._parse_json(urllib.parse.unquote(
urllib.parse.unquote(auth_storage_cookie.value)),
display_id, fatal=False) or {}
access_token = auth_storage.get('a') or auth_storage.get('access_token')
if not access_token:
access_token = self._download_json(
f'https://{site}.com/anonymous', display_id,
'Downloading token JSON metadata', query={
'authRel': 'authorization',
'client_id': '3020a40c2356a645b4b4',
'nonce': ''.join(random.choices(string.ascii_letters, k=32)),
'redirectUri': 'https://www.discovery.com/',
})['access_token']
headers = self.geo_verification_headers()
headers['Authorization'] = 'Bearer ' + access_token
try:
video = self._download_json(
self._API_BASE_URL + 'content/videos',
display_id, 'Downloading content JSON metadata',
headers=headers, query={
'embed': 'show.name',
'fields': 'authenticated,description.detailed,duration,episodeNumber,id,name,parental.rating,season.number,show,tags',
'slug': display_id,
'show_slug': show_slug,
})[0]
video_id = video['id']
stream = self._download_json(
self._API_BASE_URL + 'streaming/video/' + video_id,
display_id, 'Downloading streaming JSON metadata', headers=headers)
except ExtractorError as e:
if isinstance(e.cause, HTTPError) and e.cause.status in (401, 403):
e_description = self._parse_json(
e.cause.response.read().decode(), display_id)['description']
if 'resource not available for country' in e_description:
self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
if 'Authorized Networks' in e_description:
raise ExtractorError(
'This video is only available via cable service provider subscription that'
' is not currently supported. You may want to use --cookies.', expected=True)
raise ExtractorError(e_description)
raise
return self._extract_video_info(video, stream, display_id)

View File

@@ -1,171 +0,0 @@
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
determine_ext,
extract_attributes,
int_or_none,
parse_age_limit,
remove_end,
unescapeHTML,
url_or_none,
)
class DiscoveryGoBaseIE(InfoExtractor):
_VALID_URL_TEMPLATE = r'''(?x)https?://(?:www\.)?(?:
discovery|
investigationdiscovery|
discoverylife|
animalplanet|
ahctv|
destinationamerica|
sciencechannel|
tlc|
velocitychannel
)go\.com/%s(?P<id>[^/?#&]+)'''
def _extract_video_info(self, video, stream, display_id):
title = video['name']
if not stream:
if video.get('authenticated') is True:
raise ExtractorError(
'This video is only available via cable service provider subscription that'
' is not currently supported. You may want to use --cookies.', expected=True)
else:
raise ExtractorError('Unable to find stream')
STREAM_URL_SUFFIX = 'streamUrl'
formats = []
for stream_kind in ('', 'hds'):
suffix = STREAM_URL_SUFFIX.capitalize() if stream_kind else STREAM_URL_SUFFIX
stream_url = stream.get(f'{stream_kind}{suffix}')
if not stream_url:
continue
if stream_kind == '':
formats.extend(self._extract_m3u8_formats(
stream_url, display_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls', fatal=False))
elif stream_kind == 'hds':
formats.extend(self._extract_f4m_formats(
stream_url, display_id, f4m_id=stream_kind, fatal=False))
video_id = video.get('id') or display_id
description = video.get('description', {}).get('detailed')
duration = int_or_none(video.get('duration'))
series = video.get('show', {}).get('name')
season_number = int_or_none(video.get('season', {}).get('number'))
episode_number = int_or_none(video.get('episodeNumber'))
tags = video.get('tags')
age_limit = parse_age_limit(video.get('parental', {}).get('rating'))
subtitles = {}
captions = stream.get('captions')
if isinstance(captions, list):
for caption in captions:
subtitle_url = url_or_none(caption.get('fileUrl'))
if not subtitle_url or not subtitle_url.startswith('http'):
continue
lang = caption.get('fileLang', 'en')
ext = determine_ext(subtitle_url)
subtitles.setdefault(lang, []).append({
'url': subtitle_url,
'ext': 'ttml' if ext == 'xml' else ext,
})
return {
'id': video_id,
'display_id': display_id,
'title': title,
'description': description,
'duration': duration,
'series': series,
'season_number': season_number,
'episode_number': episode_number,
'tags': tags,
'age_limit': age_limit,
'formats': formats,
'subtitles': subtitles,
}
class DiscoveryGoIE(DiscoveryGoBaseIE):
_VALID_URL = DiscoveryGoBaseIE._VALID_URL_TEMPLATE % r'(?:[^/]+/)+'
_GEO_COUNTRIES = ['US']
_TEST = {
'url': 'https://www.discoverygo.com/bering-sea-gold/reaper-madness/',
'info_dict': {
'id': '58c167d86b66d12f2addeb01',
'ext': 'mp4',
'title': 'Reaper Madness',
'description': 'md5:09f2c625c99afb8946ed4fb7865f6e78',
'duration': 2519,
'series': 'Bering Sea Gold',
'season_number': 8,
'episode_number': 6,
'age_limit': 14,
},
}
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
container = extract_attributes(
self._search_regex(
r'(<div[^>]+class=["\']video-player-container[^>]+>)',
webpage, 'video container'))
video = self._parse_json(
container.get('data-video') or container.get('data-json'),
display_id)
stream = video.get('stream')
return self._extract_video_info(video, stream, display_id)
class DiscoveryGoPlaylistIE(DiscoveryGoBaseIE):
_VALID_URL = DiscoveryGoBaseIE._VALID_URL_TEMPLATE % ''
_TEST = {
'url': 'https://www.discoverygo.com/bering-sea-gold/',
'info_dict': {
'id': 'bering-sea-gold',
'title': 'Bering Sea Gold',
'description': 'md5:cc5c6489835949043c0cc3ad66c2fa0e',
},
'playlist_mincount': 6,
}
@classmethod
def suitable(cls, url):
return False if DiscoveryGoIE.suitable(url) else super().suitable(url)
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
entries = []
for mobj in re.finditer(r'data-json=(["\'])(?P<json>{.+?})\1', webpage):
data = self._parse_json(
mobj.group('json'), display_id,
transform_source=unescapeHTML, fatal=False)
if not isinstance(data, dict) or data.get('type') != 'episode':
continue
episode_url = data.get('socialUrl')
if not episode_url:
continue
entries.append(self.url_result(
episode_url, ie=DiscoveryGoIE.ie_key(),
video_id=data.get('id')))
return self.playlist_result(
entries, display_id,
remove_end(self._og_search_title(
webpage, fatal=False), ' | Discovery GO'),
self._og_search_description(webpage))

View File

@@ -24,8 +24,9 @@ from ..utils import (
class DouyuBaseIE(InfoExtractor):
def _download_cryptojs_md5(self, video_id):
for url in [
# XXX: Do NOT use cdn.bootcdn.net; ref: https://sansec.io/research/polyfill-supply-chain-attack
'https://cdnjs.cloudflare.com/ajax/libs/crypto-js/3.1.2/rollups/md5.js',
'https://cdn.bootcdn.net/ajax/libs/crypto-js/3.1.2/rollups/md5.js',
'https://unpkg.com/cryptojslib@3.1.2/rollups/md5.js',
]:
js_code = self._download_webpage(
url, video_id, note='Downloading signing dependency', fatal=False)
@@ -35,7 +36,8 @@ class DouyuBaseIE(InfoExtractor):
raise ExtractorError('Unable to download JS dependency (crypto-js/md5)')
def _get_cryptojs_md5(self, video_id):
return self.cache.load('douyu', 'crypto-js-md5') or self._download_cryptojs_md5(video_id)
return self.cache.load(
'douyu', 'crypto-js-md5', min_ver='2024.07.04') or self._download_cryptojs_md5(video_id)
def _calc_sign(self, sign_func, video_id, a):
b = uuid.uuid4().hex

View File

@@ -319,35 +319,17 @@ class DPlayIE(DPlayBaseIE):
url, display_id, host, 'dplay' + country, country, domain)
class HGTVDeIE(DPlayBaseIE):
_VALID_URL = r'https?://de\.hgtv\.com/sendungen' + DPlayBaseIE._PATH_REGEX
_TESTS = [{
'url': 'https://de.hgtv.com/sendungen/tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette/',
'info_dict': {
'id': '151205',
'display_id': 'tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette',
'ext': 'mp4',
'title': 'Wer braucht schon eine Toilette',
'description': 'md5:05b40a27e7aed2c9172de34d459134e2',
'duration': 1177.024,
'timestamp': 1595705400,
'upload_date': '20200725',
'creator': 'HGTV',
'series': 'Tiny House - klein, aber oho',
'season_number': 3,
'episode_number': 3,
},
}]
def _real_extract(self, url):
display_id = self._match_id(url)
return self._get_disco_api_info(
url, display_id, 'eu1-prod.disco-api.com', 'hgtv', 'de')
class DiscoveryPlusBaseIE(DPlayBaseIE):
"""Subclasses must set _PRODUCT, _DISCO_API_PARAMS"""
_DISCO_CLIENT_VER = '27.43.0'
def _update_disco_api_headers(self, headers, disco_base, display_id, realm):
headers['x-disco-client'] = f'WEB:UNKNOWN:{self._PRODUCT}:25.2.6'
headers.update({
'x-disco-params': f'realm={realm},siteLookupKey={self._PRODUCT}',
'x-disco-client': f'WEB:UNKNOWN:{self._PRODUCT}:{self._DISCO_CLIENT_VER}',
'Authorization': self._get_auth(disco_base, display_id, realm),
})
def _download_video_playback_info(self, disco_base, video_id, headers):
return self._download_json(
@@ -365,9 +347,68 @@ class DiscoveryPlusBaseIE(DPlayBaseIE):
return self._get_disco_api_info(url, self._match_id(url), **self._DISCO_API_PARAMS)
class HGTVDeIE(DiscoveryPlusBaseIE):
_VALID_URL = r'https?://de\.hgtv\.com/sendungen' + DPlayBaseIE._PATH_REGEX
_TESTS = [{
'url': 'https://de.hgtv.com/sendungen/mein-kleinstadt-traumhaus/vom-landleben-ins-loft',
'info_dict': {
'id': '7332936',
'ext': 'mp4',
'display_id': 'mein-kleinstadt-traumhaus/vom-landleben-ins-loft',
'title': 'Vom Landleben ins Loft',
'description': 'md5:e5f72c02c853970796dd3818f2e25745',
'episode': 'Episode 7',
'episode_number': 7,
'season': 'Season 7',
'season_number': 7,
'series': 'Mein Kleinstadt-Traumhaus',
'duration': 2645.0,
'timestamp': 1725998100,
'upload_date': '20240910',
'creators': ['HGTV'],
'tags': [],
'thumbnail': 'https://eu1-prod-images.disco-api.com/2024/08/09/82a386b9-c688-32c7-b9ff-0b13865f0bae.jpeg',
},
}]
_PRODUCT = 'hgtv'
_DISCO_API_PARAMS = {
'disco_host': 'eu1-prod.disco-api.com',
'realm': 'hgtv',
'country': 'de',
}
def _update_disco_api_headers(self, headers, disco_base, display_id, realm):
headers.update({
'x-disco-params': f'realm={realm}',
'x-disco-client': 'Alps:HyogaPlayer:0.0.0',
'Authorization': self._get_auth(disco_base, display_id, realm),
})
class GoDiscoveryIE(DiscoveryPlusBaseIE):
_VALID_URL = r'https?://(?:go\.)?discovery\.com/video' + DPlayBaseIE._PATH_REGEX
_TESTS = [{
'url': 'https://go.discovery.com/video/in-the-eye-of-the-storm-discovery-atve-us/trapped-in-a-twister',
'info_dict': {
'id': '5352642',
'display_id': 'in-the-eye-of-the-storm-discovery-atve-us/trapped-in-a-twister',
'ext': 'mp4',
'title': 'Trapped in a Twister',
'description': 'Twisters destroy Midwest towns, trapping spotters in the eye of the storm.',
'episode_number': 1,
'episode': 'Episode 1',
'season_number': 1,
'season': 'Season 1',
'series': 'In The Eye Of The Storm',
'duration': 2490.237,
'upload_date': '20240715',
'timestamp': 1721008800,
'tags': [],
'creators': ['Discovery'],
'thumbnail': 'https://us1-prod-images.disco-api.com/2024/07/10/5e39637d-cabf-3ab3-8e9a-f4e9d37bc036.jpeg',
},
}, {
'url': 'https://go.discovery.com/video/dirty-jobs-discovery-atve-us/rodbuster-galvanizer',
'info_dict': {
'id': '4164906',
@@ -395,6 +436,26 @@ class GoDiscoveryIE(DiscoveryPlusBaseIE):
class TravelChannelIE(DiscoveryPlusBaseIE):
_VALID_URL = r'https?://(?:watch\.)?travelchannel\.com/video' + DPlayBaseIE._PATH_REGEX
_TESTS = [{
'url': 'https://watch.travelchannel.com/video/the-dead-files-travel-channel/protect-the-children',
'info_dict': {
'id': '4710177',
'display_id': 'the-dead-files-travel-channel/protect-the-children',
'ext': 'mp4',
'title': 'Protect the Children',
'description': 'An evil presence threatens an Ohio woman\'s children and marriage.',
'season_number': 14,
'season': 'Season 14',
'episode_number': 10,
'episode': 'Episode 10',
'series': 'The Dead Files',
'duration': 2550.481,
'timestamp': 1664510400,
'upload_date': '20220930',
'tags': [],
'creators': ['Travel Channel'],
'thumbnail': 'https://us1-prod-images.disco-api.com/2022/03/17/5e45eace-de5d-343a-9293-f400a2aa77d5.jpeg',
},
}, {
'url': 'https://watch.travelchannel.com/video/ghost-adventures-travel-channel/ghost-train-of-ely',
'info_dict': {
'id': '2220256',
@@ -422,6 +483,26 @@ class TravelChannelIE(DiscoveryPlusBaseIE):
class CookingChannelIE(DiscoveryPlusBaseIE):
_VALID_URL = r'https?://(?:watch\.)?cookingchanneltv\.com/video' + DPlayBaseIE._PATH_REGEX
_TESTS = [{
'url': 'https://watch.cookingchanneltv.com/video/bobbys-triple-threat-food-network-atve-us/titans-vs-marcus-samuelsson',
'info_dict': {
'id': '5350005',
'ext': 'mp4',
'display_id': 'bobbys-triple-threat-food-network-atve-us/titans-vs-marcus-samuelsson',
'title': 'Titans vs Marcus Samuelsson',
'description': 'Marcus Samuelsson throws his legendary global tricks at the Titans.',
'episode_number': 1,
'episode': 'Episode 1',
'season_number': 3,
'season': 'Season 3',
'series': 'Bobby\'s Triple Threat',
'duration': 2520.851,
'upload_date': '20240710',
'timestamp': 1720573200,
'tags': [],
'creators': ['Food Network'],
'thumbnail': 'https://us1-prod-images.disco-api.com/2024/07/04/529cd095-27ec-35c5-84e9-90ebd3e5d2da.jpeg',
},
}, {
'url': 'https://watch.cookingchanneltv.com/video/carnival-eats-cooking-channel/the-postman-always-brings-rice-2348634',
'info_dict': {
'id': '2348634',
@@ -449,6 +530,22 @@ class CookingChannelIE(DiscoveryPlusBaseIE):
class HGTVUsaIE(DiscoveryPlusBaseIE):
_VALID_URL = r'https?://(?:watch\.)?hgtv\.com/video' + DPlayBaseIE._PATH_REGEX
_TESTS = [{
'url': 'https://watch.hgtv.com/video/flip-or-flop-the-final-flip-hgtv-atve-us/flip-or-flop-the-final-flip',
'info_dict': {
'id': '5025585',
'display_id': 'flip-or-flop-the-final-flip-hgtv-atve-us/flip-or-flop-the-final-flip',
'ext': 'mp4',
'title': 'Flip or Flop: The Final Flip',
'description': 'Tarek and Christina are going their separate ways after one last flip!',
'series': 'Flip or Flop: The Final Flip',
'duration': 2580.644,
'upload_date': '20231101',
'timestamp': 1698811200,
'tags': [],
'creators': ['HGTV'],
'thumbnail': 'https://us1-prod-images.disco-api.com/2022/11/27/455caa6c-1462-3f14-b63d-a026d7a5e6d3.jpeg',
},
}, {
'url': 'https://watch.hgtv.com/video/home-inspector-joe-hgtv-atve-us/this-mold-house',
'info_dict': {
'id': '4289736',
@@ -476,6 +573,26 @@ class HGTVUsaIE(DiscoveryPlusBaseIE):
class FoodNetworkIE(DiscoveryPlusBaseIE):
_VALID_URL = r'https?://(?:watch\.)?foodnetwork\.com/video' + DPlayBaseIE._PATH_REGEX
_TESTS = [{
'url': 'https://watch.foodnetwork.com/video/guys-grocery-games-food-network/wild-in-the-aisles',
'info_dict': {
'id': '2152549',
'display_id': 'guys-grocery-games-food-network/wild-in-the-aisles',
'ext': 'mp4',
'title': 'Wild in the Aisles',
'description': 'The chefs make spaghetti and meatballs with "Out of Stock" ingredients.',
'season_number': 1,
'season': 'Season 1',
'episode_number': 1,
'episode': 'Episode 1',
'series': 'Guy\'s Grocery Games',
'tags': [],
'creators': ['Food Network'],
'duration': 2520.651,
'upload_date': '20230623',
'timestamp': 1687492800,
'thumbnail': 'https://us1-prod-images.disco-api.com/2022/06/15/37fb5333-cad2-3dbb-af7c-c20ec77c89c6.jpeg',
},
}, {
'url': 'https://watch.foodnetwork.com/video/kids-baking-championship-food-network/float-like-a-butterfly',
'info_dict': {
'id': '4116449',
@@ -503,6 +620,26 @@ class FoodNetworkIE(DiscoveryPlusBaseIE):
class DestinationAmericaIE(DiscoveryPlusBaseIE):
_VALID_URL = r'https?://(?:www\.)?destinationamerica\.com/video' + DPlayBaseIE._PATH_REGEX
_TESTS = [{
'url': 'https://www.destinationamerica.com/video/bbq-pit-wars-destination-america/smoke-on-the-water',
'info_dict': {
'id': '2218409',
'display_id': 'bbq-pit-wars-destination-america/smoke-on-the-water',
'ext': 'mp4',
'title': 'Smoke on the Water',
'description': 'The pitmasters head to Georgia for the Smoke on the Water BBQ Festival.',
'season_number': 2,
'season': 'Season 2',
'episode_number': 1,
'episode': 'Episode 1',
'series': 'BBQ Pit Wars',
'tags': [],
'creators': ['Destination America'],
'duration': 2614.878,
'upload_date': '20230623',
'timestamp': 1687492800,
'thumbnail': 'https://us1-prod-images.disco-api.com/2020/05/11/c0f8e85d-9a10-3e6f-8e43-f6faafa81ba2.jpeg',
},
}, {
'url': 'https://www.destinationamerica.com/video/alaska-monsters-destination-america-atve-us/central-alaskas-bigfoot',
'info_dict': {
'id': '4210904',
@@ -530,6 +667,26 @@ class DestinationAmericaIE(DiscoveryPlusBaseIE):
class InvestigationDiscoveryIE(DiscoveryPlusBaseIE):
_VALID_URL = r'https?://(?:www\.)?investigationdiscovery\.com/video' + DPlayBaseIE._PATH_REGEX
_TESTS = [{
'url': 'https://www.investigationdiscovery.com/video/deadly-influence-the-social-media-murders-investigation-discovery-atve-us/rip-bianca',
'info_dict': {
'id': '5341132',
'display_id': 'deadly-influence-the-social-media-murders-investigation-discovery-atve-us/rip-bianca',
'ext': 'mp4',
'title': 'RIP Bianca',
'description': 'A teenage influencer discovers an online world of threat, harm and danger.',
'season_number': 1,
'season': 'Season 1',
'episode_number': 3,
'episode': 'Episode 3',
'series': 'Deadly Influence: The Social Media Murders',
'creators': ['Investigation Discovery'],
'tags': [],
'duration': 2490.888,
'upload_date': '20240618',
'timestamp': 1718672400,
'thumbnail': 'https://us1-prod-images.disco-api.com/2024/06/15/b567c774-9e44-3c6c-b0ba-db860a73e812.jpeg',
},
}, {
'url': 'https://www.investigationdiscovery.com/video/unmasked-investigation-discovery/the-killer-clown',
'info_dict': {
'id': '2139409',
@@ -557,6 +714,26 @@ class InvestigationDiscoveryIE(DiscoveryPlusBaseIE):
class AmHistoryChannelIE(DiscoveryPlusBaseIE):
_VALID_URL = r'https?://(?:www\.)?ahctv\.com/video' + DPlayBaseIE._PATH_REGEX
_TESTS = [{
'url': 'https://www.ahctv.com/video/blood-and-fury-americas-civil-war-ahc/battle-of-bull-run',
'info_dict': {
'id': '2139199',
'display_id': 'blood-and-fury-americas-civil-war-ahc/battle-of-bull-run',
'ext': 'mp4',
'title': 'Battle of Bull Run',
'description': 'Two untested armies clash in the first real battle of the Civil War.',
'season_number': 1,
'season': 'Season 1',
'episode_number': 1,
'episode': 'Episode 1',
'series': 'Blood and Fury: America\'s Civil War',
'duration': 2612.509,
'upload_date': '20220923',
'timestamp': 1663905600,
'creators': ['AHC'],
'tags': [],
'thumbnail': 'https://us1-prod-images.disco-api.com/2020/05/11/4af61bd7-d705-3108-82c4-1a6e541e20fa.jpeg',
},
}, {
'url': 'https://www.ahctv.com/video/modern-sniper-ahc/army',
'info_dict': {
'id': '2309730',
@@ -584,6 +761,26 @@ class AmHistoryChannelIE(DiscoveryPlusBaseIE):
class ScienceChannelIE(DiscoveryPlusBaseIE):
_VALID_URL = r'https?://(?:www\.)?sciencechannel\.com/video' + DPlayBaseIE._PATH_REGEX
_TESTS = [{
'url': 'https://www.sciencechannel.com/video/spaces-deepest-secrets-science-atve-us/mystery-of-the-dead-planets',
'info_dict': {
'id': '2347335',
'display_id': 'spaces-deepest-secrets-science-atve-us/mystery-of-the-dead-planets',
'ext': 'mp4',
'title': 'Mystery of the Dead Planets',
'description': 'Astronomers unmask the truly destructive nature of the cosmos.',
'season_number': 7,
'season': 'Season 7',
'episode_number': 1,
'episode': 'Episode 1',
'series': 'Space\'s Deepest Secrets',
'duration': 2524.989,
'upload_date': '20230128',
'timestamp': 1674882000,
'creators': ['Science'],
'tags': [],
'thumbnail': 'https://us1-prod-images.disco-api.com/2021/03/30/3796829d-aead-3f9a-bd8d-e49048b3cdca.jpeg',
},
}, {
'url': 'https://www.sciencechannel.com/video/strangest-things-science-atve-us/nazi-mystery-machine',
'info_dict': {
'id': '2842849',
@@ -608,36 +805,29 @@ class ScienceChannelIE(DiscoveryPlusBaseIE):
}
class DIYNetworkIE(DiscoveryPlusBaseIE):
_VALID_URL = r'https?://(?:watch\.)?diynetwork\.com/video' + DPlayBaseIE._PATH_REGEX
_TESTS = [{
'url': 'https://watch.diynetwork.com/video/pool-kings-diy-network/bringing-beach-life-to-texas',
'info_dict': {
'id': '2309730',
'display_id': 'pool-kings-diy-network/bringing-beach-life-to-texas',
'ext': 'mp4',
'title': 'Bringing Beach Life to Texas',
'description': 'The Pool Kings give a family a day at the beach in their own backyard.',
'season_number': 10,
'episode_number': 2,
},
'skip': 'Available for Premium users',
}, {
'url': 'https://watch.diynetwork.com/video/pool-kings-diy-network/bringing-beach-life-to-texas',
'only_matching': True,
}]
_PRODUCT = 'diy'
_DISCO_API_PARAMS = {
'disco_host': 'us1-prod-direct.watch.diynetwork.com',
'realm': 'go',
'country': 'us',
}
class DiscoveryLifeIE(DiscoveryPlusBaseIE):
_VALID_URL = r'https?://(?:www\.)?discoverylife\.com/video' + DPlayBaseIE._PATH_REGEX
_TESTS = [{
'url': 'https://www.discoverylife.com/video/er-files-discovery-life-atve-us/sweet-charity',
'info_dict': {
'id': '2347614',
'display_id': 'er-files-discovery-life-atve-us/sweet-charity',
'ext': 'mp4',
'title': 'Sweet Charity',
'description': 'The staff at Charity Hospital treat a serious foot infection.',
'season_number': 1,
'season': 'Season 1',
'episode_number': 1,
'episode': 'Episode 1',
'series': 'ER Files',
'duration': 2364.261,
'upload_date': '20230721',
'timestamp': 1689912000,
'creators': ['Discovery Life'],
'tags': [],
'thumbnail': 'https://us1-prod-images.disco-api.com/2021/03/16/4b6f0124-360b-3546-b6a4-5552db886b86.jpeg',
},
}, {
'url': 'https://www.discoverylife.com/video/surviving-death-discovery-life-atve-us/bodily-trauma',
'info_dict': {
'id': '2218238',
@@ -665,6 +855,26 @@ class DiscoveryLifeIE(DiscoveryPlusBaseIE):
class AnimalPlanetIE(DiscoveryPlusBaseIE):
_VALID_URL = r'https?://(?:www\.)?animalplanet\.com/video' + DPlayBaseIE._PATH_REGEX
_TESTS = [{
'url': 'https://www.animalplanet.com/video/mysterious-creatures-with-forrest-galante-animal-planet-atve-us/the-demon-of-peru',
'info_dict': {
'id': '4650835',
'display_id': 'mysterious-creatures-with-forrest-galante-animal-planet-atve-us/the-demon-of-peru',
'ext': 'mp4',
'title': 'The Demon of Peru',
'description': 'In Peru, a farming village is being terrorized by a “man-like beast.”',
'season_number': 1,
'season': 'Season 1',
'episode_number': 4,
'episode': 'Episode 4',
'series': 'Mysterious Creatures with Forrest Galante',
'duration': 2490.488,
'upload_date': '20230111',
'timestamp': 1673413200,
'creators': ['Animal Planet'],
'tags': [],
'thumbnail': 'https://us1-prod-images.disco-api.com/2022/03/01/6dbaa833-9a2e-3fee-9381-c19eddf67c0c.jpeg',
},
}, {
'url': 'https://www.animalplanet.com/video/north-woods-law-animal-planet/squirrel-showdown',
'info_dict': {
'id': '3338923',
@@ -692,6 +902,26 @@ class AnimalPlanetIE(DiscoveryPlusBaseIE):
class TLCIE(DiscoveryPlusBaseIE):
_VALID_URL = r'https?://(?:go\.)?tlc\.com/video' + DPlayBaseIE._PATH_REGEX
_TESTS = [{
'url': 'https://go.tlc.com/video/90-day-the-last-resort-tlc-atve-us/the-last-chance',
'info_dict': {
'id': '5186422',
'display_id': '90-day-the-last-resort-tlc-atve-us/the-last-chance',
'ext': 'mp4',
'title': 'The Last Chance',
'description': 'Infidelity shakes Kalani and Asuelu\'s world, and Angela threatens divorce.',
'season_number': 1,
'season': 'Season 1',
'episode_number': 1,
'episode': 'Episode 1',
'series': '90 Day: The Last Resort',
'duration': 5123.91,
'upload_date': '20230815',
'timestamp': 1692061200,
'creators': ['TLC'],
'tags': [],
'thumbnail': 'https://us1-prod-images.disco-api.com/2023/08/08/0ee367e2-ac76-334d-bf23-dbf796696a24.jpeg',
},
}, {
'url': 'https://go.tlc.com/video/my-600-lb-life-tlc/melissas-story-part-1',
'info_dict': {
'id': '2206540',
@@ -716,93 +946,8 @@ class TLCIE(DiscoveryPlusBaseIE):
}
class MotorTrendIE(DiscoveryPlusBaseIE):
_VALID_URL = r'https?://(?:watch\.)?motortrend\.com/video' + DPlayBaseIE._PATH_REGEX
_TESTS = [{
'url': 'https://watch.motortrend.com/video/car-issues-motortrend-atve-us/double-dakotas',
'info_dict': {
'id': '"4859182"',
'display_id': 'double-dakotas',
'ext': 'mp4',
'title': 'Double Dakotas',
'description': 'Tylers buy-one-get-one Dakota deal has the Wizard pulling double duty.',
'season_number': 2,
'episode_number': 3,
},
'skip': 'Available for Premium users',
}, {
'url': 'https://watch.motortrend.com/video/car-issues-motortrend-atve-us/double-dakotas',
'only_matching': True,
}]
_PRODUCT = 'vel'
_DISCO_API_PARAMS = {
'disco_host': 'us1-prod-direct.watch.motortrend.com',
'realm': 'go',
'country': 'us',
}
class MotorTrendOnDemandIE(DiscoveryPlusBaseIE):
_VALID_URL = r'https?://(?:www\.)?motortrend(?:ondemand\.com|\.com/plus)/detail' + DPlayBaseIE._PATH_REGEX
_TESTS = [{
'url': 'https://www.motortrendondemand.com/detail/wheelstanding-dump-truck-stubby-bobs-comeback/37699/784',
'info_dict': {
'id': '37699',
'display_id': 'wheelstanding-dump-truck-stubby-bobs-comeback/37699',
'ext': 'mp4',
'title': 'Wheelstanding Dump Truck! Stubby Bobs Comeback',
'description': 'md5:996915abe52a1c3dfc83aecea3cce8e7',
'season_number': 5,
'episode_number': 52,
'episode': 'Episode 52',
'season': 'Season 5',
'thumbnail': r're:^https?://.+\.jpe?g$',
'timestamp': 1388534401,
'duration': 1887.345,
'creator': 'Originals',
'series': 'Roadkill',
'upload_date': '20140101',
'tags': [],
},
}, {
'url': 'https://www.motortrend.com/plus/detail/roadworthy-rescues-teaser-trailer/4922860/',
'info_dict': {
'id': '4922860',
'ext': 'mp4',
'title': 'Roadworthy Rescues | Teaser Trailer',
'description': 'Derek Bieri helps Freiburger and Finnegan with their \'68 big-block Dart.',
'display_id': 'roadworthy-rescues-teaser-trailer/4922860',
'creator': 'Originals',
'series': 'Roadworthy Rescues',
'thumbnail': r're:^https?://.+\.jpe?g$',
'upload_date': '20220907',
'timestamp': 1662523200,
'duration': 1066.356,
'tags': [],
},
}, {
'url': 'https://www.motortrend.com/plus/detail/ugly-duckling/2450033/12439',
'only_matching': True,
}]
_PRODUCT = 'MTOD'
_DISCO_API_PARAMS = {
'disco_host': 'us1-prod-direct.motortrendondemand.com',
'realm': 'motortrend',
'country': 'us',
}
def _update_disco_api_headers(self, headers, disco_base, display_id, realm):
headers.update({
'x-disco-params': f'realm={realm}',
'x-disco-client': f'WEB:UNKNOWN:{self._PRODUCT}:4.39.1-gi1',
'Authorization': self._get_auth(disco_base, display_id, realm),
})
class DiscoveryPlusIE(DiscoveryPlusBaseIE):
_VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/(?!it/)(?:\w{2}/)?video' + DPlayBaseIE._PATH_REGEX
_VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/(?!it/)(?:(?P<country>[a-z]{2})/)?video(?:/sport|/olympics)?' + DPlayBaseIE._PATH_REGEX
_TESTS = [{
'url': 'https://www.discoveryplus.com/video/property-brothers-forever-home/food-and-family',
'info_dict': {
@@ -823,14 +968,45 @@ class DiscoveryPlusIE(DiscoveryPlusBaseIE):
}, {
'url': 'https://discoveryplus.com/ca/video/bering-sea-gold-discovery-ca/goldslingers',
'only_matching': True,
}, {
'url': 'https://www.discoveryplus.com/gb/video/sport/eurosport-1-british-eurosport-1-british-sport/6-hours-of-spa-review',
'only_matching': True,
}, {
'url': 'https://www.discoveryplus.com/gb/video/olympics/dplus-sport-dplus-sport-sport/rugby-sevens-australia-samoa',
'only_matching': True,
}]
_PRODUCT = 'dplus_us'
_DISCO_API_PARAMS = {
'disco_host': 'us1-prod-direct.discoveryplus.com',
'realm': 'go',
'country': 'us',
}
_PRODUCT = None
_DISCO_API_PARAMS = None
def _update_disco_api_headers(self, headers, disco_base, display_id, realm):
headers.update({
'x-disco-params': f'realm={realm},siteLookupKey={self._PRODUCT}',
'x-disco-client': f'WEB:UNKNOWN:dplus_us:{self._DISCO_CLIENT_VER}',
'Authorization': self._get_auth(disco_base, display_id, realm),
})
def _real_extract(self, url):
video_id, country = self._match_valid_url(url).group('id', 'country')
if not country:
country = 'us'
self._PRODUCT = f'dplus_{country}'
if country in ('br', 'ca', 'us'):
self._DISCO_API_PARAMS = {
'disco_host': 'us1-prod-direct.discoveryplus.com',
'realm': 'go',
'country': country,
}
else:
self._DISCO_API_PARAMS = {
'disco_host': 'eu1-prod-direct.discoveryplus.com',
'realm': 'dplay',
'country': country,
}
return self._get_disco_api_info(url, video_id, **self._DISCO_API_PARAMS)
class DiscoveryPlusIndiaIE(DiscoveryPlusBaseIE):
@@ -984,16 +1160,22 @@ class DiscoveryPlusShowBaseIE(DPlayBaseIE):
class DiscoveryPlusItalyIE(DiscoveryPlusBaseIE):
_VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/it/video' + DPlayBaseIE._PATH_REGEX
_VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/it/video(?:/sport|/olympics)?' + DPlayBaseIE._PATH_REGEX
_TESTS = [{
'url': 'https://www.discoveryplus.com/it/video/i-signori-della-neve/stagione-2-episodio-1-i-preparativi',
'only_matching': True,
}, {
'url': 'https://www.discoveryplus.com/it/video/super-benny/trailer',
'only_matching': True,
}, {
'url': 'https://www.discoveryplus.com/it/video/olympics/dplus-sport-dplus-sport-sport/water-polo-greece-italy',
'only_matching': True,
}, {
'url': 'https://www.discoveryplus.com/it/video/sport/dplus-sport-dplus-sport-sport/lisa-vittozzi-allinferno-e-ritorno',
'only_matching': True,
}]
_PRODUCT = 'dplus_us'
_PRODUCT = 'dplus_it'
_DISCO_API_PARAMS = {
'disco_host': 'eu1-prod-direct.discoveryplus.com',
'realm': 'dplay',
@@ -1002,8 +1184,8 @@ class DiscoveryPlusItalyIE(DiscoveryPlusBaseIE):
def _update_disco_api_headers(self, headers, disco_base, display_id, realm):
headers.update({
'x-disco-params': f'realm={realm}',
'x-disco-client': f'WEB:UNKNOWN:{self._PRODUCT}:25.2.6',
'x-disco-params': f'realm={realm},siteLookupKey={self._PRODUCT}',
'x-disco-client': f'WEB:UNKNOWN:dplus_us:{self._DISCO_CLIENT_VER}',
'Authorization': self._get_auth(disco_base, display_id, realm),
})
@@ -1044,39 +1226,3 @@ class DiscoveryPlusIndiaShowIE(DiscoveryPlusShowBaseIE):
_SHOW_STR = 'show'
_INDEX = 4
_VIDEO_IE = DiscoveryPlusIndiaIE
class GlobalCyclingNetworkPlusIE(DiscoveryPlusBaseIE):
_VALID_URL = r'https?://plus\.globalcyclingnetwork\.com/watch/(?P<id>\d+)'
_TESTS = [{
'url': 'https://plus.globalcyclingnetwork.com/watch/1397691',
'info_dict': {
'id': '1397691',
'ext': 'mp4',
'title': 'The Athertons: Mountain Biking\'s Fastest Family',
'description': 'md5:75a81937fcd8b989eec6083a709cd837',
'thumbnail': 'https://us1-prod-images.disco-api.com/2021/03/04/eb9e3026-4849-3001-8281-9356466f0557.png',
'series': 'gcn',
'creator': 'Gcn',
'upload_date': '20210309',
'timestamp': 1615248000,
'duration': 2531.0,
'tags': [],
},
'skip': 'Subscription required',
'params': {'skip_download': 'm3u8'},
}]
_PRODUCT = 'web'
_DISCO_API_PARAMS = {
'disco_host': 'disco-api-prod.globalcyclingnetwork.com',
'realm': 'gcn',
'country': 'us',
}
def _update_disco_api_headers(self, headers, disco_base, display_id, realm):
headers.update({
'x-disco-params': f'realm={realm}',
'x-disco-client': f'WEB:UNKNOWN:{self._PRODUCT}:27.3.2',
'Authorization': self._get_auth(disco_base, display_id, realm),
})

View File

@@ -6,8 +6,10 @@ import urllib.parse
from .common import InfoExtractor
from ..utils import (
ExtractorError,
update_url,
update_url_query,
url_basename,
urlencode_postdata,
)
@@ -36,43 +38,58 @@ class DropboxIE(InfoExtractor):
},
]
def _yield_decoded_parts(self, webpage):
for encoded in reversed(re.findall(r'registerStreamedPrefetch\s*\(\s*"[\w/+=]+"\s*,\s*"([\w/+=]+)"', webpage)):
yield base64.b64decode(encoded).decode('utf-8', 'ignore')
def _real_extract(self, url):
mobj = self._match_valid_url(url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
fn = urllib.parse.unquote(url_basename(url))
title = os.path.splitext(fn)[0]
password = self.get_param('videopassword')
if (self._og_search_title(webpage) == 'Dropbox - Password Required'
or 'Enter the password for this link' in webpage):
for part in self._yield_decoded_parts(webpage):
if '/sm/password' in part:
webpage = self._download_webpage(
update_url('https://www.dropbox.com/sm/password', query=part.partition('?')[2]), video_id)
break
if (self._og_search_title(webpage, default=None) == 'Dropbox - Password Required'
or 'Enter the password for this link' in webpage):
if password:
content_id = self._search_regex(r'content_id=(.*?)["\']', webpage, 'content_id')
payload = f'is_xhr=true&t={self._get_cookies("https://www.dropbox.com").get("t").value}&content_id={content_id}&password={password}&url={url}'
response = self._download_json(
'https://www.dropbox.com/sm/auth', video_id, 'POSTing video password', data=payload.encode(),
headers={'content-type': 'application/x-www-form-urlencoded; charset=UTF-8'})
'https://www.dropbox.com/sm/auth', video_id, 'POSTing video password',
headers={'content-type': 'application/x-www-form-urlencoded; charset=UTF-8'},
data=urlencode_postdata({
'is_xhr': 'true',
't': self._get_cookies('https://www.dropbox.com')['t'].value,
'content_id': self._search_regex(r'content_id=([\w.+=/-]+)["\']', webpage, 'content id'),
'password': password,
'url': url,
}))
if response.get('status') != 'authed':
raise ExtractorError('Authentication failed!', expected=True)
webpage = self._download_webpage(url, video_id)
elif self._get_cookies('https://dropbox.com').get('sm_auth'):
webpage = self._download_webpage(url, video_id)
else:
raise ExtractorError('Invalid password', expected=True)
elif not self._get_cookies('https://dropbox.com').get('sm_auth'):
raise ExtractorError('Password protected video, use --video-password <password>', expected=True)
webpage = self._download_webpage(url, video_id)
formats, subtitles, has_anonymous_download = [], {}, False
for encoded in reversed(re.findall(r'registerStreamedPrefetch\s*\(\s*"[\w/+=]+"\s*,\s*"([\w/+=]+)"', webpage)):
decoded = base64.b64decode(encoded).decode('utf-8', 'ignore')
formats, subtitles = [], {}
has_anonymous_download = False
thumbnail = None
for part in self._yield_decoded_parts(webpage):
if not has_anonymous_download:
has_anonymous_download = self._search_regex(
r'(anonymous:\tanonymous)', decoded, 'anonymous', default=False)
r'(anonymous:\tanonymous)', part, 'anonymous', default=False)
transcode_url = self._search_regex(
r'\n.(https://[^\x03\x08\x12\n]+\.m3u8)', decoded, 'transcode url', default=None)
r'\n.(https://[^\x03\x08\x12\n]+\.m3u8)', part, 'transcode url', default=None)
if not transcode_url:
continue
formats, subtitles = self._extract_m3u8_formats_and_subtitles(transcode_url, video_id, 'mp4')
thumbnail = self._search_regex(
r'(https://www\.dropbox\.com/temp_thumb_from_token/[\w/?&=]+)', part, 'thumbnail', default=None)
break
# downloads enabled we can get the original file
@@ -89,4 +106,5 @@ class DropboxIE(InfoExtractor):
'title': title,
'formats': formats,
'subtitles': subtitles,
'thumbnail': thumbnail,
}

View File

@@ -139,12 +139,11 @@ class DRTVIE(InfoExtractor):
return
token_response = self._download_json(
'https://production.dr-massive.com/api/authorization/anonymous-sso', None,
'https://isl.dr-massive.com/api/authorization/anonymous-sso', None,
note='Downloading anonymous token', headers={
'content-type': 'application/json',
}, query={
'device': 'web_browser',
'ff': 'idp,ldp,rpt',
'device': 'phone_android',
'lang': 'da',
'supportFallbackToken': 'true',
}, data=json.dumps({

View File

@@ -1,6 +1,11 @@
from .common import InfoExtractor
from ..networking import Request
from ..utils import float_or_none, int_or_none, parse_iso8601
from ..utils import (
float_or_none,
int_or_none,
join_nonempty,
parse_iso8601,
)
class EitbIE(InfoExtractor):
@@ -37,12 +42,9 @@ class EitbIE(InfoExtractor):
if not video_url:
continue
tbr = float_or_none(rendition.get('ENCODING_RATE'), 1000)
format_id = 'http'
if tbr:
format_id += f'-{int(tbr)}'
formats.append({
'url': rendition['PMD_URL'],
'format_id': format_id,
'format_id': join_nonempty('http', int_or_none(tbr)),
'width': int_or_none(rendition.get('FRAME_WIDTH')),
'height': int_or_none(rendition.get('FRAME_HEIGHT')),
'tbr': tbr,

View File

@@ -2,6 +2,7 @@ from .common import InfoExtractor
from ..utils import (
float_or_none,
int_or_none,
join_nonempty,
orderedSet,
parse_iso8601,
parse_qs,
@@ -13,7 +14,7 @@ from ..utils import (
class EpidemicSoundIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?epidemicsound\.com/track/(?P<id>[0-9a-zA-Z]+)'
_VALID_URL = r'https?://(?:www\.)?epidemicsound\.com/(?:(?P<sfx>sound-effects/tracks)|track)/(?P<id>[0-9a-zA-Z-]+)'
_TESTS = [{
'url': 'https://www.epidemicsound.com/track/yFfQVRpSPz/',
'md5': 'd98ff2ddb49e8acab9716541cbc9dfac',
@@ -47,6 +48,20 @@ class EpidemicSoundIE(InfoExtractor):
'release_timestamp': 1700535606,
'release_date': '20231121',
},
}, {
'url': 'https://www.epidemicsound.com/sound-effects/tracks/2f02f54b-9faa-4daf-abac-1cfe9e9cef69/',
'md5': '35d7cf05bd8b614a84f0495a05de9388',
'info_dict': {
'id': '208931',
'ext': 'mp3',
'upload_date': '20240603',
'timestamp': 1717436529,
'categories': ['appliance'],
'display_id': '6b2NXLURPr',
'duration': 1.0,
'title': 'Oven, Grill, Door Open 01',
'thumbnail': 'https://cdn.epidemicsound.com/curation-assets/commercial-release-cover-images/default-sfx/3000x3000.jpg',
},
}]
@staticmethod
@@ -77,8 +92,10 @@ class EpidemicSoundIE(InfoExtractor):
return f
def _real_extract(self, url):
video_id = self._match_id(url)
json_data = self._download_json(f'https://www.epidemicsound.com/json/track/{video_id}', video_id)
video_id, is_sfx = self._match_valid_url(url).group('id', 'sfx')
json_data = self._download_json(join_nonempty(
'https://www.epidemicsound.com/json/track',
is_sfx and 'kosmos-id', video_id, delim='/'), video_id)
thumbnails = traverse_obj(json_data, [('imageUrl', 'cover')])
thumb_base_url = traverse_obj(json_data, ('coverArt', 'baseUrl', {url_or_none}))

View File

@@ -29,9 +29,6 @@ class EpornerIE(InfoExtractor):
'view_count': int,
'age_limit': 18,
},
'params': {
'proxy': '127.0.0.1:8118',
},
}, {
# New (May 2016) URL layout
'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0/Star-Wars-XXX-Parody/',

View File

@@ -207,7 +207,7 @@ class ERRJupiterIE(InfoExtractor):
**traverse_obj(data, {
'title': ('heading', {str}),
'alt_title': ('subHeading', {str}),
'description': (('lead', 'body'), {clean_html}, {lambda x: x or None}),
'description': (('lead', 'body'), {clean_html}, filter),
'timestamp': ('created', {int_or_none}),
'modified_timestamp': ('updated', {int_or_none}),
'release_timestamp': (('scheduleStart', 'publicStart'), {int_or_none}),

View File

@@ -17,6 +17,7 @@ from ..utils import (
url_or_none,
variadic,
)
from ..utils.traversal import traverse_obj
class ERTFlixBaseIE(InfoExtractor):
@@ -74,29 +75,28 @@ class ERTFlixCodenameIE(ERTFlixBaseIE):
def _extract_formats_and_subs(self, video_id):
media_info = self._call_api(video_id, codename=video_id)
formats, subs = [], {}
for media_file in try_get(media_info, lambda x: x['MediaFiles'], list) or []:
for media in try_get(media_file, lambda x: x['Formats'], list) or []:
fmt_url = url_or_none(try_get(media, lambda x: x['Url']))
if not fmt_url:
continue
ext = determine_ext(fmt_url)
if ext == 'm3u8':
formats_, subs_ = self._extract_m3u8_formats_and_subtitles(
fmt_url, video_id, m3u8_id='hls', ext='mp4', fatal=False)
elif ext == 'mpd':
formats_, subs_ = self._extract_mpd_formats_and_subtitles(
fmt_url, video_id, mpd_id='dash', fatal=False)
else:
formats.append({
'url': fmt_url,
'format_id': str_or_none(media.get('Id')),
})
continue
formats.extend(formats_)
self._merge_subtitles(subs_, target=subs)
formats, subtitles = [], {}
for media in traverse_obj(media_info, (
'MediaFiles', lambda _, v: v['RoleCodename'] == 'main',
'Formats', lambda _, v: url_or_none(v['Url']))):
fmt_url = media['Url']
ext = determine_ext(fmt_url)
if ext == 'm3u8':
fmts, subs = self._extract_m3u8_formats_and_subtitles(
fmt_url, video_id, m3u8_id='hls', ext='mp4', fatal=False)
elif ext == 'mpd':
fmts, subs = self._extract_mpd_formats_and_subtitles(
fmt_url, video_id, mpd_id='dash', fatal=False)
else:
formats.append({
'url': fmt_url,
'format_id': str_or_none(media.get('Id')),
})
continue
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
return formats, subs
return formats, subtitles
def _real_extract(self, url):
video_id = self._match_id(url)

View File

@@ -294,37 +294,37 @@ class ESPNCricInfoIE(InfoExtractor):
class WatchESPNIE(AdobePassIE):
_VALID_URL = r'https?://(?:www\.)?espn\.com/(?:watch|espnplus)/player/_/id/(?P<id>[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})'
_TESTS = [{
'url': 'https://www.espn.com/watch/player/_/id/dbbc6b1d-c084-4b47-9878-5f13c56ce309',
'url': 'https://www.espn.com/watch/player/_/id/11ce417a-6ac9-42b6-8a15-46aeb9ad5710',
'info_dict': {
'id': 'dbbc6b1d-c084-4b47-9878-5f13c56ce309',
'id': '11ce417a-6ac9-42b6-8a15-46aeb9ad5710',
'ext': 'mp4',
'title': 'Huddersfield vs. Burnley',
'duration': 7500,
'thumbnail': 'https://artwork.api.espn.com/artwork/collections/media/dbbc6b1d-c084-4b47-9878-5f13c56ce309/default?width=640&apikey=1ngjw23osgcis1i1vbj96lmfqs',
'title': 'Abilene Chrstn vs. Texas Tech',
'duration': 14166,
'thumbnail': 'https://s.secure.espncdn.com/stitcher/artwork/collections/media/11ce417a-6ac9-42b6-8a15-46aeb9ad5710/16x9.jpg?timestamp=202407252343&showBadge=true&cb=12&package=ESPN_PLUS',
},
'params': {
'skip_download': True,
},
}, {
'url': 'https://www.espn.com/watch/player/_/id/a049a56e-a7ce-477e-aef3-c7e48ef8221c',
'url': 'https://www.espn.com/watch/player/_/id/90a2c85d-75e0-4b1e-a878-8e428a3cb2f3',
'info_dict': {
'id': 'a049a56e-a7ce-477e-aef3-c7e48ef8221c',
'id': '90a2c85d-75e0-4b1e-a878-8e428a3cb2f3',
'ext': 'mp4',
'title': 'Dynamo Dresden vs. VfB Stuttgart (Round #1) (German Cup)',
'duration': 8335,
'thumbnail': 'https://s.secure.espncdn.com/stitcher/artwork/collections/media/bd1f3d12-0654-47d9-852e-71b85ea695c7/16x9.jpg?timestamp=202201112217&showBadge=true&cb=12&package=ESPN_PLUS',
'title': 'UC Davis vs. California',
'duration': 9547,
'thumbnail': 'https://artwork.api.espn.com/artwork/collections/media/90a2c85d-75e0-4b1e-a878-8e428a3cb2f3/default?width=640&apikey=1ngjw23osgcis1i1vbj96lmfqs',
},
'params': {
'skip_download': True,
},
}, {
'url': 'https://www.espn.com/espnplus/player/_/id/317f5fd1-c78a-4ebe-824a-129e0d348421',
'url': 'https://www.espn.com/watch/player/_/id/c4313bbe-95b5-4bb8-b251-ac143ea0fc54',
'info_dict': {
'id': '317f5fd1-c78a-4ebe-824a-129e0d348421',
'id': 'c4313bbe-95b5-4bb8-b251-ac143ea0fc54',
'ext': 'mp4',
'title': 'The Wheel - Episode 10',
'duration': 3352,
'thumbnail': 'https://s.secure.espncdn.com/stitcher/artwork/collections/media/317f5fd1-c78a-4ebe-824a-129e0d348421/16x9.jpg?timestamp=202205031523&showBadge=true&cb=12&package=ESPN_PLUS',
'title': 'The College Football Show',
'duration': 3639,
'thumbnail': 'https://artwork.api.espn.com/artwork/collections/media/c4313bbe-95b5-4bb8-b251-ac143ea0fc54/default?width=640&apikey=1ngjw23osgcis1i1vbj96lmfqs',
},
'params': {
'skip_download': True,
@@ -353,6 +353,13 @@ class WatchESPNIE(AdobePassIE):
if not cookie:
self.raise_login_required(method='cookies')
jwt = self._search_regex(r'=([^|]+)\|', cookie.value, 'cookie jwt')
id_token = self._download_json(
'https://registerdisney.go.com/jgc/v6/client/ESPN-ONESITE.WEB-PROD/guest/refresh-auth',
None, 'Refreshing token', headers={'Content-Type': 'application/json'}, data=json.dumps({
'refreshToken': json.loads(base64.urlsafe_b64decode(f'{jwt}==='))['refresh_token'],
}).encode())['data']['token']['id_token']
assertion = self._call_bamgrid_api(
'devices', video_id,
headers={'Content-Type': 'application/json; charset=UTF-8'},
@@ -371,7 +378,7 @@ class WatchESPNIE(AdobePassIE):
})['access_token']
assertion = self._call_bamgrid_api(
'accounts/grant', video_id, payload={'id_token': cookie.value.split('|')[1]},
'accounts/grant', video_id, payload={'id_token': id_token},
headers={
'Authorization': token,
'Content-Type': 'application/json; charset=UTF-8',

View File

@@ -3,7 +3,12 @@ from ..utils import traverse_obj
class EurosportIE(InfoExtractor):
_VALID_URL = r'https?://www\.eurosport\.com/\w+/(?:[\w-]+/[\d-]+/)?[\w-]+_(?P<id>vid\d+)'
_VALID_URL = r'''(?x)
https?://(?:
(?:(?:www|espanol)\.)?eurosport\.(?:com(?:\.tr)?|de|dk|es|fr|hu|it|nl|no|ro)|
eurosport\.tvn24\.pl
)/[\w-]+/(?:[\w-]+/[\d-]+/)?[\w.-]+_(?P<id>vid\d+)
'''
_TESTS = [{
'url': 'https://www.eurosport.com/tennis/roland-garros/2022/highlights-rafael-nadal-brushes-aside-caper-ruud-to-win-record-extending-14th-french-open-title_vid1694147/video.shtml',
'info_dict': {
@@ -70,6 +75,42 @@ class EurosportIE(InfoExtractor):
'duration': 105.0,
'upload_date': '20230518',
},
}, {
'url': 'https://www.eurosport.de/radsport/vuelta-a-espana/2024/vuelta-a-espana-2024-wout-van-aert-und-co.-verzweifeln-an-mcnulty-zeitfahr-krimi-in-lissabon_vid2219478/video.shtml',
'only_matching': True,
}, {
'url': 'https://www.eurosport.dk/speedway/mikkel-michelsen-misser-finalen-i-cardiff-se-danskeren-i-semifinalen-her_vid2219363/video.shtml',
'only_matching': True,
}, {
'url': 'https://www.eurosport.nl/mixed-martial-arts/ufc/2022/ufc-305-respect-tussen-adesanya-en-du-plessis_vid2219650/video.shtml',
'only_matching': True,
}, {
'url': 'https://www.eurosport.es/ciclismo/la-vuelta-2024-carlos-rodriguez-olvida-la-crono-y-ya-espera-que-llegue-la-montana-no-me-encontre-nada-comodo_vid2219682/video.shtml',
'only_matching': True,
}, {
'url': 'https://www.eurosport.fr/football/supercoupe-d-europe/2024-2025/kylian-mbappe-vinicius-junior-eduardo-camavinga-touche.-extraits-de-l-entrainement-du-real-madrid-en-video_vid2216993/video.shtml',
'only_matching': True,
}, {
'url': 'https://www.eurosport.it/calcio/serie-a/2024-2025/samardzic-a-bergamo-per-le-visite-mediche-con-l-atalanta_vid2219680/video.shtml',
'only_matching': True,
}, {
'url': 'https://www.eurosport.hu/kerekpar/vuelta-a-espana/2024/dramai-harc-a-masodpercekert-meglepetesgyoztes-a-vuelta-nyitoszakaszan_vid2219481/video.shtml',
'only_matching': True,
}, {
'url': 'https://www.eurosport.no/golf/fedex-st-jude-championship/2024/ligger-pa-andreplass-sa-skjer-dette-drama_vid30000618/video.shtml',
'only_matching': True,
}, {
'url': 'https://www.eurosport.no/golf/fedex-st-jude-championship/2024/ligger-pa-andreplass-sa-skjer-dette-drama_vid2219531/video.shtml',
'only_matching': True,
}, {
'url': 'https://www.eurosport.ro/tenis/western-southern-open-2/2024/rezumatul-partidei-dintre-zverev-si-shelton-de-la-cincinnati_vid2219657/video.shtml',
'only_matching': True,
}, {
'url': 'https://www.eurosport.com.tr/hentbol/olympic-games-paris-2024/2024/paris-2024-denmark-ile-germany-olimpiyatlarin-onemli-anlari_vid2215836/video.shtml',
'only_matching': True,
}, {
'url': 'https://eurosport.tvn24.pl/kolarstwo/tour-de-france-kobiet/2024/kasia-niewiadoma-przed-ostatnim-8.-etapem-tour-de-france-kobiet_vid2219765/video.shtml',
'only_matching': True,
}]
_TOKEN = None
@@ -77,6 +118,7 @@ class EurosportIE(InfoExtractor):
# actually defined in https://netsport.eurosport.io/?variables={"databaseId":<databaseId>,"playoutType":"VDP"}&extensions={"persistedQuery":{"version":1 ..
# but this method require to get sha256 hash
_GEO_COUNTRIES = ['DE', 'NL', 'EU', 'IT', 'FR'] # Not complete list but it should work
_GEO_BYPASS = False
def _real_initialize(self):
if EurosportIE._TOKEN is None:
@@ -98,13 +140,13 @@ class EurosportIE(InfoExtractor):
for stream_type in json_data['attributes']['streaming']:
if stream_type == 'hls':
fmts, subs = self._extract_m3u8_formats_and_subtitles(
traverse_obj(json_data, ('attributes', 'streaming', stream_type, 'url')), display_id, ext='mp4')
traverse_obj(json_data, ('attributes', 'streaming', stream_type, 'url')), display_id, ext='mp4', fatal=False)
elif stream_type == 'dash':
fmts, subs = self._extract_mpd_formats_and_subtitles(
traverse_obj(json_data, ('attributes', 'streaming', stream_type, 'url')), display_id)
traverse_obj(json_data, ('attributes', 'streaming', stream_type, 'url')), display_id, fatal=False)
elif stream_type == 'mss':
fmts, subs = self._extract_ism_formats_and_subtitles(
traverse_obj(json_data, ('attributes', 'streaming', stream_type, 'url')), display_id)
traverse_obj(json_data, ('attributes', 'streaming', stream_type, 'url')), display_id, fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)

View File

@@ -84,7 +84,7 @@ class FacebookIE(InfoExtractor):
'timestamp': 1692346159,
'thumbnail': r're:^https?://.*',
'uploader_id': '100063551323670',
'duration': 3132.184,
'duration': 3133.583,
'view_count': int,
'concurrent_view_count': 0,
},
@@ -112,9 +112,10 @@ class FacebookIE(InfoExtractor):
'upload_date': '20140506',
'timestamp': 1399398998,
'thumbnail': r're:^https?://.*',
'uploader_id': 'pfbid028wxorhX2ErLFJ578N6P3crHD3PHmXTCqCvfBpsnbSLmbokwSY75p5hWBjHGkG4zxl',
'uploader_id': 'pfbid05AzrFTXgY37tqwaSgbFTTEpCLBjjEJHkigogwGiRPtKEpAsJYJpzE94H1RxYXWEtl',
'duration': 131.03,
'concurrent_view_count': int,
'view_count': int,
},
}, {
'note': 'Video with DASH manifest',
@@ -167,7 +168,7 @@ class FacebookIE(InfoExtractor):
# have 1080P, but only up to 720p in swf params
# data.video.story.attachments[].media
'url': 'https://www.facebook.com/cnn/videos/10155529876156509/',
'md5': 'ca63897a90c9452efee5f8c40d080e25',
'md5': '1659aa21fb3dd1585874f668e81a72c8',
'info_dict': {
'id': '10155529876156509',
'ext': 'mp4',
@@ -180,9 +181,10 @@ class FacebookIE(InfoExtractor):
'view_count': int,
'uploader_id': '100059479812265',
'concurrent_view_count': int,
'duration': 44.478,
'duration': 44.181,
},
}, {
# FIXME: unable to extract uploader, no formats found
# bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall
# data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media
'url': 'https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/',
@@ -241,9 +243,9 @@ class FacebookIE(InfoExtractor):
'timestamp': 1511548260,
'upload_date': '20171124',
'uploader': 'Vickie Gentry',
'uploader_id': 'pfbid0FuZhHCeWDAxWxEbr3yKPFaRstXvRxgsp9uCPG6GjD4J2AitB35NUAuJ4Q75KcjiDl',
'uploader_id': 'pfbid0FkkycT95ySNNyfCw4Cho6u5G7WbbZEcxT496Hq8rtx1K3LcTCATpR3wnyYhmyGC5l',
'thumbnail': r're:^https?://.*',
'duration': 148.435,
'duration': 148.224,
},
}, {
# data.node.comet_sections.content.story.attachments[].styles.attachment.media
@@ -271,7 +273,7 @@ class FacebookIE(InfoExtractor):
'description': 'Today Makkovik\'s own Pilot Mandy Smith made her inaugural landing on the airstrip in her hometown. What a proud moment as we all cheered and...',
'thumbnail': r're:^https?://.*',
'uploader': 'Lela Evans',
'uploader_id': 'pfbid0shZJipuigyy5mqrUJn9ub5LJFWNHvan5prtyi3LrDuuuJ4NwrURgnQHYR9fywBepl',
'uploader_id': 'pfbid0swT2y7t6TAsZVBvcyeYPdhTMefGaS26mzUwML3vd1ma6ndGZKxsyS4Ssu3jitZLXl',
'upload_date': '20231228',
'timestamp': 1703804085,
'duration': 394.347,
@@ -322,7 +324,7 @@ class FacebookIE(InfoExtractor):
'upload_date': '20180523',
'uploader': 'ESL One Dota 2',
'uploader_id': '100066514874195',
'duration': 4524.212,
'duration': 4524.001,
'view_count': int,
'thumbnail': r're:^https?://.*',
'concurrent_view_count': int,
@@ -339,9 +341,9 @@ class FacebookIE(InfoExtractor):
'title': 'Josef',
'thumbnail': r're:^https?://.*',
'concurrent_view_count': int,
'uploader_id': 'pfbid0cibUN6tV7DYgdbJdsUFN46wc4jKpVSPAvJQhFofGqBGmVn3V3JtAs2tfUwziw2hUl',
'uploader_id': 'pfbid02gpfwRM2XvdEJfsERupwQiNmBiDArc38RMRYZnap372q6Vs7MtFTVy72mmFWpJBTKl',
'timestamp': 1549275572,
'duration': 3.413,
'duration': 3.283,
'uploader': 'Josef Novak',
'description': '',
'upload_date': '20190204',
@@ -396,6 +398,7 @@ class FacebookIE(InfoExtractor):
'playlist_count': 1,
'skip': 'Requires logging in',
}, {
# FIXME: Cannot parse data error
# data.event.cover_media_renderer.cover_video
'url': 'https://m.facebook.com/events/1509582499515440',
'info_dict': {
@@ -498,7 +501,8 @@ class FacebookIE(InfoExtractor):
or get_first(post, ('video', 'creation_story', 'attachments', ..., 'media', lambda k, v: k == 'owner' and v['name']))
or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name']))
or get_first(post, ('node', 'actors', ..., {dict}))
or get_first(post, ('event', 'event_creator', {dict})) or {})
or get_first(post, ('event', 'event_creator', {dict}))
or get_first(post, ('video', 'creation_story', 'short_form_video_context', 'video_owner', {dict})) or {})
uploader = uploader_data.get('name') or (
clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage))
or self._search_regex(
@@ -524,6 +528,11 @@ class FacebookIE(InfoExtractor):
webpage, 'view count', default=None)),
'concurrent_view_count': get_first(post, (
('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})),
**traverse_obj(post, (lambda _, v: video_id in v['url'], 'feedback', {
'like_count': ('likers', 'count', {int}),
'comment_count': ('total_comment_count', {int}),
'repost_count': ('share_count_reduced', {parse_count}),
}), get_all=False),
}
info_json_ld = self._search_json_ld(webpage, video_id, default={})
@@ -555,11 +564,12 @@ class FacebookIE(InfoExtractor):
js_data, lambda x: x['jsmods']['instances'], list) or [])
def extract_dash_manifest(video, formats):
dash_manifest = traverse_obj(video, 'dash_manifest', 'playlist', expected_type=str)
dash_manifest = traverse_obj(
video, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', expected_type=str)
if dash_manifest:
formats.extend(self._parse_mpd_formats(
compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)),
mpd_url=video.get('dash_manifest_url')))
mpd_url=url_or_none(video.get('dash_manifest_url'))))
def process_formats(info):
# Downloads with browser's User-Agent are rate limited. Working around
@@ -571,16 +581,21 @@ class FacebookIE(InfoExtractor):
# Formats larger than ~500MB will return error 403 unless chunk size is regulated
f.setdefault('downloader_options', {})['http_chunk_size'] = 250 << 20
def extract_relay_data(_filter):
return self._parse_json(self._search_regex(
rf'data-sjs>({{.*?{_filter}.*?}})</script>',
webpage, 'replay data', default='{}'), video_id, fatal=False) or {}
def yield_all_relay_data(_filter):
for relay_data in re.findall(rf'data-sjs>({{.*?{_filter}.*?}})</script>', webpage):
yield self._parse_json(relay_data, video_id, fatal=False) or {}
def extract_relay_prefetched_data(_filter):
return traverse_obj(extract_relay_data(_filter), (
'require', (None, (..., ..., ..., '__bbox', 'require')),
def extract_relay_data(_filter):
return next(filter(None, yield_all_relay_data(_filter)), {})
def extract_relay_prefetched_data(_filter, target_keys=None):
path = 'data'
if target_keys is not None:
path = lambda k, v: k == 'data' and any(target in v for target in variadic(target_keys))
return traverse_obj(yield_all_relay_data(_filter), (
..., 'require', (None, (..., ..., ..., '__bbox', 'require')),
lambda _, v: any(key.startswith('RelayPrefetchedStreamCache') for key in v),
..., ..., '__bbox', 'result', 'data', {dict}), get_all=False) or {}
..., ..., '__bbox', 'result', path, {dict}), get_all=False) or {}
if not video_data:
server_js_data = self._parse_json(self._search_regex([
@@ -591,7 +606,8 @@ class FacebookIE(InfoExtractor):
if not video_data:
data = extract_relay_prefetched_data(
r'"(?:dash_manifest|playable_url(?:_quality_hd)?)')
r'"(?:dash_manifest|playable_url(?:_quality_hd)?)',
target_keys=('video', 'event', 'nodes', 'node', 'mediaset'))
if data:
entries = []
@@ -603,12 +619,13 @@ class FacebookIE(InfoExtractor):
video = video['creation_story']
video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner'))
video.update(reel_info)
fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video
formats = []
q = qualities(['sd', 'hd'])
for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'),
('playable_url_dash', ''), ('browser_native_hd_url', 'hd'),
('browser_native_sd_url', 'sd')):
playable_url = video.get(key)
playable_url = fmt_data.get(key)
if not playable_url:
continue
if determine_ext(playable_url) == 'mpd':
@@ -620,7 +637,10 @@ class FacebookIE(InfoExtractor):
'quality': q(format_id) - 3,
'url': playable_url,
})
extract_dash_manifest(video, formats)
extract_dash_manifest(fmt_data, formats)
if not formats:
# Do not append false positive entry w/o any formats
return
automatic_captions, subtitles = {}, {}
is_broadcast = traverse_obj(video, ('is_video_broadcast', {bool}))
@@ -923,18 +943,21 @@ class FacebookReelIE(InfoExtractor):
_TESTS = [{
'url': 'https://www.facebook.com/reel/1195289147628387',
'md5': 'f13dd37f2633595982db5ed8765474d3',
'md5': 'a53256d10fc2105441fe0c4212ed8cea',
'info_dict': {
'id': '1195289147628387',
'ext': 'mp4',
'title': 'md5:b05800b5b1ad56c0ca78bd3807b6a61e',
'description': 'md5:22f03309b216ac84720183961441d8db',
'uploader': 'md5:723e6cb3091241160f20b3c5dc282af1',
'title': r're:9\.6K views · 355 reactions .+ Let the “Slapathon” commence!! .+ LL COOL J · Mama Said Knock You Out$',
'description': r're:When your trying to help your partner .+ LL COOL J · Mama Said Knock You Out$',
'uploader': 'Beast Camp Training',
'uploader_id': '100040874179269',
'duration': 9.579,
'timestamp': 1637502609,
'upload_date': '20211121',
'thumbnail': r're:^https?://.*',
'like_count': int,
'comment_count': int,
'repost_count': int,
},
}]
@@ -954,6 +977,7 @@ class FacebookAdsIE(InfoExtractor):
'id': '899206155126718',
'ext': 'mp4',
'title': 'video by Kandao',
'description': 'md5:0822724069e3aca97cbed5dabbab282e',
'uploader': 'Kandao',
'uploader_id': '774114102743284',
'uploader_url': r're:^https?://.*',
@@ -962,6 +986,22 @@ class FacebookAdsIE(InfoExtractor):
'upload_date': '20231214',
'like_count': int,
},
}, {
# key 'watermarked_video_sd_url' missing
'url': 'https://www.facebook.com/ads/library/?id=501152689226254',
'info_dict': {
'id': '501152689226254',
'ext': 'mp4',
'title': 'video by mat.nawrocki',
'description': 'md5:02a446ace7ff8c3c37a2892922492490',
'uploader': 'mat.nawrocki',
'uploader_id': '148586968341456',
'uploader_url': r're:^https?://.*',
'timestamp': 1723452305,
'thumbnail': r're:^https?://.*',
'upload_date': '20240812',
'like_count': int,
},
}, {
'url': 'https://www.facebook.com/ads/library/?id=893637265423481',
'info_dict': {
@@ -1008,34 +1048,42 @@ class FacebookAdsIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
post_data = [self._parse_json(j, video_id, fatal=False)
for j in re.findall(r's\.handle\(({.*})\);requireLazy\(', webpage)]
data = traverse_obj(post_data, (
..., 'require', ..., ..., ..., 'props', 'deeplinkAdCard', 'snapshot', {dict}), get_all=False)
post_data = traverse_obj(
re.findall(r'data-sjs>({.*?ScheduledServerJS.*?})</script>', webpage), (..., {json.loads}))
data = get_first(post_data, (
'require', ..., ..., ..., '__bbox', 'require', ..., ..., ...,
'entryPointRoot', 'otherProps', 'deeplinkAdCard', 'snapshot', {dict}))
if not data:
raise ExtractorError('Unable to extract ad data')
title = data.get('title')
if not title or title == '{{product.name}}':
title = join_nonempty('display_format', 'page_name', delim=' by ', from_dict=data)
markup_id = traverse_obj(data, ('body', '__m', {str}))
markup = traverse_obj(post_data, (
..., 'require', ..., ..., ..., '__bbox', 'markup', lambda _, v: v[0].startswith(markup_id),
..., '__html', {clean_html}, {lambda x: not x.startswith('{{product.') and x}, any))
info_dict = traverse_obj(data, {
'description': ('link_description', {str}, {lambda x: x if x != '{{product.description}}' else None}),
info_dict = merge_dicts({
'title': title,
'description': markup or None,
}, traverse_obj(data, {
'description': ('link_description', {lambda x: x if not x.startswith('{{product.') else None}),
'uploader': ('page_name', {str}),
'uploader_id': ('page_id', {str_or_none}),
'uploader_url': ('page_profile_uri', {url_or_none}),
'timestamp': ('creation_time', {int_or_none}),
'like_count': ('page_like_count', {int_or_none}),
})
}))
entries = []
for idx, entry in enumerate(traverse_obj(
data, (('videos', 'cards'), lambda _, v: any(url_or_none(v[f]) for f in self._FORMATS_MAP))), 1,
data, (('videos', 'cards'), lambda _, v: any(url_or_none(v.get(f)) for f in self._FORMATS_MAP))), 1,
):
entries.append({
'id': f'{video_id}_{idx}',
'title': entry.get('title') or title,
'description': entry.get('link_description') or info_dict.get('description'),
'description': traverse_obj(entry, 'body', 'link_description') or info_dict.get('description'),
'thumbnail': url_or_none(entry.get('video_preview_image_url')),
'formats': self._extract_formats(entry),
})

View File

@@ -14,7 +14,7 @@ from ..utils import (
class FC2IE(InfoExtractor):
_VALID_URL = r'^(?:https?://video\.fc2\.com/(?:[^/]+/)*content/|fc2:)(?P<id>[^/]+)'
_VALID_URL = r'(?:https?://video\.fc2\.com/(?:[^/]+/)*content/|fc2:)(?P<id>[^/]+)'
IE_NAME = 'fc2'
_NETRC_MACHINE = 'fc2'
_TESTS = [{

View File

@@ -5,6 +5,7 @@ from .common import InfoExtractor
from .dailymotion import DailymotionIE
from ..networking import HEADRequest
from ..utils import (
clean_html,
determine_ext,
filter_dict,
format_field,
@@ -33,6 +34,7 @@ class FranceTVIE(InfoExtractor):
_GEO_BYPASS = False
_TESTS = [{
# tokenized url is in dinfo['video']['token']
'url': 'francetv:ec217ecc-0733-48cf-ac06-af1347b849d1',
'info_dict': {
'id': 'ec217ecc-0733-48cf-ac06-af1347b849d1',
@@ -44,6 +46,19 @@ class FranceTVIE(InfoExtractor):
'upload_date': '20170813',
},
'params': {'skip_download': 'm3u8'},
}, {
# tokenized url is in dinfo['video']['token']['akamai']
'url': 'francetv:c5bda21d-2c6f-4470-8849-3d8327adb2ba',
'info_dict': {
'id': 'c5bda21d-2c6f-4470-8849-3d8327adb2ba',
'ext': 'mp4',
'title': '13h15, le dimanche... - Les mystères de Jésus',
'timestamp': 1514118300,
'duration': 2880,
'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20171224',
},
'params': {'skip_download': 'm3u8'},
}, {
'url': 'francetv:162311093',
'only_matching': True,
@@ -68,6 +83,7 @@ class FranceTVIE(InfoExtractor):
def _extract_video(self, video_id, hostname=None):
is_live = None
videos = []
drm_formats = False
title = None
subtitle = None
episode_number = None
@@ -85,13 +101,12 @@ class FranceTVIE(InfoExtractor):
'device_type': device_type,
'browser': browser,
'domain': hostname,
}), fatal=False)
}), fatal=False, expected_status=422) # 422 json gives detailed error code/message
if not dinfo:
continue
video = traverse_obj(dinfo, ('video', {dict}))
if video:
if video := traverse_obj(dinfo, ('video', {dict})):
videos.append(video)
if duration is None:
duration = video.get('duration')
@@ -99,9 +114,19 @@ class FranceTVIE(InfoExtractor):
is_live = video.get('is_live')
if spritesheets is None:
spritesheets = video.get('spritesheets')
elif code := traverse_obj(dinfo, ('code', {int})):
if code == 2009:
self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
elif code in (2015, 2017):
# 2015: L'accès à cette vidéo est impossible. (DRM-only)
# 2017: Cette vidéo n'est pas disponible depuis le site web mobile (b/c DRM)
drm_formats = True
continue
self.report_warning(
f'{self.IE_NAME} said: {code} "{clean_html(dinfo.get("message"))}"')
continue
meta = traverse_obj(dinfo, ('meta', {dict}))
if meta:
if meta := traverse_obj(dinfo, ('meta', {dict})):
if title is None:
title = meta.get('title')
# meta['pre_title'] contains season and episode number for series in format "S<ID> E<ID>"
@@ -114,12 +139,15 @@ class FranceTVIE(InfoExtractor):
if timestamp is None:
timestamp = parse_iso8601(meta.get('broadcasted_at'))
if not videos and drm_formats:
self.report_drm(video_id)
formats, subtitles, video_url = [], {}, None
for video in traverse_obj(videos, lambda _, v: url_or_none(v['url'])):
video_url = video['url']
format_id = video.get('format')
if token_url := url_or_none(video.get('token')):
if token_url := traverse_obj(video, ('token', (None, 'akamai'), {url_or_none}, any)):
tokenized_url = traverse_obj(self._download_json(
token_url, video_id, f'Downloading signed {format_id} manifest URL',
fatal=False, query={
@@ -225,13 +253,13 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor):
_TESTS = [{
'url': 'https://www.france.tv/france-2/13h15-le-dimanche/140921-les-mysteres-de-jesus.html',
'info_dict': {
'id': 'ec217ecc-0733-48cf-ac06-af1347b849d1',
'id': 'c5bda21d-2c6f-4470-8849-3d8327adb2ba',
'ext': 'mp4',
'title': '13h15, le dimanche... - Les mystères de Jésus',
'timestamp': 1502623500,
'duration': 2580,
'timestamp': 1514118300,
'duration': 2880,
'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20170813',
'upload_date': '20171224',
},
'params': {
'skip_download': True,

View File

@@ -3,7 +3,7 @@ from .nexx import NexxIE
class FunkIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.|origin\.)?funk\.net/(?:channel|playlist)/[^/]+/(?P<display_id>[0-9a-z-]+)-(?P<id>\d+)'
_VALID_URL = r'https?://(?:(?:www|origin|play)\.)?funk\.net/(?:channel|playlist)/[^/?#]+/(?P<display_id>[0-9a-z-]+)-(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.funk.net/channel/ba-793/die-lustigsten-instrumente-aus-dem-internet-teil-2-1155821',
'md5': '8610449476156f338761a75391b0017d',
@@ -27,6 +27,9 @@ class FunkIE(InfoExtractor):
}, {
'url': 'https://www.funk.net/playlist/neuesteVideos/kameras-auf-dem-fusion-festival-1618699',
'only_matching': True,
}, {
'url': 'https://play.funk.net/playlist/neuesteVideos/george-floyd-wenn-die-polizei-toetet-der-fall-2004391',
'only_matching': True,
}]
def _real_extract(self, url):

View File

@@ -8,6 +8,9 @@ from .common import InfoExtractor
from .commonprotocols import RtmpIE
from .youtube import YoutubeIE
from ..compat import compat_etree_fromstring
from ..cookies import LenientSimpleCookie
from ..networking.exceptions import HTTPError
from ..networking.impersonate import ImpersonateTarget
from ..utils import (
KNOWN_EXTENSIONS,
MEDIA_EXTENSIONS,
@@ -43,6 +46,7 @@ from ..utils import (
xpath_text,
xpath_with_ns,
)
from ..utils._utils import _UnsafeExtensionError
class GenericIE(InfoExtractor):
@@ -2167,7 +2171,15 @@ class GenericIE(InfoExtractor):
urllib.parse.urlparse(fragment_query).query or fragment_query
or urllib.parse.urlparse(manifest_url).query or None)
hex_or_none = lambda x: x if re.fullmatch(r'(0x)?[\da-f]+', x, re.IGNORECASE) else None
key_query = self._configuration_arg('key_query', [None], casesense=True)[0]
if key_query is not None:
info['extra_param_to_key_url'] = (
urllib.parse.urlparse(key_query).query or key_query
or urllib.parse.urlparse(manifest_url).query or None)
def hex_or_none(value):
return value if re.fullmatch(r'(0x)?[\da-f]+', value, re.IGNORECASE) else None
info['hls_aes'] = traverse_obj(self._configuration_arg('hls_key', casesense=True), {
'uri': (0, {url_or_none}), 'key': (0, {hex_or_none}), 'iv': (1, {hex_or_none}),
}) or None
@@ -2331,7 +2343,7 @@ class GenericIE(InfoExtractor):
default_search = 'fixup_error'
if default_search in ('auto', 'auto_warning', 'fixup_error'):
if re.match(r'^[^\s/]+\.[^\s/]+/', url):
if re.match(r'[^\s/]+\.[^\s/]+/', url):
self.report_warning('The url doesn\'t specify the protocol, trying with http')
return self.url_result('http://' + url)
elif default_search != 'fixup_error':
@@ -2364,6 +2376,11 @@ class GenericIE(InfoExtractor):
else:
video_id = self._generic_id(url)
# Do not impersonate by default; see https://github.com/yt-dlp/yt-dlp/issues/11335
impersonate = self._configuration_arg('impersonate', ['false'])
if 'false' in impersonate:
impersonate = None
# Some webservers may serve compressed content of rather big size (e.g. gzipped flac)
# making it impossible to download only chunk of the file (yet we need only 512kB to
# test whether it's HTML or not). According to yt-dlp default Accept-Encoding
@@ -2372,10 +2389,29 @@ class GenericIE(InfoExtractor):
# to accept raw bytes and being able to download only a chunk.
# It may probably better to solve this by checking Content-Type for application/octet-stream
# after a HEAD request, but not sure if we can rely on this.
full_response = self._request_webpage(url, video_id, headers=filter_dict({
'Accept-Encoding': 'identity',
'Referer': smuggled_data.get('referer'),
}))
try:
full_response = self._request_webpage(url, video_id, headers=filter_dict({
'Accept-Encoding': 'identity',
'Referer': smuggled_data.get('referer'),
}), impersonate=impersonate)
except ExtractorError as e:
if not (isinstance(e.cause, HTTPError) and e.cause.status == 403
and e.cause.response.get_header('cf-mitigated') == 'challenge'
and e.cause.response.extensions.get('impersonate') is None):
raise
cf_cookie_domain = traverse_obj(
LenientSimpleCookie(e.cause.response.get_header('set-cookie')),
('__cf_bm', 'domain'))
if cf_cookie_domain:
self.write_debug(f'Clearing __cf_bm cookie for {cf_cookie_domain}')
self.cookiejar.clear(domain=cf_cookie_domain, path='/', name='__cf_bm')
msg = 'Got HTTP Error 403 caused by Cloudflare anti-bot challenge; '
if not self._downloader._impersonate_target_available(ImpersonateTarget()):
msg += ('see https://github.com/yt-dlp/yt-dlp#impersonation for '
'how to install the required impersonation dependency, and ')
raise ExtractorError(
f'{msg}try again with --extractor-args "generic:impersonate"', expected=True)
new_url = full_response.url
if new_url != extract_basic_auth(url)[0]:
self.report_following_redirect(new_url)
@@ -2391,7 +2427,7 @@ class GenericIE(InfoExtractor):
# Check for direct link to a video
content_type = full_response.headers.get('Content-Type', '').lower()
m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
m = re.match(r'(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
if m:
self.report_detected('direct video link')
headers = filter_dict({'Referer': smuggled_data.get('referer')})
@@ -2438,9 +2474,13 @@ class GenericIE(InfoExtractor):
if not is_html(first_bytes):
self.report_warning(
'URL could be a direct video link, returning it as such.')
ext = determine_ext(url)
if ext not in _UnsafeExtensionError.ALLOWED_EXTENSIONS:
ext = 'unknown_video'
info_dict.update({
'direct': True,
'url': url,
'ext': ext,
})
return info_dict

View File

@@ -0,0 +1,91 @@
from .common import InfoExtractor
from .vimeo import VimeoIE
from ..utils import (
parse_qs,
traverse_obj,
url_or_none,
)
class GermanupaIE(InfoExtractor):
IE_DESC = 'germanupa.de'
_VALID_URL = r'https?://germanupa\.de/mediathek/(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://germanupa.de/mediathek/4-figma-beratung-deine-sprechstunde-fuer-figma-fragen',
'info_dict': {
'id': '909179246',
'title': 'Tutorial: #4 Figma Beratung - Deine Sprechstunde für Figma-Fragen',
'ext': 'mp4',
'uploader': 'German UPA',
'uploader_id': 'germanupa',
'thumbnail': 'https://i.vimeocdn.com/video/1792564420-7415283ccef8bf8702dab8c6b7515555ceeb7a1c11371ffcc133b8e887dbf70e-d_1280',
'uploader_url': 'https://vimeo.com/germanupa',
'duration': 3987,
},
'expected_warnings': ['Failed to parse XML: not well-formed'],
'params': {'skip_download': 'm3u8'},
}, {
'note': 'audio, uses GenericIE',
'url': 'https://germanupa.de/mediathek/live-vom-ux-festival-neuigkeiten-von-figma-jobmarkt-agenturszene-interview-zu-sustainable',
'info_dict': {
'id': '1867346676',
'title': 'Live vom UX Festival: Neuigkeiten von Figma, Jobmarkt, Agenturszene & Interview zu Sustainable UX',
'ext': 'opus',
'timestamp': 1720545088,
'upload_date': '20240709',
'duration': 3910.557,
'like_count': int,
'description': 'md5:db2aed5ff131e177a7b33901e9a8db05',
'uploader': 'German UPA',
'repost_count': int,
'genres': ['Science'],
'license': 'all-rights-reserved',
'uploader_url': 'https://soundcloud.com/user-80097677',
'uploader_id': '471579486',
'view_count': int,
'comment_count': int,
'thumbnail': 'https://i1.sndcdn.com/artworks-oCti2e9GhaZFWBqY-48ybGw-original.jpg',
},
}, {
'note': 'Nur für Mitglieder/Just for members',
'url': 'https://germanupa.de/mediathek/ux-festival-2024-usability-tests-und-ai',
'info_dict': {
'id': '986994430',
'title': 'UX Festival 2024 "Usability Tests und AI" von Lennart Weber',
'ext': 'mp4',
'release_date': '20240719',
'uploader_url': 'https://vimeo.com/germanupa',
'timestamp': 1721373980,
'license': 'by-sa',
'like_count': int,
'thumbnail': 'https://i.vimeocdn.com/video/1904187064-2a672630c30f9ad787bd390bff3f51d7506a3e8416763ba6dbf465732b165c5c-d_1280',
'duration': 2146,
'release_timestamp': 1721373980,
'uploader': 'German UPA',
'uploader_id': 'germanupa',
'upload_date': '20240719',
'comment_count': int,
},
'expected_warnings': ['Failed to parse XML: not well-formed'],
'skip': 'login required',
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
param_url = traverse_obj(
self._search_regex(
r'<iframe[^>]+data-src\s*?=\s*?([\'"])(?P<url>https://germanupa\.de/media/oembed\?url=(?:(?!\1).)+)\1',
webpage, 'embedded video', default=None, group='url'),
({parse_qs}, 'url', 0, {url_or_none}))
if not param_url:
if self._search_regex(
r'<div[^>]+class\s*?=\s*?([\'"])(?:(?!\1).)*login-wrapper(?:(?!\1).)*\1',
webpage, 'login wrapper', default=None):
self.raise_login_required('This video is only available for members')
return self.url_result(url, 'Generic') # Fall back to generic to extract audio
real_url = param_url.replace('https://vimeo.com/', 'https://player.vimeo.com/video/')
return self.url_result(VimeoIE._smuggle_referrer(real_url, url), VimeoIE, video_id)

View File

@@ -52,7 +52,7 @@ class GetCourseRuIE(InfoExtractor):
_BASE_URL_RE = rf'https?://(?:(?!player02\.)[^.]+\.getcourse\.(?:ru|io)|{"|".join(map(re.escape, _DOMAINS))})'
_VALID_URL = [
rf'{_BASE_URL_RE}/(?!pl/|teach/)(?P<id>[^?#]+)',
rf'{_BASE_URL_RE}/(:?pl/)?teach/control/lesson/view\?(?:[^#]+&)?id=(?P<id>\d+)',
rf'{_BASE_URL_RE}/(?:pl/)?teach/control/lesson/view\?(?:[^#]+&)?id=(?P<id>\d+)',
]
_TESTS = [{
'url': 'http://academymel.online/3video_1',

View File

@@ -5,6 +5,7 @@ from ..utils import (
ExtractorError,
determine_ext,
int_or_none,
join_nonempty,
parse_age_limit,
remove_end,
remove_start,
@@ -287,7 +288,7 @@ class GoIE(AdobePassIE):
if mobj:
height = int(mobj.group(2))
f.update({
'format_id': (f'{format_id}-' if format_id else '') + f'{height}P',
'format_id': join_nonempty(format_id, f'{height}P'),
'width': int(mobj.group(1)),
'height': height,
})

View File

@@ -7,7 +7,7 @@ from ..utils import (
class GolemIE(InfoExtractor):
_VALID_URL = r'^https?://video\.golem\.de/.+?/(?P<id>.+?)/'
_VALID_URL = r'https?://video\.golem\.de/.+?/(?P<id>.+?)/'
_TEST = {
'url': 'http://video.golem.de/handy/14095/iphone-6-und-6-plus-test.html',
'md5': 'c1a2c0a3c863319651c7c992c5ee29bf',

View File

@@ -0,0 +1,32 @@
from .common import InfoExtractor
from ..utils import update_url, url_or_none
from ..utils.traversal import traverse_obj
class GraspopIE(InfoExtractor):
_VALID_URL = r'https?://vod\.graspop\.be/[a-z]{2}/(?P<id>\d+)/'
_TESTS = [{
'url': 'https://vod.graspop.be/fr/101556/thy-art-is-murder-concert/',
'info_dict': {
'id': '101556',
'ext': 'mp4',
'title': 'Thy Art Is Murder',
'thumbnail': r're:https://cdn-mds\.pickx\.be/festivals/v3/global/original/.+\.jpg',
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
metadata = self._download_json(
f'https://tv.proximus.be/MWC/videocenter/festivals/{video_id}/stream', video_id)
return {
'id': video_id,
'formats': self._extract_m3u8_formats(
# Downgrade manifest request to avoid incomplete certificate chain error
update_url(metadata['source']['assetUri'], scheme='http'), video_id, 'mp4'),
**traverse_obj(metadata, {
'title': ('name', {str}),
'thumbnail': ('source', 'poster', {url_or_none}),
}),
}

View File

@@ -3,6 +3,7 @@ import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
join_nonempty,
parse_duration,
urljoin,
xpath_element,
@@ -69,7 +70,7 @@ class HBOBaseIE(InfoExtractor):
height = format_info.get('height')
fmt = {
'url': path,
'format_id': 'http{}'.format(f'-{height}p' if height else ''),
'format_id': join_nonempty('http'. height and f'{height}p'),
'width': format_info.get('width'),
'height': height,
}

View File

@@ -44,9 +44,6 @@ class HKETVIE(InfoExtractor):
'duration': 907,
'subtitles': {},
},
'params': {
'geo_verification_proxy': '<HK proxy here>',
},
'skip': 'Geo restricted to HK',
}]

View File

@@ -13,7 +13,7 @@ from ..utils import (
class HRFernsehenIE(InfoExtractor):
IE_NAME = 'hrfernsehen'
_VALID_URL = r'^https?://www\.(?:hr-fernsehen|hessenschau)\.de/.*,video-(?P<id>[0-9]{6})\.html'
_VALID_URL = r'https?://www\.(?:hr-fernsehen|hessenschau)\.de/.*,video-(?P<id>[0-9]{6})\.html'
_TESTS = [{
'url': 'https://www.hessenschau.de/tv-sendung/hessenschau-vom-26082020,video-130546.html',
'md5': '5c4e0ba94677c516a2f65a84110fc536',

View File

@@ -8,15 +8,19 @@ from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
parse_duration,
str_or_none,
try_get,
unescapeHTML,
unified_strdate,
update_url_query,
url_or_none,
)
from ..utils.traversal import traverse_obj
class HuyaLiveIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.|m\.)?huya\.com/(?P<id>[^/#?&]+)(?:\D|$)'
_VALID_URL = r'https?://(?:www\.|m\.)?huya\.com/(?!(?:video/play/))(?P<id>[^/#?&]+)(?:\D|$)'
IE_NAME = 'huya:live'
IE_DESC = 'huya.com'
TESTS = [{
@@ -24,6 +28,7 @@ class HuyaLiveIE(InfoExtractor):
'info_dict': {
'id': '572329',
'title': str,
'ext': 'flv',
'description': str,
'is_live': True,
'view_count': int,
@@ -131,3 +136,76 @@ class HuyaLiveIE(InfoExtractor):
fm = base64.b64decode(params['fm']).decode().split('_', 1)[0]
ss = hashlib.md5('|'.join([params['seqid'], params['ctype'], params['t']]))
return fm, ss
class HuyaVideoIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?huya\.com/video/play/(?P<id>\d+)\.html'
IE_NAME = 'huya:video'
IE_DESC = '虎牙视频'
_TESTS = [{
'url': 'https://www.huya.com/video/play/1002412640.html',
'info_dict': {
'id': '1002412640',
'ext': 'mp4',
'title': '8月3日',
'thumbnail': r're:https?://.*\.jpg',
'duration': 14,
'uploader': '虎牙-ATS欧卡车队青木',
'uploader_id': '1564376151',
'upload_date': '20240803',
'view_count': int,
'comment_count': int,
'like_count': int,
},
},
{
'url': 'https://www.huya.com/video/play/556054543.html',
'info_dict': {
'id': '556054543',
'ext': 'mp4',
'title': '我不挑事 也不怕事',
'thumbnail': r're:https?://.*\.jpg',
'duration': 1864,
'uploader': '卡尔',
'uploader_id': '367138632',
'upload_date': '20210811',
'view_count': int,
'comment_count': int,
'like_count': int,
},
}]
def _real_extract(self, url: str):
video_id = self._match_id(url)
video_data = self._download_json(
'https://liveapi.huya.com/moment/getMomentContent', video_id,
query={'videoId': video_id})['data']['moment']['videoInfo']
formats = []
for definition in traverse_obj(video_data, ('definitions', lambda _, v: url_or_none(v['url']))):
formats.append({
'url': definition['url'],
**traverse_obj(definition, {
'format_id': ('defName', {str}),
'width': ('width', {int_or_none}),
'height': ('height', {int_or_none}),
'filesize': ('size', {int_or_none}),
}),
})
return {
'id': video_id,
'formats': formats,
**traverse_obj(video_data, {
'title': ('videoTitle', {str}),
'thumbnail': ('videoCover', {url_or_none}),
'duration': ('videoDuration', {parse_duration}),
'uploader': ('nickName', {str}),
'uploader_id': ('uid', {str_or_none}),
'upload_date': ('videoUploadTime', {unified_strdate}),
'view_count': ('videoPlayNum', {int_or_none}),
'comment_count': ('videoCommentNum', {int_or_none}),
'like_count': ('favorCount', {int_or_none}),
}),
}

View File

@@ -1,4 +1,3 @@
import functools
from .common import InfoExtractor
from ..utils import (
@@ -63,7 +62,7 @@ class IlPostIE(InfoExtractor):
'url': ('podcast_raw_url', {url_or_none}),
'thumbnail': ('image', {url_or_none}),
'timestamp': ('timestamp', {int_or_none}),
'duration': ('milliseconds', {functools.partial(float_or_none, scale=1000)}),
'duration': ('milliseconds', {float_or_none(scale=1000)}),
'availability': ('free', {lambda v: 'public' if v else 'subscriber_only'}),
}),
}

View File

@@ -37,7 +37,7 @@ class ImgurBaseIE(InfoExtractor):
class ImgurIE(ImgurBaseIE):
_VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!(?:a|gallery|t|topic|r)/)(?P<id>[a-zA-Z0-9]+)'
_VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!(?:a|gallery|t|topic|r)/)(?:[^/?#]+-)?(?P<id>[a-zA-Z0-9]+)'
_TESTS = [{
'url': 'https://imgur.com/A61SaA1',
@@ -54,6 +54,22 @@ class ImgurIE(ImgurBaseIE):
'like_count': int,
'thumbnail': 'https://i.imgur.com/A61SaA1h.jpg',
},
}, {
# Test with URL slug
'url': 'https://imgur.com/mrw-gifv-is-up-running-without-any-bugs-A61SaA1',
'info_dict': {
'id': 'A61SaA1',
'ext': 'mp4',
'title': 'MRW gifv is up and running without any bugs',
'timestamp': 1416446068,
'upload_date': '20141120',
'dislike_count': int,
'comment_count': int,
'release_timestamp': 1416446068,
'release_date': '20141120',
'like_count': int,
'thumbnail': 'https://i.imgur.com/A61SaA1h.jpg',
},
}, {
'url': 'https://i.imgur.com/A61SaA1.gifv',
'only_matching': True,
@@ -92,6 +108,7 @@ class ImgurIE(ImgurBaseIE):
'comment_count': int,
'release_timestamp': 1710491255,
'release_date': '20240315',
'thumbnail': 'https://i.imgur.com/zV03bd5h.jpg',
},
}]
@@ -208,7 +225,10 @@ class ImgurIE(ImgurBaseIE):
}), get_all=False),
'id': video_id,
'formats': formats,
'thumbnail': url_or_none(search('thumbnailUrl')),
'thumbnails': [{
'url': thumbnail_url,
'http_headers': {'Accept': '*/*'},
}] if (thumbnail_url := search(['thumbnailUrl', 'twitter:image', 'og:image'])) else None,
'http_headers': {'Accept': '*/*'},
}
@@ -252,17 +272,9 @@ class ImgurGalleryBaseIE(ImgurBaseIE):
class ImgurGalleryIE(ImgurGalleryBaseIE):
IE_NAME = 'imgur:gallery'
_VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:gallery|(?:t(?:opic)?|r)/[^/?#]+)/(?P<id>[a-zA-Z0-9]+)'
_VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:gallery|(?:t(?:opic)?|r)/[^/?#]+)/(?:[^/?#]+-)?(?P<id>[a-zA-Z0-9]+)'
_TESTS = [{
'url': 'http://imgur.com/gallery/Q95ko',
'info_dict': {
'id': 'Q95ko',
'title': 'Adding faces make every GIF better',
},
'playlist_count': 25,
'skip': 'Zoinks! You\'ve taken a wrong turn.',
}, {
# TODO: static images - replace with animated/video gallery
'url': 'http://imgur.com/topic/Aww/ll5Vk',
'only_matching': True,
@@ -280,7 +292,27 @@ class ImgurGalleryIE(ImgurGalleryBaseIE):
'release_timestamp': 1358554297,
'thumbnail': 'https://i.imgur.com/YcAQlkxh.jpg',
'release_date': '20130119',
'uploader_url': 'https://i.imgur.com/u3R4I2S_d.png?maxwidth=290&fidelity=grand',
'uploader_url': 'https://i.imgur.com/N5Flb2v_d.png?maxwidth=290&fidelity=grand',
'comment_count': int,
'dislike_count': int,
'like_count': int,
},
}, {
# Test with slug
'url': 'https://imgur.com/gallery/classic-steve-carell-gif-cracks-me-up-everytime-repost-downvotes-YcAQlkx',
'add_ies': ['Imgur'],
'info_dict': {
'id': 'YcAQlkx',
'ext': 'mp4',
'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....',
'timestamp': 1358554297,
'upload_date': '20130119',
'uploader_id': '1648642',
'uploader': 'wittyusernamehere',
'release_timestamp': 1358554297,
'release_date': '20130119',
'thumbnail': 'https://i.imgur.com/YcAQlkxh.jpg',
'uploader_url': 'https://i.imgur.com/N5Flb2v_d.png?maxwidth=290&fidelity=grand',
'comment_count': int,
'dislike_count': int,
'like_count': int,
@@ -317,6 +349,13 @@ class ImgurGalleryIE(ImgurGalleryBaseIE):
'title': 'Penguins !',
},
'playlist_count': 3,
}, {
'url': 'https://imgur.com/t/unmuted/penguins-penguins-6lAn9VQ',
'info_dict': {
'id': '6lAn9VQ',
'title': 'Penguins !',
},
'playlist_count': 3,
}, {
'url': 'https://imgur.com/t/unmuted/kx2uD3C',
'add_ies': ['Imgur'],
@@ -357,7 +396,7 @@ class ImgurGalleryIE(ImgurGalleryBaseIE):
class ImgurAlbumIE(ImgurGalleryBaseIE):
IE_NAME = 'imgur:album'
_VALID_URL = r'https?://(?:i\.)?imgur\.com/a/(?P<id>[a-zA-Z0-9]+)'
_VALID_URL = r'https?://(?:i\.)?imgur\.com/a/(?:[^/?#]+-)?(?P<id>[a-zA-Z0-9]+)'
_GALLERY = False
_TESTS = [{
# TODO: only static images - replace with animated/video gallery
@@ -372,6 +411,14 @@ class ImgurAlbumIE(ImgurGalleryBaseIE):
'title': 'enen-no-shouboutai',
},
'playlist_count': 2,
}, {
# Test with URL slug
'url': 'https://imgur.com/a/enen-no-shouboutai-iX265HX',
'info_dict': {
'id': 'iX265HX',
'title': 'enen-no-shouboutai',
},
'playlist_count': 2,
}, {
'url': 'https://imgur.com/a/8pih2Ed',
'info_dict': {

View File

@@ -48,7 +48,6 @@ class InstagramBaseIE(InfoExtractor):
'X-IG-WWW-Claim': '0',
'Origin': 'https://www.instagram.com',
'Accept': '*/*',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36',
}
def _perform_login(self, username, password):
@@ -435,10 +434,10 @@ class InstagramIE(InstagramBaseIE):
'X-Requested-With': 'XMLHttpRequest',
'Referer': url,
}, query={
'query_hash': '9f8827793ef34641b2fb195d4d41151c',
'doc_id': '8845758582119845',
'variables': json.dumps(variables, separators=(',', ':')),
})
media.update(traverse_obj(general_info, ('data', 'shortcode_media')) or {})
media.update(traverse_obj(general_info, ('data', 'xdt_shortcode_media')) or {})
if not general_info:
self.report_warning('General metadata extraction failed (some metadata might be missing).', video_id)
@@ -453,7 +452,7 @@ class InstagramIE(InstagramBaseIE):
else:
self.report_warning('Main webpage is locked behind the login page. Retrying with embed webpage (some metadata might be missing).')
webpage = self._download_webpage(
f'{url}/embed/', video_id, note='Downloading embed webpage', fatal=False)
f'{url}/embed/', video_id, note='Downloading embed webpage', fatal=False) or ''
additional_data = self._search_json(
r'window\.__additionalDataLoaded\s*\(\s*[^,]+,', webpage, 'additional data', video_id, fatal=False)
if not additional_data and not media:

View File

@@ -25,9 +25,29 @@ class IPrimaIE(InfoExtractor):
'id': 'p51388',
'ext': 'mp4',
'title': 'Partička (92)',
'description': 'md5:859d53beae4609e6dd7796413f1b6cac',
'upload_date': '20201103',
'timestamp': 1604437480,
'description': 'md5:57943f6a50d6188288c3a579d2fd5f01',
'episode': 'Partička (92)',
'season': 'Partička',
'series': 'Prima Partička',
'episode_number': 92,
'thumbnail': 'https://d31b9s05ygj54s.cloudfront.net/prima-plus/image/video-ef6cf9de-c980-4443-92e4-17fe8bccd45c-16x9.jpeg',
},
'params': {
'skip_download': True, # m3u8 download
},
}, {
'url': 'https://zoom.iprima.cz/porady/krasy-kanarskych-ostrovu/tenerife-v-risi-ohne',
'info_dict': {
'id': 'p1412199',
'ext': 'mp4',
'episode_number': 3,
'episode': 'Tenerife: V říši ohně',
'description': 'md5:4b4a05c574b5eaef130e68d4811c3f2c',
'duration': 3111.0,
'thumbnail': 'https://d31b9s05ygj54s.cloudfront.net/prima-plus/image/video-f66dd7fb-c1a0-47d1-b3bc-7db328d566c5-16x9-1711636518.jpg/t_16x9_medium_1366_768',
'title': 'Tenerife: V říši ohně',
'timestamp': 1711825800,
'upload_date': '20240330',
},
'params': {
'skip_download': True, # m3u8 download
@@ -131,6 +151,7 @@ class IPrimaIE(InfoExtractor):
video_id = self._search_regex((
r'productId\s*=\s*([\'"])(?P<id>p\d+)\1',
r'pproduct_id\s*=\s*([\'"])(?P<id>p\d+)\1',
r'let\s+videos\s*=\s*([\'"])(?P<id>p\d+)\1',
), webpage, 'real id', group='id', default=None)
if not video_id:
@@ -176,7 +197,7 @@ class IPrimaIE(InfoExtractor):
final_result = self._search_json_ld(webpage, video_id, default={})
final_result.update({
'id': video_id,
'title': title,
'title': final_result.get('title') or title,
'thumbnail': self._html_search_meta(
['thumbnail', 'og:image', 'twitter:image'],
webpage, 'thumbnail', default=None),

View File

@@ -2,7 +2,6 @@ import functools
import hashlib
import json
import time
import urllib.error
import urllib.parse
from .common import InfoExtractor

View File

@@ -194,11 +194,14 @@ class ShugiinItvVodIE(ShugiinItvBaseIE):
class SangiinInstructionIE(InfoExtractor):
_VALID_URL = r'^https?://www\.webtv\.sangiin\.go\.jp/webtv/index\.php'
_VALID_URL = r'https?://www\.webtv\.sangiin\.go\.jp/webtv/index\.php'
IE_DESC = False # this shouldn't be listed as a supported site
def _real_extract(self, url):
raise ExtractorError('Copy the link from the botton below the video description or player, and use the link to download. If there are no button in the frame, get the URL of the frame showing the video.', expected=True)
raise ExtractorError(
'Copy the link from the button below the video description/player '
'and use that link to download. If there is no button in the frame, '
'get the URL of the frame showing the video.', expected=True)
class SangiinIE(InfoExtractor):

View File

@@ -326,11 +326,11 @@ class JioCinemaIE(JioCinemaBaseIE):
# fallback metadata
'title': ('name', {str}),
'description': ('fullSynopsis', {str}),
'series': ('show', 'name', {str}, {lambda x: x or None}),
'series': ('show', 'name', {str}, filter),
'season': ('tournamentName', {str}, {lambda x: x if x != 'Season 0' else None}),
'season_number': ('episode', 'season', {int_or_none}, {lambda x: x or None}),
'season_number': ('episode', 'season', {int_or_none}, filter),
'episode': ('fullTitle', {str}),
'episode_number': ('episode', 'episodeNo', {int_or_none}, {lambda x: x or None}),
'episode_number': ('episode', 'episodeNo', {int_or_none}, filter),
'age_limit': ('ageNemonic', {parse_age_limit}),
'duration': ('totalDuration', {float_or_none}),
'thumbnail': ('images', {url_or_none}),
@@ -338,10 +338,10 @@ class JioCinemaIE(JioCinemaBaseIE):
**traverse_obj(metadata, ('result', 0, {
'title': ('fullTitle', {str}),
'description': ('fullSynopsis', {str}),
'series': ('showName', {str}, {lambda x: x or None}),
'season': ('seasonName', {str}, {lambda x: x or None}),
'series': ('showName', {str}, filter),
'season': ('seasonName', {str}, filter),
'season_number': ('season', {int_or_none}),
'season_id': ('seasonId', {str}, {lambda x: x or None}),
'season_id': ('seasonId', {str}, filter),
'episode': ('fullTitle', {str}),
'episode_number': ('episode', {int_or_none}),
'timestamp': ('uploadTime', {int_or_none}),
@@ -364,20 +364,25 @@ class JioCinemaSeriesIE(JioCinemaBaseIE):
'title': 'naagin',
},
'playlist_mincount': 120,
}, {
'url': 'https://www.jiocinema.com/tv-shows/mtv-splitsvilla-x5/3499820',
'info_dict': {
'id': '3499820',
'title': 'mtv-splitsvilla-x5',
},
'playlist_mincount': 310,
}]
def _entries(self, series_id):
seasons = self._download_json(
f'{self._METADATA_API_BASE}/voot/v1/voot-web/content/generic/season-by-show', series_id,
'Downloading series metadata JSON', query={
'sort': 'season:asc',
'id': series_id,
'responseType': 'common',
})
seasons = traverse_obj(self._download_json(
f'{self._METADATA_API_BASE}/voot/v1/voot-web/view/show/{series_id}', series_id,
'Downloading series metadata JSON', query={'responseType': 'common'}), (
'trays', lambda _, v: v['trayId'] == 'season-by-show-multifilter',
'trayTabs', lambda _, v: v['id']))
for season_num, season in enumerate(traverse_obj(seasons, ('result', lambda _, v: v['id'])), 1):
for season_num, season in enumerate(seasons, start=1):
season_id = season['id']
label = season.get('season') or season_num
label = season.get('label') or season_num
for page_num in itertools.count(1):
episodes = traverse_obj(self._download_json(
f'{self._METADATA_API_BASE}/voot/v1/voot-web/content/generic/series-wise-episode',

View File

@@ -158,7 +158,7 @@ class JioSaavnAlbumIE(JioSaavnBaseIE):
class JioSaavnPlaylistIE(JioSaavnBaseIE):
IE_NAME = 'jiosaavn:playlist'
_VALID_URL = r'https?://(?:www\.)?(?:jio)?saavn\.com/s/playlist/(?:[^/?#]+/){2}(?P<id>[^/?#]+)'
_VALID_URL = r'https?://(?:www\.)?(?:jio)?saavn\.com/(?:s/playlist/(?:[^/?#]+/){2}|featured/[^/?#]+/)(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'https://www.jiosaavn.com/s/playlist/2279fbe391defa793ad7076929a2f5c9/mood-english/LlJ8ZWT1ibN5084vKHRj2Q__',
'info_dict': {
@@ -173,6 +173,13 @@ class JioSaavnPlaylistIE(JioSaavnBaseIE):
'title': 'Mood Hindi',
},
'playlist_mincount': 801,
}, {
'url': 'https://www.jiosaavn.com/featured/taaza-tunes/Me5RridRfDk_',
'info_dict': {
'id': 'Me5RridRfDk_',
'title': 'Taaza Tunes',
},
'playlist_mincount': 301,
}]
_PAGE_SIZE = 50

View File

@@ -22,7 +22,7 @@ class KalturaIE(InfoExtractor):
(?:
kaltura:(?P<partner_id>\w+):(?P<id>\w+)(?::(?P<player_type>\w+))?|
https?://
(:?(?:www|cdnapi(?:sec)?)\.)?kaltura\.com(?::\d+)?/
(?:(?:www|cdnapi(?:sec)?)\.)?kaltura\.com(?::\d+)?/
(?:
(?:
# flash player

View File

@@ -3,43 +3,52 @@ import json
from .common import InfoExtractor
from ..utils import (
int_or_none,
make_archive_id,
parse_iso8601,
try_get,
str_or_none,
traverse_obj,
url_or_none,
urljoin,
)
class KhanAcademyBaseIE(InfoExtractor):
_VALID_URL_TEMPL = r'https?://(?:www\.)?khanacademy\.org/(?P<id>(?:[^/]+/){%s}%s[^?#/&]+)'
_PUBLISHED_CONTENT_VERSION = 'dc34750f0572c80f5effe7134082fe351143c1e4'
def _parse_video(self, video):
return {
'_type': 'url_transparent',
'url': video['youtubeId'],
'id': video.get('slug'),
'title': video.get('title'),
'thumbnail': video.get('imageUrl') or video.get('thumbnailUrl'),
'duration': int_or_none(video.get('duration')),
'description': video.get('description'),
'id': video['youtubeId'],
'ie_key': 'Youtube',
**traverse_obj(video, {
'display_id': ('id', {str_or_none}),
'title': ('translatedTitle', {str}),
'thumbnail': ('thumbnailUrls', ..., 'url', {url_or_none}),
'duration': ('duration', {int_or_none}),
'description': ('description', {str}),
}, get_all=False),
}
def _real_extract(self, url):
display_id = self._match_id(url)
content = self._download_json(
'https://www.khanacademy.org/api/internal/graphql/FetchContentData',
display_id, query={
'https://www.khanacademy.org/api/internal/graphql/ContentForPath', display_id,
query={
'fastly_cacheable': 'persist_until_publish',
'hash': '4134764944',
'lang': 'en',
'pcv': self._PUBLISHED_CONTENT_VERSION,
'hash': '3712657851',
'variables': json.dumps({
'path': display_id,
'queryParams': 'lang=en',
'isModal': False,
'followRedirects': True,
'countryCode': 'US',
'kaLocale': 'en',
'clientPublishedContentVersion': self._PUBLISHED_CONTENT_VERSION,
}),
})['data']['contentJson']
return self._parse_component_props(self._parse_json(content, display_id)['componentProps'])
'lang': 'en',
})['data']['contentRoute']['listedPathData']
return self._parse_component_props(content, display_id)
class KhanAcademyIE(KhanAcademyBaseIE):
@@ -47,64 +56,98 @@ class KhanAcademyIE(KhanAcademyBaseIE):
_VALID_URL = KhanAcademyBaseIE._VALID_URL_TEMPL % ('4', 'v/')
_TEST = {
'url': 'https://www.khanacademy.org/computing/computer-science/cryptography/crypt/v/one-time-pad',
'md5': '9c84b7b06f9ebb80d22a5c8dedefb9a0',
'md5': '1d5c2e70fa6aa29c38eca419f12515ce',
'info_dict': {
'id': 'FlIG3TvQCBQ',
'ext': 'mp4',
'title': 'The one-time pad',
'description': 'The perfect cipher',
'display_id': '716378217',
'duration': 176,
'uploader': 'Brit Cruise',
'uploader_id': 'khanacademy',
'uploader': 'Khan Academy',
'uploader_id': '@khanacademy',
'uploader_url': 'https://www.youtube.com/@khanacademy',
'upload_date': '20120411',
'timestamp': 1334170113,
'license': 'cc-by-nc-sa',
'live_status': 'not_live',
'channel': 'Khan Academy',
'channel_id': 'UC4a-Gbdw7vOaccHmFo40b9g',
'channel_url': 'https://www.youtube.com/channel/UC4a-Gbdw7vOaccHmFo40b9g',
'channel_is_verified': True,
'playable_in_embed': True,
'categories': ['Education'],
'creators': ['Brit Cruise'],
'tags': [],
'age_limit': 0,
'availability': 'public',
'comment_count': int,
'channel_follower_count': int,
'thumbnail': str,
'view_count': int,
'like_count': int,
'heatmap': list,
},
'add_ie': ['Youtube'],
}
def _parse_component_props(self, component_props):
video = component_props['tutorialPageData']['contentModel']
info = self._parse_video(video)
author_names = video.get('authorNames')
info.update({
'uploader': ', '.join(author_names) if author_names else None,
'timestamp': parse_iso8601(video.get('dateAdded')),
'license': video.get('kaUserLicense'),
})
return info
def _parse_component_props(self, component_props, display_id):
video = component_props['content']
return {
**self._parse_video(video),
**traverse_obj(video, {
'creators': ('authorNames', ..., {str}),
'timestamp': ('dateAdded', {parse_iso8601}),
'license': ('kaUserLicense', {str}),
}),
}
class KhanAcademyUnitIE(KhanAcademyBaseIE):
IE_NAME = 'khanacademy:unit'
_VALID_URL = (KhanAcademyBaseIE._VALID_URL_TEMPL % ('2', '')) + '/?(?:[?#&]|$)'
_TEST = {
_VALID_URL = (KhanAcademyBaseIE._VALID_URL_TEMPL % ('1,2', '')) + '/?(?:[?#&]|$)'
_TESTS = [{
'url': 'https://www.khanacademy.org/computing/computer-science/cryptography',
'info_dict': {
'id': 'cryptography',
'id': 'x48c910b6',
'title': 'Cryptography',
'description': 'How have humans protected their secret messages through history? What has changed today?',
'display_id': 'computing/computer-science/cryptography',
'_old_archive_ids': ['khanacademyunit cryptography'],
},
'playlist_mincount': 31,
}
}, {
'url': 'https://www.khanacademy.org/computing/computer-science',
'info_dict': {
'id': 'x301707a0',
'title': 'Computer science theory',
'description': 'md5:4b472a4646e6cf6ec4ccb52c4062f8ba',
'display_id': 'computing/computer-science',
'_old_archive_ids': ['khanacademyunit computer-science'],
},
'playlist_mincount': 50,
}]
def _parse_component_props(self, component_props):
curation = component_props['curation']
def _parse_component_props(self, component_props, display_id):
course = component_props['course']
selected_unit = traverse_obj(course, (
'unitChildren', lambda _, v: v['relativeUrl'] == f'/{display_id}', any)) or course
entries = []
tutorials = try_get(curation, lambda x: x['tabs'][0]['modules'][0]['tutorials'], list) or []
for tutorial_number, tutorial in enumerate(tutorials, 1):
chapter_info = {
'chapter': tutorial.get('title'),
'chapter_number': tutorial_number,
'chapter_id': tutorial.get('id'),
}
for content_item in (tutorial.get('contentItems') or []):
if content_item.get('kind') == 'Video':
info = self._parse_video(content_item)
info.update(chapter_info)
entries.append(info)
def build_entry(entry):
return self.url_result(urljoin(
'https://www.khanacademy.org', entry['canonicalUrl']),
KhanAcademyIE, title=entry.get('translatedTitle'))
entries = traverse_obj(selected_unit, (
(('unitChildren', ...), None), 'allOrderedChildren', ..., 'curatedChildren',
lambda _, v: v['contentKind'] == 'Video' and v['canonicalUrl'], {build_entry}))
return self.playlist_result(
entries, curation.get('unit'), curation.get('title'),
curation.get('description'))
entries,
display_id=display_id,
**traverse_obj(selected_unit, {
'id': ('id', {str}),
'title': ('translatedTitle', {str}),
'description': ('translatedDescription', {str}),
'_old_archive_ids': ('slug', {str}, {lambda x: [make_archive_id(self, x)] if x else None}),
}))

View File

@@ -1,9 +1,13 @@
from .common import InfoExtractor
from ..networking import HEADRequest
from ..utils import (
UserNotLive,
determine_ext,
float_or_none,
int_or_none,
merge_dicts,
parse_iso8601,
str_or_none,
traverse_obj,
unified_timestamp,
@@ -25,104 +29,212 @@ class KickBaseIE(InfoExtractor):
def _call_api(self, path, display_id, note='Downloading API JSON', headers={}, **kwargs):
return self._download_json(
f'https://kick.com/api/v1/{path}', display_id, note=note,
f'https://kick.com/api/{path}', display_id, note=note,
headers=merge_dicts(headers, self._API_HEADERS), impersonate=True, **kwargs)
class KickIE(KickBaseIE):
IE_NAME = 'kick:live'
_VALID_URL = r'https?://(?:www\.)?kick\.com/(?!(?:video|categories|search|auth)(?:[/?#]|$))(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://kick.com/yuppy',
'url': 'https://kick.com/buddha',
'info_dict': {
'id': '6cde1-kickrp-joe-flemmingskick-info-heremust-knowmust-see21',
'id': '92722911-nopixel-40',
'ext': 'mp4',
'title': str,
'description': str,
'channel': 'yuppy',
'channel_id': '33538',
'uploader': 'Yuppy',
'uploader_id': '33793',
'upload_date': str,
'live_status': 'is_live',
'timestamp': int,
'thumbnail': r're:^https?://.*\.jpg',
'thumbnail': r're:https?://.+\.jpg',
'categories': list,
'upload_date': str,
'channel': 'buddha',
'channel_id': '32807',
'uploader': 'Buddha',
'uploader_id': '33057',
'live_status': 'is_live',
'concurrent_view_count': int,
'release_timestamp': int,
'age_limit': 18,
'release_date': str,
},
'skip': 'livestream',
'params': {'skip_download': 'livestream'},
# 'skip': 'livestream',
}, {
'url': 'https://kick.com/kmack710',
'url': 'https://kick.com/xqc',
'only_matching': True,
}]
@classmethod
def suitable(cls, url):
return False if (KickVODIE.suitable(url) or KickClipIE.suitable(url)) else super().suitable(url)
def _real_extract(self, url):
channel = self._match_id(url)
response = self._call_api(f'channels/{channel}', channel)
response = self._call_api(f'v2/channels/{channel}', channel)
if not traverse_obj(response, 'livestream', expected_type=dict):
raise UserNotLive(video_id=channel)
return {
'id': str(traverse_obj(
response, ('livestream', ('slug', 'id')), get_all=False, default=channel)),
'formats': self._extract_m3u8_formats(
response['playback_url'], channel, 'mp4', live=True),
'title': traverse_obj(
response, ('livestream', ('session_title', 'slug')), get_all=False, default=''),
'description': traverse_obj(response, ('user', 'bio')),
'channel': channel,
'channel_id': str_or_none(traverse_obj(response, 'id', ('livestream', 'channel_id'))),
'uploader': traverse_obj(response, 'name', ('user', 'username')),
'uploader_id': str_or_none(traverse_obj(response, 'user_id', ('user', 'id'))),
'is_live': True,
'timestamp': unified_timestamp(traverse_obj(response, ('livestream', 'created_at'))),
'thumbnail': traverse_obj(
response, ('livestream', 'thumbnail', 'url'), expected_type=url_or_none),
'categories': traverse_obj(response, ('recent_categories', ..., 'name')),
'formats': self._extract_m3u8_formats(response['playback_url'], channel, 'mp4', live=True),
**traverse_obj(response, {
'id': ('livestream', 'slug', {str}),
'title': ('livestream', 'session_title', {str}),
'description': ('user', 'bio', {str}),
'channel_id': (('id', ('livestream', 'channel_id')), {int}, {str_or_none}, any),
'uploader': (('name', ('user', 'username')), {str}, any),
'uploader_id': (('user_id', ('user', 'id')), {int}, {str_or_none}, any),
'timestamp': ('livestream', 'created_at', {unified_timestamp}),
'release_timestamp': ('livestream', 'start_time', {unified_timestamp}),
'thumbnail': ('livestream', 'thumbnail', 'url', {url_or_none}),
'categories': ('recent_categories', ..., 'name', {str}),
'concurrent_view_count': ('livestream', 'viewer_count', {int_or_none}),
'age_limit': ('livestream', 'is_mature', {bool}, {lambda x: 18 if x else 0}),
}),
}
class KickVODIE(KickBaseIE):
_VALID_URL = r'https?://(?:www\.)?kick\.com/video/(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})'
IE_NAME = 'kick:vod'
_VALID_URL = r'https?://(?:www\.)?kick\.com/[\w-]+/videos/(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})'
_TESTS = [{
'url': 'https://kick.com/video/58bac65b-e641-4476-a7ba-3707a35e60e3',
'url': 'https://kick.com/xqc/videos/8dd97a8d-e17f-48fb-8bc3-565f88dbc9ea',
'md5': '3870f94153e40e7121a6e46c068b70cb',
'info_dict': {
'id': '58bac65b-e641-4476-a7ba-3707a35e60e3',
'id': '8dd97a8d-e17f-48fb-8bc3-565f88dbc9ea',
'ext': 'mp4',
'title': '🤠REBIRTH IS BACK!!!!🤠!stake CODE JAREDFPS 🤠',
'description': 'md5:02b0c46f9b4197fb545ab09dddb85b1d',
'channel': 'jaredfps',
'channel_id': '26608',
'uploader': 'JaredFPS',
'uploader_id': '26799',
'upload_date': '20240402',
'timestamp': 1712097108,
'duration': 33859.0,
'title': '18+ #ad 🛑LIVE🛑CLICK🛑DRAMA🛑NEWS🛑STUFF🛑REACT🛑GET IN HHERE🛑BOP BOP🛑WEEEE WOOOO🛑',
'description': 'THE BEST AT ABSOLUTELY EVERYTHING. THE JUICER. LEADER OF THE JUICERS.',
'channel': 'xqc',
'channel_id': '668',
'uploader': 'xQc',
'uploader_id': '676',
'upload_date': '20240909',
'timestamp': 1725919141,
'duration': 10155.0,
'thumbnail': r're:^https?://.*\.jpg',
'categories': ['Call of Duty: Warzone'],
'view_count': int,
'categories': ['Just Chatting'],
'age_limit': 0,
},
'params': {
'skip_download': 'm3u8',
},
'expected_warnings': [r'impersonation'],
'params': {'skip_download': 'm3u8'},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
response = self._call_api(f'video/{video_id}', video_id)
response = self._call_api(f'v1/video/{video_id}', video_id)
return {
'id': video_id,
'formats': self._extract_m3u8_formats(response['source'], video_id, 'mp4'),
'title': traverse_obj(
response, ('livestream', ('session_title', 'slug')), get_all=False, default=''),
'description': traverse_obj(response, ('livestream', 'channel', 'user', 'bio')),
'channel': traverse_obj(response, ('livestream', 'channel', 'slug')),
'channel_id': str_or_none(traverse_obj(response, ('livestream', 'channel', 'id'))),
'uploader': traverse_obj(response, ('livestream', 'channel', 'user', 'username')),
'uploader_id': str_or_none(traverse_obj(response, ('livestream', 'channel', 'user_id'))),
'timestamp': unified_timestamp(response.get('created_at')),
'duration': float_or_none(traverse_obj(response, ('livestream', 'duration')), scale=1000),
'thumbnail': traverse_obj(
response, ('livestream', 'thumbnail'), expected_type=url_or_none),
'categories': traverse_obj(response, ('livestream', 'categories', ..., 'name')),
**traverse_obj(response, {
'title': ('livestream', ('session_title', 'slug'), {str}, any),
'description': ('livestream', 'channel', 'user', 'bio', {str}),
'channel': ('livestream', 'channel', 'slug', {str}),
'channel_id': ('livestream', 'channel', 'id', {int}, {str_or_none}),
'uploader': ('livestream', 'channel', 'user', 'username', {str}),
'uploader_id': ('livestream', 'channel', 'user_id', {int}, {str_or_none}),
'timestamp': ('created_at', {parse_iso8601}),
'duration': ('livestream', 'duration', {float_or_none(scale=1000)}),
'thumbnail': ('livestream', 'thumbnail', {url_or_none}),
'categories': ('livestream', 'categories', ..., 'name', {str}),
'view_count': ('views', {int_or_none}),
'age_limit': ('livestream', 'is_mature', {bool}, {lambda x: 18 if x else 0}),
}),
}
class KickClipIE(KickBaseIE):
IE_NAME = 'kick:clips'
_VALID_URL = r'https?://(?:www\.)?kick\.com/[\w-]+(?:/clips/|/?\?(?:[^#]+&)?clip=)(?P<id>clip_[\w-]+)'
_TESTS = [{
'url': 'https://kick.com/mxddy?clip=clip_01GYXVB5Y8PWAPWCWMSBCFB05X',
'info_dict': {
'id': 'clip_01GYXVB5Y8PWAPWCWMSBCFB05X',
'ext': 'mp4',
'title': 'Maddy detains Abd D:',
'channel': 'mxddy',
'channel_id': '133789',
'uploader': 'AbdCreates',
'uploader_id': '3309077',
'thumbnail': r're:^https?://.*\.jpeg',
'duration': 35,
'timestamp': 1682481453,
'upload_date': '20230426',
'view_count': int,
'like_count': int,
'categories': ['VALORANT'],
'age_limit': 18,
},
'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://kick.com/destiny?clip=clip_01H9SKET879NE7N9RJRRDS98J3',
'info_dict': {
'id': 'clip_01H9SKET879NE7N9RJRRDS98J3',
'title': 'W jews',
'ext': 'mp4',
'channel': 'destiny',
'channel_id': '1772249',
'uploader': 'punished_furry',
'uploader_id': '2027722',
'duration': 49.0,
'upload_date': '20230908',
'timestamp': 1694150180,
'thumbnail': 'https://clips.kick.com/clips/j3/clip_01H9SKET879NE7N9RJRRDS98J3/thumbnail.png',
'view_count': int,
'like_count': int,
'categories': ['Just Chatting'],
'age_limit': 0,
},
'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://kick.com/spreen/clips/clip_01J8RGZRKHXHXXKJEHGRM932A5',
'info_dict': {
'id': 'clip_01J8RGZRKHXHXXKJEHGRM932A5',
'ext': 'mp4',
'title': 'KLJASLDJKLJKASDLJKDAS',
'channel': 'spreen',
'channel_id': '5312671',
'uploader': 'AnormalBarraBaja',
'uploader_id': '26518262',
'duration': 43.0,
'upload_date': '20240927',
'timestamp': 1727399987,
'thumbnail': 'https://clips.kick.com/clips/f2/clip_01J8RGZRKHXHXXKJEHGRM932A5/thumbnail.webp',
'view_count': int,
'like_count': int,
'categories': ['Minecraft'],
'age_limit': 0,
},
'params': {'skip_download': 'm3u8'},
}]
def _real_extract(self, url):
clip_id = self._match_id(url)
clip = self._call_api(f'v2/clips/{clip_id}/play', clip_id)['clip']
clip_url = clip['clip_url']
if determine_ext(clip_url) == 'm3u8':
formats = self._extract_m3u8_formats(clip_url, clip_id, 'mp4')
else:
formats = [{'url': clip_url}]
return {
'id': clip_id,
'formats': formats,
**traverse_obj(clip, {
'title': ('title', {str}),
'channel': ('channel', 'slug', {str}),
'channel_id': ('channel', 'id', {int}, {str_or_none}),
'uploader': ('creator', 'username', {str}),
'uploader_id': ('creator', 'id', {int}, {str_or_none}),
'thumbnail': ('thumbnail_url', {url_or_none}),
'duration': ('duration', {float_or_none}),
'categories': ('category', 'name', {str}, all),
'timestamp': ('created_at', {parse_iso8601}),
'view_count': ('views', {int_or_none}),
'like_count': ('likes', {int_or_none}),
'age_limit': ('is_mature', {bool}, {lambda x: 18 if x else 0}),
}),
}

126
yt_dlp/extractor/kika.py Normal file
View File

@@ -0,0 +1,126 @@
from .common import InfoExtractor
from ..utils import (
determine_ext,
int_or_none,
parse_duration,
parse_iso8601,
url_or_none,
)
from ..utils.traversal import traverse_obj
class KikaIE(InfoExtractor):
IE_DESC = 'KiKA.de'
_VALID_URL = r'https?://(?:www\.)?kika\.de/[\w/-]+/videos/(?P<id>[a-z-]+\d+)'
_GEO_COUNTRIES = ['DE']
_TESTS = [{
'url': 'https://www.kika.de/logo/videos/logo-vom-samstag-einunddreissig-august-zweitausendvierundzwanzig-100',
'md5': 'fbfc8da483719ef06f396e5e5b938c69',
'info_dict': {
'id': 'logo-vom-samstag-einunddreissig-august-zweitausendvierundzwanzig-100',
'ext': 'mp4',
'upload_date': '20240831',
'timestamp': 1725126600,
'season_number': 2024,
'modified_date': '20240831',
'episode': 'Episode 476',
'episode_number': 476,
'season': 'Season 2024',
'duration': 634,
'title': 'logo! vom Samstag, 31. August 2024',
'modified_timestamp': 1725129983,
},
}, {
'url': 'https://www.kika.de/kaltstart/videos/video92498',
'md5': '710ece827e5055094afeb474beacb7aa',
'info_dict': {
'id': 'video92498',
'ext': 'mp4',
'title': '7. Wo ist Leo?',
'description': 'md5:fb48396a5b75068bcac1df74f1524920',
'duration': 436,
'timestamp': 1702926876,
'upload_date': '20231218',
'episode_number': 7,
'modified_date': '20240319',
'modified_timestamp': 1710880610,
'episode': 'Episode 7',
'season_number': 1,
'season': 'Season 1',
},
}, {
'url': 'https://www.kika.de/bernd-das-brot/astrobrot/videos/video90088',
'md5': 'ffd1b700d7de0a6616a1d08544c77294',
'info_dict': {
'id': 'video90088',
'ext': 'mp4',
'upload_date': '20221102',
'timestamp': 1667390580,
'duration': 197,
'modified_timestamp': 1711093771,
'episode_number': 8,
'title': 'Es ist nicht leicht, ein Astrobrot zu sein',
'modified_date': '20240322',
'description': 'md5:d3641deaf1b5515a160788b2be4159a9',
'season_number': 1,
'episode': 'Episode 8',
'season': 'Season 1',
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
doc = self._download_json(f'https://www.kika.de/_next-api/proxy/v1/videos/{video_id}', video_id)
video_assets = self._download_json(doc['assets']['url'], video_id)
subtitles = {}
if ttml_resource := url_or_none(video_assets.get('videoSubtitle')):
subtitles['de'] = [{
'url': ttml_resource,
'ext': 'ttml',
}]
if webvtt_resource := url_or_none(video_assets.get('webvttUrl')):
subtitles.setdefault('de', []).append({
'url': webvtt_resource,
'ext': 'vtt',
})
return {
'id': video_id,
'formats': list(self._extract_formats(video_assets, video_id)),
'subtitles': subtitles,
**traverse_obj(doc, {
'title': ('title', {str}),
'description': ('description', {str}),
'timestamp': ('date', {parse_iso8601}),
'modified_timestamp': ('modificationDate', {parse_iso8601}),
'duration': ((
('durationInSeconds', {int_or_none}),
('duration', {parse_duration})), any),
'episode_number': ('episodeNumber', {int_or_none}),
'season_number': ('season', {int_or_none}),
}),
}
def _extract_formats(self, media_info, video_id):
for media in traverse_obj(media_info, ('assets', lambda _, v: url_or_none(v['url']))):
stream_url = media['url']
ext = determine_ext(stream_url)
if ext == 'm3u8':
yield from self._extract_m3u8_formats(
stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False)
else:
yield {
'url': stream_url,
'format_id': ext,
**traverse_obj(media, {
'width': ('frameWidth', {int_or_none}),
'height': ('frameHeight', {int_or_none}),
# NB: filesize is 0 if unknown, bitrate is -1 if unknown
'filesize': ('fileSize', {int_or_none}, filter),
'abr': ('bitrateAudio', {int_or_none}, {lambda x: None if x == -1 else x}),
'vbr': ('bitrateVideo', {int_or_none}, {lambda x: None if x == -1 else x}),
}),
}

View File

@@ -0,0 +1,114 @@
import json
from .common import InfoExtractor
from .vimeo import VimeoIE
from ..utils import (
clean_html,
extract_attributes,
get_element_html_by_id,
int_or_none,
parse_duration,
str_or_none,
unified_strdate,
url_or_none,
urljoin,
)
from ..utils.traversal import traverse_obj
class LaracastsBaseIE(InfoExtractor):
def _get_prop_data(self, url, display_id):
webpage = self._download_webpage(url, display_id)
return traverse_obj(
get_element_html_by_id('app', webpage),
({extract_attributes}, 'data-page', {json.loads}, 'props'))
def _parse_episode(self, episode):
if not traverse_obj(episode, 'vimeoId'):
self.raise_login_required('This video is only available for subscribers.')
return self.url_result(
VimeoIE._smuggle_referrer(
f'https://player.vimeo.com/video/{episode["vimeoId"]}', 'https://laracasts.com/'),
VimeoIE, url_transparent=True,
**traverse_obj(episode, {
'id': ('id', {int}, {str_or_none}),
'webpage_url': ('path', {urljoin('https://laracasts.com')}),
'title': ('title', {clean_html}),
'season_number': ('chapter', {int_or_none}),
'episode_number': ('position', {int_or_none}),
'description': ('body', {clean_html}),
'thumbnail': ('largeThumbnail', {url_or_none}),
'duration': ('length', {int_or_none}),
'date': ('dateSegments', 'published', {unified_strdate}),
}))
class LaracastsIE(LaracastsBaseIE):
IE_NAME = 'laracasts'
_VALID_URL = r'https?://(?:www\.)?laracasts\.com/series/(?P<id>[\w-]+/episodes/\d+)/?(?:[?#]|$)'
_TESTS = [{
'url': 'https://laracasts.com/series/30-days-to-learn-laravel-11/episodes/1',
'md5': 'c8f5e7b02ad0e438ef9280a08c8493dc',
'info_dict': {
'id': '922040563',
'title': 'Hello, Laravel',
'ext': 'mp4',
'duration': 519,
'date': '20240312',
'thumbnail': 'https://laracasts.s3.amazonaws.com/videos/thumbnails/youtube/30-days-to-learn-laravel-11-1.png',
'description': 'md5:ddd658bb241975871d236555657e1dd1',
'season_number': 1,
'season': 'Season 1',
'episode_number': 1,
'episode': 'Episode 1',
'uploader': 'Laracasts',
'uploader_id': 'user20182673',
'uploader_url': 'https://vimeo.com/user20182673',
},
'expected_warnings': ['Failed to parse XML'], # TODO: Remove when vimeo extractor is fixed
}]
def _real_extract(self, url):
display_id = self._match_id(url)
return self._parse_episode(self._get_prop_data(url, display_id)['lesson'])
class LaracastsPlaylistIE(LaracastsBaseIE):
IE_NAME = 'laracasts:series'
_VALID_URL = r'https?://(?:www\.)?laracasts\.com/series/(?P<id>[\w-]+)/?(?:[?#]|$)'
_TESTS = [{
'url': 'https://laracasts.com/series/30-days-to-learn-laravel-11',
'info_dict': {
'title': '30 Days to Learn Laravel',
'id': '210',
'thumbnail': 'https://laracasts.s3.amazonaws.com/series/thumbnails/social-cards/30-days-to-learn-laravel-11.png?v=2',
'duration': 30600.0,
'modified_date': '20240511',
'description': 'md5:27c260a1668a450984e8f901579912dd',
'categories': ['Frameworks'],
'tags': ['Laravel'],
'display_id': '30-days-to-learn-laravel-11',
},
'playlist_count': 30,
}]
def _real_extract(self, url):
display_id = self._match_id(url)
series = self._get_prop_data(url, display_id)['series']
metadata = {
'display_id': display_id,
**traverse_obj(series, {
'title': ('title', {str}),
'id': ('id', {int}, {str_or_none}),
'description': ('body', {clean_html}),
'thumbnail': (('large_thumbnail', 'thumbnail'), {url_or_none}, any),
'duration': ('runTime', {parse_duration}),
'categories': ('taxonomy', 'name', {str}, all, filter),
'tags': ('topics', ..., 'name', {str}),
'modified_date': ('lastUpdated', {unified_strdate}),
}),
}
return self.playlist_result(traverse_obj(
series, ('chapters', ..., 'episodes', lambda _, v: v['vimeoId'], {self._parse_episode})), **metadata)

View File

@@ -66,7 +66,7 @@ class LBRYBaseIE(InfoExtractor):
'license': ('value', 'license', {str}),
'timestamp': ('timestamp', {int_or_none}),
'release_timestamp': ('value', 'release_time', {int_or_none}),
'tags': ('value', 'tags', ..., {lambda x: x or None}),
'tags': ('value', 'tags', ..., filter),
'duration': ('value', stream_type, 'duration', {int_or_none}),
'channel': ('signing_channel', 'value', 'title', {str}),
'channel_id': ('signing_channel', 'claim_id', {str}),
@@ -136,6 +136,7 @@ class LBRYBaseIE(InfoExtractor):
class LBRYIE(LBRYBaseIE):
IE_NAME = 'lbry'
IE_DESC = 'odysee.com'
_VALID_URL = LBRYBaseIE._BASE_URL_REGEX + rf'''
(?:\$/(?:download|embed)/)?
(?P<id>
@@ -364,6 +365,7 @@ class LBRYIE(LBRYBaseIE):
class LBRYChannelIE(LBRYBaseIE):
IE_NAME = 'lbry:channel'
IE_DESC = 'odysee.com channels'
_VALID_URL = LBRYBaseIE._BASE_URL_REGEX + rf'(?P<id>@{LBRYBaseIE._OPT_CLAIM_ID})/?(?:[?&]|$)'
_TESTS = [{
'url': 'https://lbry.tv/@LBRYFoundation:0',
@@ -391,6 +393,7 @@ class LBRYChannelIE(LBRYBaseIE):
class LBRYPlaylistIE(LBRYBaseIE):
IE_NAME = 'lbry:playlist'
IE_DESC = 'odysee.com playlists'
_VALID_URL = LBRYBaseIE._BASE_URL_REGEX + r'\$/(?:play)?list/(?P<id>[0-9a-f-]+)'
_TESTS = [{
'url': 'https://odysee.com/$/playlist/ffef782f27486f0ac138bde8777f72ebdd0548c2',

View File

@@ -0,0 +1,72 @@
import functools
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
clean_html,
extract_attributes,
join_nonempty,
parse_duration,
unified_timestamp,
)
from ..utils.traversal import find_element, traverse_obj
class LearningOnScreenIE(InfoExtractor):
_VALID_URL = r'https?://learningonscreen\.ac\.uk/ondemand/index\.php/prog/(?P<id>\w+)'
_TESTS = [{
'url': 'https://learningonscreen.ac.uk/ondemand/index.php/prog/005D81B2?bcast=22757013',
'info_dict': {
'id': '005D81B2',
'ext': 'mp4',
'title': 'Planet Earth',
'duration': 3600.0,
'timestamp': 1164567600.0,
'upload_date': '20061126',
'thumbnail': 'https://stream.learningonscreen.ac.uk/trilt-cover-images/005D81B2-Planet-Earth-2006-11-26T190000Z-BBC4.jpg',
},
}]
def _real_initialize(self):
if not self._get_cookies('https://learningonscreen.ac.uk/').get('PHPSESSID-BOB-LIVE'):
self.raise_login_required(method='session_cookies')
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
details = traverse_obj(webpage, (
{find_element(id='programme-details', html=True)}, {
'title': ({find_element(tag='h2')}, {clean_html}),
'timestamp': (
{find_element(cls='broadcast-date')},
{functools.partial(re.match, r'([^<]+)')}, 1, {unified_timestamp}),
'duration': (
{find_element(cls='prog-running-time')}, {clean_html}, {parse_duration}),
}))
title = details.pop('title', None) or traverse_obj(webpage, (
{find_element(id='add-to-existing-playlist', html=True)},
{extract_attributes}, 'data-record-title', {clean_html}))
entries = self._parse_html5_media_entries(
'https://stream.learningonscreen.ac.uk', webpage, video_id, m3u8_id='hls', mpd_id='dash',
_headers={'Origin': 'https://learningonscreen.ac.uk', 'Referer': 'https://learningonscreen.ac.uk/'})
if not entries:
raise ExtractorError('No video found')
if len(entries) > 1:
duration = details.pop('duration', None)
for idx, entry in enumerate(entries, start=1):
entry.update(details)
entry['id'] = join_nonempty(video_id, idx)
entry['title'] = join_nonempty(title, idx)
return self.playlist_result(entries, video_id, title, duration=duration)
return {
**entries[0],
**details,
'id': video_id,
'title': title,
}

Some files were not shown because too many files have changed in this diff Show More